From f09b9b0ab706cfe25994a7ed35066e61e5af5d8f Mon Sep 17 00:00:00 2001 From: nikochiko Date: Mon, 18 Nov 2019 23:36:54 +0530 Subject: [PATCH 0001/1390] Revert "Update save.py" This reverts commit fdadd0e5e524df6488cd763c4ab7595d469ed1ef. --- tensorflow/python/keras/saving/save.py | 44 +++++--------------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 9f7f5778afe..4be3aa0bbda 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -23,7 +23,6 @@ import os import six from tensorflow.python import tf2 -from tensorflow.python.keras.engine.network import _is_hdf5_filepath from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save @@ -37,6 +36,9 @@ except ImportError: h5py = None # pylint: enable=g-import-not-at-top +_HDF5_EXTENSIONS = ['.h5', '.hdf5', '.keras'] + + # TODO(kathywu): Remove this when Keras SavedModel is not experimental. _KERAS_SAVED_MODEL_STILL_EXPERIMENTAL = True @@ -90,42 +92,12 @@ def save_model(model, """ from tensorflow.python.keras.engine import sequential # pylint: disable=g-import-not-at-top - if type(filepath) != str and not isinstance(filepath, h5py.File): - raise ValueError( - 'Expected `filepath` to be a String or `h5py.File` object. Got' - 'unsupported value %s of type %s' - % (filepath, type(filepath))) + default_format = 'tf' if tf2.enabled() else 'h5' + save_format = save_format or default_format - filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) - filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) - if save_format is None: - if (filepath_is_h5 or - (filepath_is_h5py_file)): - save_format = 'h5' - else: - save_format = 'tf' if tf2.enabled() else 'h5' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('`save` got save_format="tf"/"tensorflow", but the ' - 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' - 'when saving in TensorFlow format.') - % filepath) - if save_format == 'tf' and filepath_is_h5py_file: - raise ValueError( - '`save` got save_format="tf"/"tensorflow", but the given `filepath`' - 'is an `h5py.File` object.') - - if save_format == 'h5': + if (save_format == 'h5' or + (h5py is not None and isinstance(filepath, h5py.File)) or + os.path.splitext(filepath)[1] in _HDF5_EXTENSIONS): # TODO(b/130258301): add utility method for detecting model type. if (not model._is_graph_network and # pylint:disable=protected-access not isinstance(model, sequential.Sequential)): From ae86fd4b96ee934a8af239b5d01381bba8a470f1 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Wed, 20 Nov 2019 15:48:54 +0530 Subject: [PATCH 0002/1390] Update docstring for tf.ensure_shape Update docstring Enhance example. 
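A minimal sketch of the behaviour this docstring change documents, assuming TF 2.x eager execution and the public `tf.ensure_shape` API (the literal values below are illustrative, not taken from the patch):

```python
import tensorflow as tf

# tf.ensure_shape attaches a static shape AND enforces it when the op runs,
# unlike Tensor.set_shape, which is only a static (graph-time) annotation.
x = tf.zeros([3, 3])
y = tf.ensure_shape(x, (3, 3))          # passes; y keeps the values of x
print(y.shape)                          # (3, 3)

try:
    tf.ensure_shape(tf.zeros([2, 3]), (3, 3))   # incompatible at runtime
except tf.errors.InvalidArgumentError as e:
    print("shape check failed:", e.message)
```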
--- tensorflow/python/ops/check_ops.py | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 34106f61fd8..68546d792a8 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -144,7 +144,7 @@ def _unary_assert_doc(sym, sym_name): Raises: InvalidArgumentError: if the check can be performed immediately and - `x {sym}` is False. The check can be performed immediately during + `x {sym}` is False. The check can be performed immediately during eager execution or if `x` is statically known. """.format( sym=sym, sym_name=cap_sym_name, opname=opname) @@ -207,7 +207,7 @@ def _binary_assert_doc(sym): Raises: InvalidArgumentError: if the check can be performed immediately and - `x {sym} y` is False. The check can be performed immediately during + `x {sym} y` is False. The check can be performed immediately during eager execution or if `x` and `y` are statically known. """.format( sym=sym, opname=opname) @@ -2107,24 +2107,30 @@ def ensure_shape(x, shape, name=None): """Updates the shape of a tensor and checks at runtime that the shape holds. For example: - ```python - x = tf.compat.v1.placeholder(tf.int32) - print(x.shape) - ==> TensorShape(None) - y = x * 2 - print(y.shape) - ==> TensorShape(None) + + >>> # tf.placeholder() is not compatible with eager execution + ... + >>> tf.compat.v1.disable_eager_execution() + >>> x = tf.compat.v1.placeholder(tf.int32) + >>> print(x.shape) + TensorShape(None) + >>> y = x * 2 + >>> print(y.shape) + TensorShape(None) y = tf.ensure_shape(y, (None, 3, 3)) print(y.shape) - ==> TensorShape([Dimension(None), Dimension(3), Dimension(3)]) + TensorShape([None, 3, 3]) - with tf.compat.v1.Session() as sess: - # Raises tf.errors.InvalidArgumentError, because the shape (3,) is not - # compatible with the shape (None, 3, 3) - sess.run(y, feed_dict={x: [1, 2, 3]}) + >>> with tf.compat.v1.Session() as sess: + >>> sess.run(y, feed_dict={x: [1, 2, 3]}) + Traceback (most recent call last): + ... + InvalidArgumentError: Shape of tensor mul [3] is not compatible with + expected shape [?,3,3]. - ``` + The above example raises `tf.errors.InvalidArgumentError`, + because the shape (3,) is not compatible with the shape (None, 3, 3) NOTE: This differs from `Tensor.set_shape` in that it sets the static shape of the resulting tensor and enforces it at runtime, raising an error if the @@ -2140,8 +2146,10 @@ def ensure_shape(x, shape, name=None): name: A name for this operation (optional). Defaults to "EnsureShape". Returns: - A `Tensor`. Has the same type and contents as `x`. At runtime, raises a - `tf.errors.InvalidArgumentError` if `shape` is incompatible with the shape + A `Tensor`. Has the same type and contents as `x`. + + Raises: + tf.errors.InvalidArgumentError: If `shape` is incompatible with the shape of `x`. 
""" if not isinstance(shape, tensor_shape.TensorShape): From c6281f660b3f593054737de0bc9388113181aa4d Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 21 Nov 2019 00:06:38 +0530 Subject: [PATCH 0003/1390] Make requested changes --- tensorflow/python/ops/array_ops.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 6a18d08f22f..b31c28deb28 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -444,7 +444,7 @@ def broadcast_dynamic_shape(shape_x, shape_y): >>> shape_x = [1, 2, 3] >>> shape_y = [5, 1, 3] - >>> broadcast_dynamic_shape(shape_x, shape_y) + >>> tf.broadcast_dynamic_shape(shape_x, shape_y) Args: @@ -479,7 +479,7 @@ def broadcast_static_shape(shape_x, shape_y): >>> shape_x = tf.TensorShape([1, 2, 3]) >>> shape_y = tf.TensorShape([5, 1 ,3]) - >>> broadcast_static_shape(shape_x, shape_y) + >>> tf.broadcast_static_shape(shape_x, shape_y) TensorShape([Dimension(5), Dimension(2), Dimension(3)]) Args: @@ -1556,12 +1556,12 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): # 1-D example tensor = [0, 1, 2, 3] mask = np.array([True, False, True, False]) - boolean_mask(tensor, mask) # [0, 2] + tf.boolean_mask(tensor, mask) # [0, 2] # 2-D example tensor = [[1, 2], [3, 4], [5, 6]] mask = np.array([True, False, True]) - boolean_mask(tensor, mask) # [[1, 2], [5, 6]] + tf.boolean_mask(tensor, mask) # [[1, 2], [5, 6]] ``` Args: @@ -1640,12 +1640,12 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): >>> tensor = [0, 1, 2, 3] # 1-D example >>> mask = np.array([True, False, True, False]) - >>> boolean_mask(tensor, mask) + >>> tf.boolean_mask(tensor, mask) >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example >>> mask = np.array([True, False, True]) - >>> boolean_mask(tensor, mask) + >>> tf.boolean_mask(tensor, mask) Args: @@ -3175,7 +3175,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): ... [1, 1, 0]], ... ["a", "b", "c", "a"], ... (2, 2, 2)) - >>> edit_distance(hypothesis, truth, normalize=True) + >>> tf.edit_distance(hypothesis, truth, normalize=True) From d6cff22e271692d770ebc94a8aec6b83e2d533b9 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 21 Nov 2019 14:47:02 +0530 Subject: [PATCH 0004/1390] Make requested changes - Shapes from lists to tuples - TensorShapes to `TensorShape([...])` --- tensorflow/python/ops/array_ops.py | 21 +++++++++++---------- tensorflow/python/ops/check_ops.py | 10 ++++------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index b31c28deb28..a345d290f69 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -442,8 +442,8 @@ def broadcast_dynamic_shape(shape_x, shape_y): Example: - >>> shape_x = [1, 2, 3] - >>> shape_y = [5, 1, 3] + >>> shape_x = (1, 2, 3) + >>> shape_y = (5, 1, 3) >>> tf.broadcast_dynamic_shape(shape_x, shape_y) @@ -469,8 +469,9 @@ def broadcast_static_shape(shape_x, shape_y): `TensorShape` which is the shape of the result of a broadcasting op applied in tensors of shapes `shape_x` and `shape_y`. - For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a - TensorShape whose value is [5, 2, 3]. + For example, if shape_x is `TensorShape([1, 2, 3])` and shape_y is + `TensorShape([5, 1, 3])`, the result is a TensorShape whose value is + `TensorShape([5, 2, 3])`. 
This is useful when validating the result of a broadcasting operation when the tensors have statically known shapes. @@ -3160,8 +3161,8 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): For example: Given the following input, - * `hypothesis` is a `tf.SparseTensor` of shape `[2, 1, 1]` - * `truth` is a `tf.SparseTensor` of shape `[2, 2, 2]` + * `hypothesis` is a `tf.SparseTensor` of shape `(2, 1, 1)` + * `truth` is a `tf.SparseTensor` of shape `(2, 2, 2)` >>> hypothesis = tf.SparseTensor( ... [[0, 0, 0], @@ -3180,7 +3181,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): array([[inf, 1. ], [0.5, 1. ]], dtype=float32)> - The operaton returns a dense Tensor of shape `[2, 2]` with + The operaton returns a dense Tensor of shape `(2, 2)` with edit distances normalized by `truth` lengths. **Note**: It is possible to calculate edit distance between two @@ -3190,14 +3191,14 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): For the following inputs, ```python - # 'hypothesis' is a tensor of shape `[2, 1]` with variable-length values: + # 'hypothesis' is a tensor of shape `(2, 1)` with variable-length values: hypothesis = tf.SparseTensor( [[0, 0], [1,0]], ["a", "b"], (2, 1)) - # 'truth' is a tensor of shape `[2, 2]` with variable-length values: + # 'truth' is a tensor of shape `(2, 2)` with variable-length values: truth = tf.SparseTensor( [[0, 1], [1, 0], @@ -3207,7 +3208,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): normalize = True - # The output would be a dense Tensor of shape `[2,]`, with edit distances + # The output would be a dense Tensor of shape `(2,)`, with edit distances noramlized by 'truth' lengths. # output => array([0., 0.5], dtype=float32) ``` diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 68546d792a8..c76f3939f1e 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -2117,13 +2117,11 @@ def ensure_shape(x, shape, name=None): >>> y = x * 2 >>> print(y.shape) TensorShape(None) - - y = tf.ensure_shape(y, (None, 3, 3)) - print(y.shape) - TensorShape([None, 3, 3]) - + >>> y = tf.ensure_shape(y, (None, 3, 3)) + >>> print(y.shape) + TensorShape([Dimension(None), Dimension(3), Dimension(3)]) >>> with tf.compat.v1.Session() as sess: - >>> sess.run(y, feed_dict={x: [1, 2, 3]}) + >>> sess.run(y, feed_dict={x: [1, 2, 3]}) Traceback (most recent call last): ... 
InvalidArgumentError: Shape of tensor mul [3] is not compatible with From 9ccf3973b0af7232e83d18a8ea6c6fc27632f0fc Mon Sep 17 00:00:00 2001 From: nikochiko Date: Tue, 26 Nov 2019 21:47:35 +0530 Subject: [PATCH 0005/1390] Minor changes Made few more minor changes to documentation of `tf.extract_volume_patches` and `tf.fill` --- .../api_def_ExtractVolumePatches.pbtxt | 8 +++---- tensorflow/python/ops/array_ops.py | 21 ++++++++++--------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt index 9c4015eaa4c..32a3c33dc35 100644 --- a/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt @@ -34,13 +34,13 @@ END description: < Date: Sat, 14 Dec 2019 22:23:21 +0530 Subject: [PATCH 0006/1390] Update example to use tf.function --- tensorflow/python/ops/check_ops.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index e53a00bc232..ed28b1b093a 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -2121,25 +2121,16 @@ def ensure_shape(x, shape, name=None): For example: - >>> # tf.placeholder() is not compatible with eager execution - ... - >>> tf.compat.v1.disable_eager_execution() - >>> x = tf.compat.v1.placeholder(tf.int32) - >>> print(x.shape) - TensorShape(None) - >>> y = x * 2 - >>> print(y.shape) - TensorShape(None) - >>> y = tf.ensure_shape(y, (None, 3, 3)) - >>> print(y.shape) - TensorShape([Dimension(None), Dimension(3), Dimension(3)]) - >>> with tf.compat.v1.Session() as sess: - >>> sess.run(y, feed_dict={x: [1, 2, 3]}) + >>> @tf.function(input_signature=[tf.TensorSpec(dtype=tf.float32, shape=None)]) + >>> def f(tensor): + >>> return tf.ensure_shape(x, [3, 3]) + >>> + >>> f(tf.zeros([3, 3])) # Passes + >>> f([1, 2, 3]) # fails Traceback (most recent call last): - ... - InvalidArgumentError: Shape of tensor mul [3] is not compatible with - expected shape [?,3,3]. - + ... + InvalidArgumentError: Shape of tensor x [3] is not compatible with expected shape [3,3]. + The above example raises `tf.errors.InvalidArgumentError`, because the shape (3,) is not compatible with the shape (None, 3, 3) From e96d18ac0fa564b666db80bba1e044fb88d7251a Mon Sep 17 00:00:00 2001 From: Stephan Uphoff Date: Sun, 3 Nov 2019 17:11:03 -0700 Subject: [PATCH 0007/1390] Set g_pdm_dma_error_reporter pointer to the error_reporter passed to InitAudioRecording() --- .../micro/examples/micro_speech/apollo3evb/audio_provider.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/apollo3evb/audio_provider.cc b/tensorflow/lite/micro/examples/micro_speech/apollo3evb/audio_provider.cc index 0f9a91a9dba..cf10785071b 100644 --- a/tensorflow/lite/micro/examples/micro_speech/apollo3evb/audio_provider.cc +++ b/tensorflow/lite/micro/examples/micro_speech/apollo3evb/audio_provider.cc @@ -247,7 +247,6 @@ void pdm_start_dma(tflite::ErrorReporter* error_reporter) { // Reset the PDM DMA flags. 
g_pdm_dma_error = false; - g_pdm_dma_error_reporter = error_reporter; } #if USE_MAYA @@ -460,11 +459,12 @@ TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) { #endif // USE_TIME_STAMP // Configure, turn on PDM + g_pdm_dma_error_reporter = error_reporter; pdm_init(); am_hal_interrupt_master_enable(); am_hal_pdm_fifo_flush(g_pdm_handle); // Trigger the PDM DMA for the first time manually. - pdm_start_dma(g_pdm_dma_error_reporter); + pdm_start_dma(error_reporter); error_reporter->Report("\nPDM DMA Threshold = %d", PDMn(0)->FIFOTHR); From 6d6411b339ef2b14f38c57e38c05b2f24122364a Mon Sep 17 00:00:00 2001 From: nikochiko Date: Tue, 17 Dec 2019 16:50:45 +0530 Subject: [PATCH 0008/1390] Fix sanity build errors --- tensorflow/python/ops/array_ops.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 52c265608fe..65a55b5b2bd 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -490,10 +490,10 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__ def broadcast_dynamic_shape(shape_x, shape_y): """Computes the shape of a broadcast given symbolic shapes. - When `shape_x` and `shape_y` are Tensors representing shapes (i.e. the result of - calling tf.shape on another Tensor) this computes a Tensor which is the shape - of the result of a broadcasting op applied in tensors of shapes `shape_x` and - `shape_y`. + When `shape_x` and `shape_y` are Tensors representing shapes (i.e. the result + of calling tf.shape on another Tensor) this computes a Tensor which is the + shape of the result of a broadcasting op applied in tensors of shapes + `shape_x` and `shape_y`. This is useful when validating the result of a broadcasting operation when the tensors do not have statically known shapes. @@ -503,7 +503,8 @@ def broadcast_dynamic_shape(shape_x, shape_y): >>> shape_x = (1, 2, 3) >>> shape_y = (5, 1, 3) >>> tf.broadcast_dynamic_shape(shape_x, shape_y) - + Args: shape_x: A rank 1 integer `Tensor`, representing the shape of x. @@ -1732,7 +1733,8 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example >>> mask = np.array([True, False, True]) >>> tf.boolean_mask(tensor, mask) - + Args: tensor: N-D Tensor. From 2f3f3ee15bbc837a22b33c6177e064b5b92d5fec Mon Sep 17 00:00:00 2001 From: nikochiko Date: Tue, 17 Dec 2019 16:58:07 +0530 Subject: [PATCH 0009/1390] Fix pylint, wrap at 80 chars --- tensorflow/python/ops/array_ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 65a55b5b2bd..16dda049420 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -503,8 +503,8 @@ def broadcast_dynamic_shape(shape_x, shape_y): >>> shape_x = (1, 2, 3) >>> shape_y = (5, 1, 3) >>> tf.broadcast_dynamic_shape(shape_x, shape_y) - + Args: shape_x: A rank 1 integer `Tensor`, representing the shape of x. @@ -1733,8 +1733,8 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example >>> mask = np.array([True, False, True]) >>> tf.boolean_mask(tensor, mask) - + Args: tensor: N-D Tensor. 
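A short self-contained sketch (assuming TF 2.x) of the two broadcast-shape helpers whose doctests are reworked in the patches above, to make the expected doctest output easier to verify:

```python
import tensorflow as tf

# Static variant: operates on TensorShape objects known up front.
print(tf.broadcast_static_shape(tf.TensorShape([1, 2, 3]),
                                tf.TensorShape([5, 1, 3])))
# TensorShape([5, 2, 3])

# Dynamic variant: operates on rank-1 shape tensors, e.g. from tf.shape().
x = tf.zeros([1, 2, 3])
y = tf.zeros([5, 1, 3])
print(tf.broadcast_dynamic_shape(tf.shape(x), tf.shape(y)))
# tf.Tensor([5 2 3], shape=(3,), dtype=int32)
```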
From d25cff86db264b870439dbf2b6f37ae30096af0b Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Sat, 21 Dec 2019 20:25:12 +0530 Subject: [PATCH 0010/1390] Update array_ops.py --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 16dda049420..d980babf99d 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -503,7 +503,7 @@ def broadcast_dynamic_shape(shape_x, shape_y): >>> shape_x = (1, 2, 3) >>> shape_y = (5, 1, 3) >>> tf.broadcast_dynamic_shape(shape_x, shape_y) - Args: From 6746a6b8317313d952c8c8083be2476c75d7cc23 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Sat, 21 Dec 2019 20:26:41 +0530 Subject: [PATCH 0011/1390] Update array_ops.py --- tensorflow/python/ops/array_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index d980babf99d..57d953481a9 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1728,12 +1728,12 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): >>> tensor = [0, 1, 2, 3] # 1-D example >>> mask = np.array([True, False, True, False]) >>> tf.boolean_mask(tensor, mask) - + >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example >>> mask = np.array([True, False, True]) >>> tf.boolean_mask(tensor, mask) - Args: @@ -3459,7 +3459,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): ... ["a", "b", "c", "a"], ... (2, 2, 2)) >>> tf.edit_distance(hypothesis, truth, normalize=True) - From 6a89b01c85893ecba281447e4d54c77889cd4d83 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 4 Jan 2020 13:01:11 +0530 Subject: [PATCH 0012/1390] Fix docstrings --- tensorflow/python/ops/array_ops.py | 7 ++++--- tensorflow/python/ops/check_ops.py | 14 +++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 57d953481a9..a208272a7e2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -540,7 +540,7 @@ def broadcast_static_shape(shape_x, shape_y): >>> shape_x = tf.TensorShape([1, 2, 3]) >>> shape_y = tf.TensorShape([5, 1 ,3]) >>> tf.broadcast_static_shape(shape_x, shape_y) - TensorShape([Dimension(5), Dimension(2), Dimension(3)]) + TensorShape([5, 2, 3]) Args: shape_x: A `TensorShape` @@ -1733,8 +1733,9 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example >>> mask = np.array([True, False, True]) >>> tf.boolean_mask(tensor, mask) - + Args: tensor: N-D Tensor. diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index ed28b1b093a..2088dea7c12 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -1595,7 +1595,7 @@ def assert_shapes_v2(shapes, data=None, summarize=None, message=None, >>> n = 10 >>> q = 3 >>> d = 7 - >>> x = tf.zeros([n,q]) + >>> x = tf.zeros([n,q]) >>> y = tf.ones([n,d]) >>> param = tf.Variable([1.0, 2.0, 3.0]) >>> scalar = 1.0 @@ -1605,9 +1605,9 @@ def assert_shapes_v2(shapes, data=None, summarize=None, message=None, ... (param, ('Q',)), ... (scalar, ()), ... ]) - + >>> tf.debugging.assert_shapes([ - ... (x, ('N', 'D')), + ... (x, ('N', 'D')), ... 
(y, ('N', 'D')) ... ]) Traceback (most recent call last): @@ -2121,16 +2121,16 @@ def ensure_shape(x, shape, name=None): For example: - >>> @tf.function(input_signature=[tf.TensorSpec(dtype=tf.float32, shape=None)]) - >>> def f(tensor): - >>> return tf.ensure_shape(x, [3, 3]) + >>> @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)]) + ... def f(tensor): + ... return tf.ensure_shape(tensor, [3, 3]) >>> >>> f(tf.zeros([3, 3])) # Passes >>> f([1, 2, 3]) # fails Traceback (most recent call last): ... InvalidArgumentError: Shape of tensor x [3] is not compatible with expected shape [3,3]. - + The above example raises `tf.errors.InvalidArgumentError`, because the shape (3,) is not compatible with the shape (None, 3, 3) From aee9e604872ecb6bdd15cd6a1864b2d1108b671b Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Sun, 12 Jan 2020 10:41:37 +0530 Subject: [PATCH 0013/1390] Fix doctest for ensure_shape --- tensorflow/python/ops/check_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 2088dea7c12..d38b3958205 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -2126,6 +2126,10 @@ def ensure_shape(x, shape, name=None): ... return tf.ensure_shape(tensor, [3, 3]) >>> >>> f(tf.zeros([3, 3])) # Passes + >>> f([1, 2, 3]) # fails Traceback (most recent call last): ... From a7899d7544230fce8dae4895733d82623af2b934 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 21 Jan 2020 13:18:55 +0000 Subject: [PATCH 0014/1390] Added an option TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 to enable sym quantization with activations in 16-bit and weigths in 8-bit. --- tensorflow/lite/python/convert.py | 6 + tensorflow/lite/python/lite.py | 13 +- tensorflow/lite/python/lite_constants.py | 3 + tensorflow/lite/python/lite_test.py | 14 +- .../python/optimize/calibration_wrapper.cc | 8 +- .../python/optimize/calibration_wrapper.h | 3 +- tensorflow/lite/python/optimize/calibrator.py | 6 +- .../lite/python/optimize/calibrator_test.py | 39 ++- .../lite/tools/optimize/operator_property.cc | 17 +- .../lite/tools/optimize/operator_property.h | 10 +- .../lite/tools/optimize/quantization_utils.cc | 102 +++++-- .../lite/tools/optimize/quantization_utils.h | 10 +- .../tools/optimize/quantization_utils_test.cc | 4 +- .../tools/optimize/quantization_wrapper.cc | 4 +- .../lite/tools/optimize/quantize_model.cc | 175 +++++++----- .../lite/tools/optimize/quantize_model.h | 7 +- .../tools/optimize/quantize_model_test.cc | 258 ++++++++++++------ 17 files changed, 477 insertions(+), 202 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 2fe4d172487..494f32a515c 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -93,6 +93,12 @@ class OpsSet(enum.Enum): # quantized implementations. TFLITE_BUILTINS_INT8 = "TFLITE_BUILTINS_INT8" + # Convert model using only TensorFlow Lite operations with quantized int8 weights + # and int16 activations. + # Specifying this will throw an error for operations that do not yet have + # quantized implementations. 
+ TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + def __str__(self): return self.value diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 657cfea1bb8..fc9c064faf0 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -224,6 +224,10 @@ class TFLiteConverterBase(object): self.target_spec.supported_ops) or self._smallest_supported_type() == constants.INT8) + def _is_int16x8_target_required(self): + return (set([OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]) == + set(self.target_spec.supported_ops)) + def _smallest_supported_type(self): if self.target_spec.supported_types: return min(self.target_spec.supported_types, key=lambda x: x.size) @@ -238,7 +242,9 @@ class TFLiteConverterBase(object): ])) def _is_post_training_optimize(self): - return self._is_int8_target_required() or self._any_optimization_enabled() + return self._is_int8_target_required() or \ + self._is_int16x8_target_required() or \ + self._any_optimization_enabled() def _is_int8_weight_only_quantize(self): return (self._is_post_training_optimize() and @@ -255,11 +261,12 @@ class TFLiteConverterBase(object): def _calibrate_quantize_model(self, result, inference_input_type, inference_output_type, enable_mlir_quantizer): - allow_float = not self._is_int8_target_required() + allow_float = not self._is_int8_target_required() and not self._is_int16x8_target_required() calibrate_quantize = _calibrator.Calibrator(result) + activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8 return calibrate_quantize.calibrate_and_quantize( self.representative_dataset.input_gen, inference_input_type, - inference_output_type, allow_float, enable_mlir_quantizer) + inference_output_type, allow_float, activations_type, enable_mlir_quantizer) def _get_base_converter_args(self): """Returns the base converter args. 
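As a usage illustration for the new ops set wired through above — a rough sketch only: the saved-model path and input shape are placeholders, and the `from_saved_model` entry point is an assumption (the unit test in this patch exercises the same option via `from_session`):

```python
import tensorflow as tf

def representative_dataset():
    # Placeholder calibration data; shapes must match the real model inputs.
    for _ in range(100):
        yield [tf.random.uniform([1, 224, 224, 3])]

converter = tf.lite.TFLiteConverter.from_saved_model("/path/to/saved_model")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
# Request 16-bit activations with 8-bit weights (the mode this patch adds).
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
]
tflite_quantized_model = converter.convert()
```

Per the converter changes above, selecting this ops set forces the post-training calibration path (so a representative dataset is required) and disables float fallback, mirroring the existing `TFLITE_BUILTINS_INT8` behaviour.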
diff --git a/tensorflow/lite/python/lite_constants.py b/tensorflow/lite/python/lite_constants.py index d43452c775b..4902f23795e 100644 --- a/tensorflow/lite/python/lite_constants.py +++ b/tensorflow/lite/python/lite_constants.py @@ -30,6 +30,7 @@ INT64 = dtypes.int64 STRING = dtypes.string QUANTIZED_UINT8 = dtypes.uint8 INT8 = dtypes.int8 +INT16 = dtypes.int16 COMPLEX64 = dtypes.complex64 TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF TFLITE = _toco_flags_pb2.TFLITE @@ -43,6 +44,7 @@ _tf_export(v1=["lite.constants.STRING"]).export_constant(__name__, "STRING") _tf_export(v1=["lite.constants.QUANTIZED_UINT8"]).export_constant( __name__, "QUANTIZED_UINT8") _tf_export(v1=["lite.constants.INT8"]).export_constant(__name__, "INT8") +_tf_export(v1=["lite.constants.INT16"]).export_constant(__name__, "INT16") _tf_export(v1=["lite.constants.TFLITE"]).export_constant(__name__, "TFLITE") _tf_export(v1=["lite.constants.GRAPHVIZ_DOT"]).export_constant( __name__, "GRAPHVIZ_DOT") @@ -62,6 +64,7 @@ _allowed_symbols = [ "STRING", "QUANTIZED_UINT8", "INT8", + "INT16", "COMPLEX64", "TENSORFLOW_GRAPHDEF", "TFLITE", diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 16959c84146..ef5e5d1cdf4 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -769,9 +769,13 @@ class FromSessionTest(TestModels, parameterized.TestCase): self.assertLess(len(quantized_tflite), len(float_tflite)) @parameterized.named_parameters( - ('EnableMlirConverter', True), # enable mlir - ('DisableMlirConverter', False)) # disable mlir - def testCalibrateAndQuantizeBuiltinInt8(self, enable_mlir): + # Quantize model to Int8: with enable mlir + ('UseTfliteBuiltinsIntEnableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + # Quantize model to Int8: with disable mlir + ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + # Quantize model to Int16: with disable mlir + ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() sess = session.Session() @@ -787,9 +791,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): quantized_converter = lite.TFLiteConverter.from_session( sess, [inp], [output]) quantized_converter.experimental_new_converter = enable_mlir - quantized_converter.target_spec.supported_ops = [ - lite.OpsSet.TFLITE_BUILTINS_INT8 - ] + quantized_converter.target_spec.supported_ops = supported_ops quantized_converter.representative_dataset = calibration_gen quantized_tflite = quantized_converter.convert() self.assertTrue(quantized_tflite) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 89ffb3430ea..88995136726 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -204,6 +204,7 @@ PyObject* CalibrationWrapper::SetTensor(int index, PyObject* value) { PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, int output_py_type, bool allow_float, + int activations_py_type, bool enable_mlir_quantizer) { if (NoOpModel(*model_)) { return python_utils::ConvertToPyString(model_str_->data(), @@ -212,6 +213,9 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, TfLiteType input_type = 
python_utils::TfLiteTypeFromPyType(input_py_type); TfLiteType output_type = python_utils::TfLiteTypeFromPyType(output_py_type); + TfLiteType activations_type = + python_utils::TfLiteTypeFromPyType(activations_py_type); + if (input_type == kTfLiteNoType || output_type == kTfLiteNoType) { PyErr_SetString(PyExc_ValueError, "Input/output type cannot be kTfLiteNoType"); @@ -230,7 +234,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, - error_reporter_.get()); + TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); } if (status != kTfLiteOk) { @@ -262,7 +266,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - error_reporter_.get()); + TensorType_INT8, error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.h b/tensorflow/lite/python/optimize/calibration_wrapper.h index 0fefc29dd81..e72fe15e958 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.h +++ b/tensorflow/lite/python/optimize/calibration_wrapper.h @@ -60,7 +60,8 @@ class CalibrationWrapper { PyObject* FeedTensor(PyObject* input_value); PyObject* QuantizeModel(int input_py_type, int output_py_type, - bool allow_float, bool enable_mlir_quantizer = false); + bool allow_float, int activations_py_type, + bool enable_mlir_quantizer = false); // Allows quantizing only the operator that produces the tensor with name // operator_output_name. (This can be used to help debug.). diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index 6d9a29236f0..1f962917551 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -19,6 +19,7 @@ from __future__ import print_function import numpy as np from tensorflow.python.util.lazy_loader import LazyLoader +from tensorflow.lite.python import lite_constants # Lazy load since some of the performance benchmark skylark rules # break dependencies. Must use double quotes to match code internal rewrite @@ -55,7 +56,8 @@ class Calibrator(object): raise ValueError("Failed to parse the model.") def calibrate_and_quantize(self, dataset_gen, input_type, output_type, - allow_float, enable_mlir_quantizer=False): + allow_float, activations_type = lite_constants.INT8, + enable_mlir_quantizer=False): """Calibrates the model with specified generator and then quantizes it. Returns: @@ -69,6 +71,7 @@ class Calibrator(object): computation, useful when targeting an integer-only backend. If False, an error will be thrown if an operation cannot be quantized, otherwise the model will fallback to float ops. + activations_type: A tf.dtype representing the desired type for activations enable_mlir_quantizer: A boolean. True if wants to use mlir quantizer to quantize the calibrated model. 
""" @@ -78,6 +81,7 @@ class Calibrator(object): return self._calibrator.QuantizeModel( np.dtype(input_type.as_numpy_dtype()).num, np.dtype(output_type.as_numpy_dtype()).num, allow_float, + np.dtype(activations_type.as_numpy_dtype()).num, enable_mlir_quantizer) def calibrate_and_quantize_single(self, dataset_gen, input_type, output_type, diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index 28e8723f23d..7ec5f8f526c 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -33,9 +33,13 @@ from tensorflow.python.platform import test class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8EnabledMlir', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8DisabledMlir', constants.INT8, False), + # Activation type Int16 + ('UseActivationTypeInt16', constants.INT16, False)) + def test_calibration_with_quantization(self, activations_type, enable_mlir): model_path = resource_loader.get_path_to_datafile( 'test_data/mobilenet_like_model.bin') float_model = open(model_path, 'rb').read() @@ -49,13 +53,18 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, False, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization_allow_float(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8EnabledMlir', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8DisableMlir', constants.INT8, False), + # Activation type Int16 - disable mlir quantizer + ('UseActivationTypeInt16', constants.INT16, False)) + def test_calibration_with_quantization_allow_float(self, activations_type, enable_mlir): model_path = resource_loader.get_path_to_datafile( 'test_data/mobilenet_like_model.bin') float_model = open(model_path, 'rb').read() @@ -69,6 +78,7 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, True, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @@ -88,9 +98,13 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertIsNotNone(quantized_model) @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization_multiple_inputs(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8 - EnableMlirQuantizer', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8 - DisableMlirQuantizer', constants.INT8, False), + # Activation type Int16 - disable mlir quantizer + ('UseActivationTypeInt16 - DisableEnableMlirQuantizer', constants.INT16, False)) + def 
test_calibration_with_quantization_multiple_inputs(self, activations_type, enable_mlir): # Load multi add model from test data. # This model has 4 inputs of size (1, 8, 8, 3). model_path = resource_loader.get_path_to_datafile( @@ -106,6 +120,7 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, False, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @@ -148,7 +163,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaisesRegex(ValueError, 'Size mismatch'): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, enable_mlir) + constants.FLOAT, False, + enable_mlir) @parameterized.named_parameters( ('EnableMlirQuantizer', True), # enable mlir quantizer @@ -166,7 +182,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaises(ValueError): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, enable_mlir) + constants.FLOAT, False, + constants.INT8, enable_mlir) if __name__ == '__main__': diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 13f63092761..1f2d8bb4a4d 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -64,6 +64,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantize_input_as_activations = true; break; case BuiltinOperator_ARG_MAX: property.inputs = {{0, {}}}; @@ -176,7 +177,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // LogSoftmax requires output with 16/256 as scale and 127 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {16.0 / 256.0, 127}; + tensor_property.restricted_value_int8 = {16.0 / 256.0, 127}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -186,7 +187,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Logistic requires output with 1/256 as scale and -128 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 256.0, -128}; + tensor_property.restricted_value_int8 = {1 / 256.0, -128}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -741,7 +743,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // L2 Norm requires output with 1/128 as scale and 0 as zero point. 
TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 128.0, 0}; + tensor_property.restricted_value_int8 = {1 / 128.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -756,6 +758,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_MEAN: @@ -767,6 +770,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_MUL: @@ -778,6 +782,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.restrict_same_input_output_scale = true; property.version = 2; break; case BuiltinOperator_PAD: @@ -840,7 +845,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Softmax requires output with 1/256 as scale and -128 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 256.0, -128}; + tensor_property.restricted_value_int8 = {1 / 256.0, -128}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -866,7 +872,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Tanh requires output with 1/128 as scale and 0 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 128.0, 0}; + tensor_property.restricted_value_int8 = {1 / 128.0, 0}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h index 5d37aa304e5..23052308568 100644 --- a/tensorflow/lite/tools/optimize/operator_property.h +++ b/tensorflow/lite/tools/optimize/operator_property.h @@ -43,7 +43,8 @@ struct TensorProperty { // Constraints. bool restriction = false; // scale/zero_point hardcoded. - std::pair restricted_value = {0.0, 0}; + std::pair restricted_value_int8 = {0.0, 0}; + std::pair restricted_value_int16 = {0.0, 0}; // Use derived scale. bool use_derived_scale = false; @@ -93,6 +94,13 @@ struct OperatorProperty { // Op version. int version = 1; + + // When we quantize activations into 16 bit and weights into 8 bit, + // we want to quantize all inputs, including constant tensors, + // for the operators like Add, Mul into 16-bit as well. The constant + // inputs are quantized as weights and this variable indicates + // that we want to do quantizations of these tensors as activations. 
+ bool quantize_input_as_activations = false; }; OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index 10680758d72..4bc9686ec2c 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include "absl/memory/memory.h" -#include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -30,6 +29,7 @@ limitations under the License. #include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/model_utils.h" +#include "third_party/eigen3/Eigen/Core" namespace tflite { namespace optimize { @@ -85,6 +85,46 @@ void GetAsymmetricQuantizationParams( quantization_params->zero_point = std::vector(1, zero_point); } +void GetSymmetricQuantizationParams( + float min, float max, const int half_quant_range, + QuantizationParametersT* quantization_params) { + // Adjust the boundaries to guarantee 0 is included. + min = std::min(min, 0.0f); + max = std::max(max, 0.0f); + const float scale = std::max(std::abs(max), std::abs(min)) / half_quant_range; + int64_t zero_point = 0; + quantization_params->min = std::vector(1, min); + quantization_params->max = std::vector(1, max); + quantization_params->scale = std::vector(1, scale); + quantization_params->zero_point = std::vector(1, 0); +} + +TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, + QuantizationParametersT* quantization_params, + ErrorReporter* error_reporter) { + if (activations_type == TensorType_INT8) { + GetAsymmetricQuantizationParams( + tensor->quantization->min[0], tensor->quantization->max[0], + std::numeric_limits::min(), std::numeric_limits::max(), + quantization_params); + } else if (activations_type == TensorType_INT16) { + float range = std::max(std::abs(tensor->quantization->min[0]), + std::abs(tensor->quantization->max[0])); + const float quantized_range = 32767.0; + const float scale = range / quantized_range; + quantization_params->min = std::vector(1, -range); + quantization_params->max = std::vector(1, range); + quantization_params->scale = std::vector(1, scale); + quantization_params->zero_point = std::vector(1, 0); + } else { + error_reporter->Report( + "Unsupported activation type for quantize-activation: %s", + activations_type); + return kTfLiteError; + } + return kTfLiteOk; +} + // Set the max and min quantization parameter for a single tensor given its // values. 
void FillSingleMinMax(const float* const input, const uint64_t input_size, @@ -536,6 +576,7 @@ TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, float scaling_factor, ErrorReporter* error_reporter) { @@ -548,25 +589,38 @@ TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, uint64_t num_elements; TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements)); - std::vector final_buffer(num_elements); - const int32_t kScale = std::numeric_limits::max(); + std::vector final_buffer(num_elements); + const BiasType kScale = std::numeric_limits::max(); for (size_t i = 0; i < num_elements; i++) { - const int32_t quantized_value = tflite::SafeCast( + const BiasType quantized_value = tflite::SafeCast( TfLiteRound(float_data[i] * scaling_factor_inv)); final_buffer[i] = std::min(kScale, std::max(-kScale, quantized_value)); } // Set the buffers and output type. uint8_t* uint8_buffer = reinterpret_cast(final_buffer.data()); - size_t buffer_size = num_elements * sizeof(int32_t); + size_t buffer_size = num_elements * sizeof(BiasType); std::vector scales(1, scaling_factor); std::vector zero_points(1, 0); + + auto output_type = std::is_same::value + ? TensorType_INT32 + : TensorType_INT64; return AddQuantizationParams(scales, zero_points, 0, uint8_buffer, - buffer_size, TensorType_INT32, model, tensor, + buffer_size, output_type, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerLayerBiasQuantize( + ModelT* model, TensorT* tensor, float scaling_factor, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerLayerBiasQuantize( + ModelT* model, TensorT* tensor, float scaling_factor, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, float input_scale, const float* weight_scales, @@ -583,14 +637,14 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, uint64_t num_elements; TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements)); - std::vector final_buffer(num_elements); - const int32_t kScale = std::numeric_limits::max(); + std::vector final_buffer(num_elements); + const BiasType kScale = std::numeric_limits::max(); for (int32_t channel_idx = 0; channel_idx < number_of_dimension; channel_idx++) { float scaling_factor = scales[channel_idx]; float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor; - const int32_t quantized_value = tflite::SafeCast( + const BiasType quantized_value = tflite::SafeCast( TfLiteRound(float_data[channel_idx] * scaling_factor_inv)); final_buffer[channel_idx] = std::min(kScale, std::max(-kScale, quantized_value)); @@ -598,12 +652,26 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, // Set the buffers and output type. uint8_t* uint8_buffer = reinterpret_cast(final_buffer.data()); - size_t buffer_size = num_elements * sizeof(int32_t); + size_t buffer_size = num_elements * sizeof(BiasType); std::vector zero_point(scales.size(), 0); + + auto output_type = std::is_same::value + ? 
TensorType_INT32 + : TensorType_INT64; return AddQuantizationParams(scales, zero_point, 0, uint8_buffer, buffer_size, - TensorType_INT32, model, tensor, error_reporter); + output_type, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerChannelBiasQuantize( + ModelT* model, TensorT* tensor, float input_scale, + const float* weight_scales, int number_of_dimension, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerChannelBiasQuantize( + ModelT* model, TensorT* tensor, float input_scale, + const float* weight_scales, int number_of_dimension, + ErrorReporter* error_reporter); + TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel, int per_axis_index, ErrorReporter* error_reporter) { // TODO(suharshs): Currently we conflate quantizing weights and constants. Its @@ -645,12 +713,12 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx, return scale; } -void QuantizeActivation(TensorT* tensor) { - GetAsymmetricQuantizationParams( - tensor->quantization->min[0], tensor->quantization->max[0], - std::numeric_limits::min(), std::numeric_limits::max(), - tensor->quantization.get()); - tensor->type = TensorType_INT8; +TfLiteStatus QuantizeActivation(TensorT* tensor, TensorType activations_type, + ErrorReporter* error_reporter) { + TF_LITE_ENSURE_STATUS(GetQuantizationParams( + tensor, activations_type, tensor->quantization.get(), error_reporter)); + tensor->type = activations_type; + return kTfLiteOk; } TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale) { diff --git a/tensorflow/lite/tools/optimize/quantization_utils.h b/tensorflow/lite/tools/optimize/quantization_utils.h index 18ed707e175..752b4253250 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.h +++ b/tensorflow/lite/tools/optimize/quantization_utils.h @@ -113,12 +113,14 @@ TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor, ErrorReporter* error_reporter); // Symmetrically quantized the bias for per-layer ops (i.e. FullyConnected). +template TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, float scaling_factor, ErrorReporter* error_reporter); // Symmetrically quantizes the bias for ops like Conv and DepthwiseConv. // The scale of bias if weight_per_channel_scale[channel] * input_scale. +template TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, float input_scale, const float* weight_scales, @@ -135,8 +137,14 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx, std::vector intermediate_index, std::vector factors); +// Return quantization parameters depending on activations type. +TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, + QuantizationParametersT* quantization_params, + ErrorReporter* error_reporter); + // Quantize activation. -void QuantizeActivation(TensorT* tensor); +TfLiteStatus QuantizeActivation(TensorT* tensor, TensorType activations_type, + ErrorReporter* error_reporter); // Quantize activation to 16bit. 
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale); diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc index ece0123d166..49009e49600 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils_test.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc @@ -701,7 +701,7 @@ TEST_F(QuantizationUtilsTest, SymmetricPerLayerBiasQuantize) { model->buffers.push_back(std::move(buffer)); // Call and verify. - EXPECT_EQ(SymmetricPerLayerBiasQuantize( + EXPECT_EQ(SymmetricPerLayerBiasQuantize( model.get(), model->subgraphs[0]->tensors[0].get(), input_scale * weight_scale, &error_reporter_), kTfLiteOk); @@ -759,7 +759,7 @@ TEST_F(QuantizationUtilsTest, SymmetricPerChannelBiasQuantize) { model->buffers.push_back(std::move(buffer)); // Call and verify. - EXPECT_EQ(SymmetricPerChannelBiasQuantize( + EXPECT_EQ(SymmetricPerChannelBiasQuantize( model.get(), model->subgraphs[0]->tensors[0].get(), input_scale, weight_scales.data(), 2, &error_reporter_), kTfLiteOk); diff --git a/tensorflow/lite/tools/optimize/quantization_wrapper.cc b/tensorflow/lite/tools/optimize/quantization_wrapper.cc index bd3331da6bf..5002c382bc7 100644 --- a/tensorflow/lite/tools/optimize/quantization_wrapper.cc +++ b/tensorflow/lite/tools/optimize/quantization_wrapper.cc @@ -42,7 +42,9 @@ bool CreateQuantizedModel(const std::string& path) { tflite::StderrReporter error_reporter; if (tflite::optimize::QuantizeModel( &builder, &model, tflite::TensorType_FLOAT32, - tflite::TensorType_FLOAT32, &error_reporter) != kTfLiteOk) { + tflite::TensorType_FLOAT32, + // TODO: Pass required activation type if needed + tflite::TensorType_INT8, &error_reporter) != kTfLiteOk) { return false; } return WriteFile(path, builder.GetBufferPointer(), builder.GetSize()); diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 6fc19ff2a56..ee562fe9c4c 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -64,6 +64,7 @@ operator_property::OperatorProperty GetOperatorProperty( TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, const TensorT* weight_tensor, TensorT* bias_tensor, bool is_per_channel, int channel_dim_index, + const TensorType& activations_type, ErrorReporter* error_reporter) { if (bias_tensor->shape.size() != 1) { error_reporter->Report("Expected bias tensor shape to be 1."); @@ -92,9 +93,15 @@ TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, weight_scales.size()); return kTfLiteError; } - return utils::SymmetricPerChannelBiasQuantize( - model, bias_tensor, input_tensor->quantization->scale[0], - weight_scales.data(), channel_dim_size, error_reporter); + if (activations_type == tflite::TensorType_INT16) { + return utils::SymmetricPerChannelBiasQuantize( + model, bias_tensor, input_tensor->quantization->scale[0], + weight_scales.data(), channel_dim_size, error_reporter); + } else { + return utils::SymmetricPerChannelBiasQuantize( + model, bias_tensor, input_tensor->quantization->scale[0], + weight_scales.data(), channel_dim_size, error_reporter); + } } else { if (weight_scales.size() != 1) { error_reporter->Report( @@ -102,40 +109,54 @@ TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, weight_scales.size()); return kTfLiteError; } - return utils::SymmetricPerLayerBiasQuantize( - model, bias_tensor, - input_tensor->quantization->scale[0] * 
weight_scales[0], - error_reporter); + if (activations_type == tflite::TensorType_INT16) { + return utils::SymmetricPerLayerBiasQuantize( + model, bias_tensor, + input_tensor->quantization->scale[0] * weight_scales[0], + error_reporter); + } else { + return utils::SymmetricPerLayerBiasQuantize( + model, bias_tensor, + input_tensor->quantization->scale[0] * weight_scales[0], + error_reporter); + } } return kTfLiteError; } // True if the tensor type has to be modified. bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) { - // The quantized model is type INT8, so if the user provided type is INT8, we - // do not have to do any custom logic. Additionally, if the current tensor - // isn't INT8 quantized, the custom type doesn't apply. - return (type != TensorType_INT8 && tensor->type == TensorType_INT8 && - !tensor->quantization->scale.empty()); + // The quantized model is type INT8/INT16, so if the user provided type is + // INT8/INT16, we do not have to do any custom logic. Additionally, if the + // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply. + bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 && + !tensor->quantization->scale.empty(); + bool int16check = type != TensorType_INT16 && + tensor->type == TensorType_INT16 && + !tensor->quantization->scale.empty(); + return (int8check || int16check); } // Sets the input type, adding a Leading Op node at the start of the model if // necessary. // Returns the new input tensor index. int32_t SetInputType(ModelT* model, SubGraphT* subgraph, - const int32_t tensor_idx, const TensorType& input_type) { + const int32_t tensor_idx, const TensorType& input_type, + const TensorType& activations_type) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); if (!TensorTypeChangeRequired(tensor, input_type)) { return -1; } if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) { + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; // Create a new tensor to be the input of the leading Op. std::unique_ptr leading_op_input; if (input_type == TensorType_FLOAT32) { // Add tensor for quantize operator. Scales and zero points are not // needed. const string leading_op_name = tensor->name; - const string new_name_original_input = tensor->name + "_int8"; + const string new_name_original_input = tensor->name + "_" + type_string; tensor->name = new_name_original_input; utils::MakeTensor(leading_op_name, tensor->shape, input_type, &leading_op_input); @@ -150,7 +171,7 @@ int32_t SetInputType(ModelT* model, SubGraphT* subgraph, TFLITE_DCHECK_GE(zero_point, -128); TFLITE_DCHECK_LE(zero_point, 127); const string leading_op_name = tensor->name; - const string new_name_original_input = tensor->name + "_int8"; + const string new_name_original_input = tensor->name + "_" + type_string; tensor->name = new_name_original_input; utils::MakeTensorWithQuantParam(leading_op_name, tensor->shape, input_type, scale, zero_point + 128, @@ -177,17 +198,20 @@ int32_t SetInputType(ModelT* model, SubGraphT* subgraph, // necessary. // Returns the new output tensor index. 
int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, - const int32_t tensor_idx, const TensorType& output_type) { + const int32_t tensor_idx, const TensorType& output_type, + const TensorType& activations_type) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); if (!TensorTypeChangeRequired(tensor, output_type)) { return -1; } if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) { + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; // Create a new tensor to be the output of the tailing op. std::unique_ptr tailing_op_output; if (output_type == TensorType_FLOAT32) { const string tailing_op_name = tensor->name; - const string new_name_original_output = tensor->name + "_int8"; + const string new_name_original_output = tensor->name + "_" + type_string; tensor->name = new_name_original_output; utils::MakeTensor(tailing_op_name, tensor->shape, output_type, &tailing_op_output); @@ -202,7 +226,7 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, TFLITE_DCHECK_GE(zero_point, -128); TFLITE_DCHECK_LE(zero_point, 127); const string tailing_op_name = tensor->name; - const string new_name_original_output = tensor->name + "_int8"; + const string new_name_original_output = tensor->name + "_" + type_string; tensor->name = new_name_original_output; utils::MakeTensorWithQuantParam(tailing_op_name, tensor->shape, output_type, scale, zero_point + 128, @@ -238,6 +262,7 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, // uint8, can be thought as "requant"). TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter) { for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -253,8 +278,8 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, EnumNameTensorType(tensor->type)); return kTfLiteError; } - const int32_t input_idx = - SetInputType(model, subgraph, subgraph->inputs[i], input_type); + const int32_t input_idx = SetInputType( + model, subgraph, subgraph->inputs[i], input_type, activations_type); if (input_idx < 0) { continue; } @@ -270,8 +295,8 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, EnumNameTensorType(tensor->type)); return kTfLiteError; } - const int32_t output_idx = - SetOutputType(model, subgraph, subgraph->outputs[i], output_type); + const int32_t output_idx = SetOutputType( + model, subgraph, subgraph->outputs[i], output_type, activations_type); if (output_idx < 0) { continue; } @@ -287,6 +312,7 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, // The other ones with constraints are handled in QuantizeWeightsAndInput. 
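ApplyConstraints below rewrites ops such as concatenation whose inputs must all carry the output's quantization parameters: when an input disagrees, a requantize (QUANTIZE) op and an `*_requantized` tensor are inserted in front of it. The following is an illustrative sketch, not code from this patch, of the arithmetic such a requantize step performs; the function name and rounding choice are assumptions.

```cpp
#include <cmath>
#include <cstdint>

// Map a value quantized as (q_in, s_in, z_in) onto the target parameters
// (s_out, z_out) used by the concat output. With 16x8 quantization the
// requantized tensor is int16, whose zero point is always 0.
int32_t Requantize(int32_t q_in, float s_in, int32_t z_in, float s_out,
                   int32_t z_out) {
  const float real_value = s_in * static_cast<float>(q_in - z_in);  // dequantize
  return static_cast<int32_t>(std::lround(real_value / s_out)) + z_out;  // requantize
}
```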
TfLiteStatus ApplyConstraints(ModelT* model, const std::unordered_set& operator_names, + TensorType activations_type, ErrorReporter* error_reporter) { for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -332,7 +358,7 @@ TfLiteStatus ApplyConstraints(ModelT* model, std::unique_ptr additional_tensor; const string requant_tensor_name = input_tensor->name + "_requantized"; utils::MakeTensorWithQuantParam( - requant_tensor_name, input_tensor->shape, TensorType_INT8, + requant_tensor_name, input_tensor->shape, activations_type, output_scale, output_zp, &additional_tensor); const int32_t additional_tensor_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(additional_tensor)); @@ -382,7 +408,8 @@ std::vector> GetOutputs( bool ShouldRestrictSameInputOutputScale( operator_property::OperatorProperty property) { - // Ops with multiple inputs (i.e. concat) gets restricted in ApplyConstraints. + // Ops with multiple inputs (i.e. concat, max and min) gets restricted in + // ApplyConstraints. return (!property.arbitrary_inputs && property.restrict_same_input_output_scale); } @@ -401,7 +428,7 @@ TfLiteStatus QuantizeOpInput( ModelT* model, int32_t subgraph_idx, size_t* op_idx, operator_property::OperatorProperty property, const std::pair& input, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { int32_t input_idx = input.first; operator_property::TensorProperty tensor_property = input.second; SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -429,7 +456,9 @@ TfLiteStatus QuantizeOpInput( if (utils::HasBuffer(model, subgraph, tensor_idx)) { // TODO(suharshs): Look at consumers, throw error if one consumer is // per-channel and one per-layer. - if (tensor_property.number_of_bits == 8) { + bool quantize_const_input = property.quantize_input_as_activations && + activations_type == TensorType_INT16; + if (tensor_property.number_of_bits == 8 && !quantize_const_input) { if (tensor_property.use_derived_scale) { // Currently 8bit tensors in input do not accept derived scale. return kTfLiteError; @@ -444,7 +473,7 @@ TfLiteStatus QuantizeOpInput( *op_idx); return kTfLiteError; } - } else if (tensor_property.number_of_bits == 16) { + } else if (tensor_property.number_of_bits == 16 || quantize_const_input) { if (tensor_property.use_derived_scale) { // Currently 16bit tensors in input do not accept derived scale. return kTfLiteError; @@ -476,8 +505,8 @@ TfLiteStatus QuantizeOpInput( tensor_property.derived_scale.input_tensors, tensor_property.derived_scale.intermediate_tensors, tensor_property.derived_scale.factors); - return utils::SymmetricPerLayerBiasQuantize(model, tensor, scale, - error_reporter); + return utils::SymmetricPerLayerBiasQuantize( + model, tensor, scale, error_reporter); } else if (tensor_property.number_of_bits == 10) { // When the number of bits is 10 (instead of 16), quantize the tensor to @@ -514,7 +543,8 @@ TfLiteStatus QuantizeOpInput( // Currently 8bit tensors in input do not accept derived scale. 
return kTfLiteError; } - utils::QuantizeActivation(tensor); + TF_LITE_ENSURE_STATUS(utils::QuantizeActivation( + tensor, activations_type, error_reporter)); } else if (tensor_property.number_of_bits == 16) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); float quantized_range = 32767.0; @@ -532,13 +562,16 @@ TfLiteStatus QuantizeOpInput( } else { // If the tensor is not a model input, we need to add a Quantize // operation since the preceding op may require a float output. + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; std::unique_ptr op_output; - utils::MakeTensor(tensor->name + "_int8", tensor->shape, - TensorType_INT8, &op_output); + utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape, + activations_type, &op_output); op_output->quantization = absl::make_unique(); op_output->quantization->min.push_back(tensor->quantization->min[0]); op_output->quantization->max.push_back(tensor->quantization->max[0]); - utils::QuantizeActivation(op_output.get()); + TF_LITE_ENSURE_STATUS(utils::QuantizeActivation( + op_output.get(), activations_type, error_reporter)); const int32_t quant_op_output_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(op_output)); std::unique_ptr quant_op; @@ -580,7 +613,7 @@ TfLiteStatus QuantizeOpOutput( ModelT* model, int32_t subgraph_idx, int32_t op_idx, operator_property::OperatorProperty property, const std::pair& output, - ErrorReporter* error_reporter) { + TensorType activations_type, ErrorReporter* error_reporter) { int32_t output_idx = output.first; operator_property::TensorProperty tensor_property = output.second; // If the operator is not quantizable, we don't need to do anything for the @@ -644,18 +677,22 @@ TfLiteStatus QuantizeOpOutput( const float max = input_tensor->quantization->max[0]; output_tensor->quantization->max = {max}; } - output_tensor->type = TensorType_INT8; + output_tensor->type = activations_type; } else if (tensor_property.restriction) { - const auto scale_and_zp = tensor_property.restricted_value; + const auto scale_and_zp = activations_type == TensorType_INT16 + ? tensor_property.restricted_value_int16 + : tensor_property.restricted_value_int8; + // Apply to output. output_tensor->quantization = absl::make_unique(); output_tensor->quantization->scale.push_back(scale_and_zp.first); output_tensor->quantization->zero_point.push_back(scale_and_zp.second); - output_tensor->type = TensorType_INT8; + output_tensor->type = activations_type; } else { // Process regular output that doesn't have any restrictions. 
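// Illustrative note, not from the patch: the `restriction` branch just above
// covers ops whose output range is fixed a priori (e.g. softmax, logistic,
// tanh), so their (scale, zero_point) pair is pinned instead of being derived
// from recorded min/max. Separate int8 and int16 restricted values are needed
// because int16 activations are quantized symmetrically: the zero point is 0
// and the scale is chosen against the +/-32767 range rather than [-128, 127].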
if (utils::HasMinMax(output_tensor)) { - utils::QuantizeActivation(output_tensor); + utils::QuantizeActivation(output_tensor, activations_type, + error_reporter); } else { error_reporter->Report( "Unable to find min/max value for output %d in %s in " @@ -668,6 +705,7 @@ TfLiteStatus QuantizeOpOutput( } TfLiteStatus QuantizeIntemediateTensors(ModelT* model, + TensorType activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -691,7 +729,8 @@ TfLiteStatus QuantizeIntemediateTensors(ModelT* model, input.second.symmetric == false) { TensorT* tensor = subgraph->tensors[index_global].get(); if (utils::HasMinMax(tensor)) { - utils::QuantizeActivation(tensor); + utils::QuantizeActivation(tensor, activations_type, + error_reporter); } else { error_reporter->Report( "Unable to find min/max value for output %d in %s in " @@ -793,7 +832,7 @@ TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) { TfLiteStatus QuantizeWeightsInputOutput( ModelT* model, bool allow_float, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -815,14 +854,16 @@ TfLiteStatus QuantizeWeightsInputOutput( for (const std::pair& input : GetInputs(op, property)) { TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx, - property, input, error_reporter)); + property, input, activations_type, + error_reporter)); } // Quantize operator outputs. for (const std::pair& output : GetOutputs(op, property)) { - TF_LITE_ENSURE_STATUS(QuantizeOpOutput( - model, subgraph_idx, op_idx, property, output, error_reporter)); + TF_LITE_ENSURE_STATUS( + QuantizeOpOutput(model, subgraph_idx, op_idx, property, output, + activations_type, error_reporter)); } } } @@ -832,6 +873,7 @@ TfLiteStatus QuantizeWeightsInputOutput( // Quantize bias. TfLiteStatus QuantizeBiases(ModelT* model, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -877,10 +919,10 @@ TfLiteStatus QuantizeBiases(ModelT* model, subgraph->tensors[op->inputs[property.inputs[1].first]].get(); operator_property::TensorProperty weight_property = property.inputs[1].second; - TF_LITE_ENSURE_STATUS( - QuantizeBias(model, input_tensor, weight_tensor, bias_tensor, - weight_property.per_axis, - weight_property.per_axis_index, error_reporter)); + TF_LITE_ENSURE_STATUS(QuantizeBias( + model, input_tensor, weight_tensor, bias_tensor, + weight_property.per_axis, weight_property.per_axis_index, + activations_type, error_reporter)); } } } @@ -1000,7 +1042,7 @@ TfLiteStatus FillQuantizationParams( // Check compatibility of activation, weight and bias scales. Adjust if needed. TfLiteStatus EnsureBiasScaleCompatibility( ModelT* model, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + TensorType activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -1049,11 +1091,9 @@ TfLiteStatus EnsureBiasScaleCompatibility( // Get input scale for assymmetric quantization. 
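// Illustrative note, not from the patch: the replacement call below routes
// through utils::GetQuantizationParams, which dispatches on the activation
// type. For int8 it still produces asymmetric parameters; for int16 it
// produces symmetric parameters (zero point 0, range +/-32767), so the
// bias-compatibility check picks up the correct input scale in both modes.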
QuantizationParametersT temp_quant_params = QuantizationParametersT(); - utils::GetAsymmetricQuantizationParams( - input_tensor->quantization->min[0], - input_tensor->quantization->max[0], - std::numeric_limits::min(), - std::numeric_limits::max(), &temp_quant_params); + TF_LITE_ENSURE_STATUS( + utils::GetQuantizationParams(input_tensor, activations_type, + &temp_quant_params, error_reporter)); if (temp_quant_params.scale.size() != 1) { error_reporter->Report("Unexpected input quantization scale size."); return kTfLiteError; @@ -1132,21 +1172,24 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter) { TF_LITE_ENSURE_STATUS( FillQuantizationParams(model, operator_names, error_reporter)); + TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility( + model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( - EnsureBiasScaleCompatibility(model, operator_names, error_reporter)); - TF_LITE_ENSURE_STATUS(QuantizeIntemediateTensors(model, error_reporter)); + QuantizeIntemediateTensors(model, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter)); TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput( - model, allow_float, operator_names, error_reporter)); + model, allow_float, operator_names, activations_type, error_reporter)); + TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names, + activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( - ApplyConstraints(model, operator_names, error_reporter)); - TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, error_reporter)); + QuantizeBiases(model, operator_names, activations_type, error_reporter)); utils::SetOperatorCodeVersion(model); - TF_LITE_ENSURE_STATUS( - SetInputAndOutputTypes(model, input_type, output_type, error_reporter)); + TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes( + model, input_type, output_type, activations_type, error_reporter)); flatbuffers::Offset output_model_location = Model::Pack(*builder, model); @@ -1158,23 +1201,27 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, bool allow_float, + const TensorType& activations_type, ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, allow_float, - GetAllOperatorOutputs(model), error_reporter); + GetAllOperatorOutputs(model), activations_type, + error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, - /*allow_float=*/false, error_reporter); + /*allow_float=*/false, activations_type, error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, ErrorReporter* error_reporter) { + ModelT* model, const TensorType& activations_type, + ErrorReporter* error_reporter) { return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/false, error_reporter); + /*allow_float=*/false, activations_type, error_reporter); } } // namespace optimize diff --git 
a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index 9b0353f6b6b..cc801ec9870 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -35,7 +35,9 @@ namespace optimize { // // Note: This is a private API, subject to change. TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* input_model, ErrorReporter* error_reporter); + ModelT* input_model, + const TensorType& activations_type, + ErrorReporter* error_reporter); // Same as above, but the types of quantized inputs and outputs are // configurable. @@ -44,6 +46,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but can enable allowing float intermediate operations for ops @@ -53,6 +56,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, + const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but enables only quantizing a whitelist of operations, @@ -63,6 +67,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter); } // namespace optimize diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index da1b293c84b..166d60ecc66 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -80,28 +80,35 @@ class QuantizeModelTest : public testing::Test { internal::FailOnErrorReporter error_reporter_; }; -class QuantizeConvModelTest : public QuantizeModelTest { +class QuantizeConvModelTest : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConvModelTest() { + tensor_type_ = GetParam(); input_model_ = ReadModel(internal::kConvModelWith0Plus10Weights); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } + TensorType tensor_type_; }; -TEST_F(QuantizeConvModelTest, QuantizationSucceeds) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +INSTANTIATE_TEST_SUITE_P(QuantizeConvModelTestInst, QuantizeConvModelTest, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); + +TEST_P(QuantizeConvModelTest, QuantizationSucceeds) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const uint8_t* buffer = builder_.GetBufferPointer(); const Model* output_model = GetModel(buffer); ASSERT_TRUE(output_model); } -TEST_F(QuantizeConvModelTest, SkipUnspecifiedLayer) { +TEST_P(QuantizeConvModelTest, SkipUnspecifiedLayer) { auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/true, {}, &error_reporter_); + /*allow_float=*/true, {}, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), 
readonly_model_->subgraphs()->size()); // The resulting model should be the same. @@ -123,9 +130,9 @@ TEST_F(QuantizeConvModelTest, SkipUnspecifiedLayer) { } } -TEST_F(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size()); for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -148,9 +155,9 @@ TEST_F(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { EXPECT_EQ(model_.operator_codes[0]->version, 3); } -TEST_F(QuantizeConvModelTest, OperatorsAreUnchanged) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.operator_codes.size(), readonly_model_->operator_codes()->size()); @@ -182,20 +189,28 @@ TEST_F(QuantizeConvModelTest, OperatorsAreUnchanged) { } } -TEST_F(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { - EXPECT_TRUE(tensor->type == TensorType_INT32 || - tensor->type == TensorType_INT8); + if (tensor_type_ == TensorType_INT8) { + EXPECT_TRUE(tensor->type == TensorType_INT32 || + tensor->type == TensorType_INT8); + } else if (tensor_type_ == TensorType_INT16) { + EXPECT_TRUE(tensor->type == TensorType_INT64 || // bias + tensor->type == TensorType_INT8 || // weights + tensor->type == TensorType_INT16); // activations + } } } } -TEST_F(QuantizeConvModelTest, FloatInputAndOutput) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); +TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -234,22 +249,33 @@ TEST_F(QuantizeConvModelTest, FloatInputAndOutput) { EXPECT_EQ(subgraph->tensors[output_idx]->type, TensorType_FLOAT32); EXPECT_EQ(subgraph->tensors[output_idx]->name, "output"); // The original input and output has been renamed. - EXPECT_EQ(subgraph->tensors[quant_op->outputs[0]]->name, "input_int8"); - EXPECT_EQ(subgraph->tensors[dequant_op->inputs[0]]->name, "output_int8"); + std::string control_suffix = + (tensor_type_ == TensorType_INT16) ? 
"int16" : "int8"; + EXPECT_EQ(subgraph->tensors[quant_op->outputs[0]]->name, + "input_" + control_suffix); + EXPECT_EQ(subgraph->tensors[dequant_op->inputs[0]]->name, + "output_" + control_suffix); for (int tensor_idx = 0; tensor_idx < subgraph->tensors.size(); ++tensor_idx) { const auto& tensor = subgraph->tensors[tensor_idx]; if (input_idx != tensor_idx && output_idx != tensor_idx) { - EXPECT_TRUE(tensor->type == TensorType_INT32 || - tensor->type == TensorType_INT8); + if (tensor_type_ == TensorType_INT8) { + EXPECT_TRUE(tensor->type == TensorType_INT32 || + tensor->type == TensorType_INT8); + } else if (tensor_type_ == TensorType_INT16) { + EXPECT_TRUE(tensor->type == TensorType_INT64 || // bias + tensor->type == TensorType_INT8 || // weights + tensor->type == TensorType_INT16); // activations + } } } } } -TEST_F(QuantizeConvModelTest, Uint8InputAndOutput) { - auto status = QuantizeModel(&builder_, &model_, TensorType_UINT8, - TensorType_UINT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, Uint8InputAndOutput) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_UINT8, TensorType_UINT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -326,21 +352,25 @@ class QuantizeConvNoBiasModelTest : public QuantizeModelTest { }; TEST_F(QuantizeConvNoBiasModelTest, QuantizationSucceeds) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const uint8_t* buffer = builder_.GetBufferPointer(); const Model* output_model = GetModel(buffer); ASSERT_TRUE(output_model); } -class QuantizeConcatModelTest : public QuantizeModelTest { +class QuantizeConcatModelTest : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConcatModelTest() { input_model_ = ReadModel(internal::kFloatConcatMax5Max10Max10); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } + + TensorType tensor_type_; }; // There are two inputs for concat, "input0" and "input1". "input0" has [0, 5] @@ -352,9 +382,9 @@ class QuantizeConcatModelTest : public QuantizeModelTest { // input0 -> requant -> input0_requant \ // concat - output // input1 / -TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. @@ -373,32 +403,51 @@ TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { EXPECT_EQ(model_.operator_codes[concat->opcode_index]->builtin_code, BuiltinOperator_CONCATENATION); + auto zero_point_control = tensor_type_ == TensorType_INT8 ? -128 : 0; + /* + input0_scale_control + INT8: (5-0) / (2^8 - 1) + INT16: (5-0) / (2^16 / 2 - 1) + input1_scale + INT8: (10-0) / (2^8 - 1) + INT16: (10-0) / (2^16 / 2 - 1) + */ + auto input0_scale_control = + tensor_type_ == TensorType_INT8 ? 0.019607844 : 0.00015259254; + auto input1_scale = + tensor_type_ == TensorType_INT8 ? 0.039215688 : 0.00030518509; + // There should be 4 tensors: input0, input1, input0_requantized, output. 
EXPECT_EQ(subgraph->tensors.size(), 4); - EXPECT_EQ(subgraph->tensors[0]->type, TensorType_INT8); + EXPECT_EQ(subgraph->tensors[0]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[0]->name, "input0"); EXPECT_EQ(subgraph->tensors[0]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[0]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->scale[0], 0.019607844); - EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[1]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->scale[0], + input0_scale_control); + EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[1]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[1]->name, "input1"); EXPECT_EQ(subgraph->tensors[1]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[1]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[2]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[2]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[2]->name, "output"); EXPECT_EQ(subgraph->tensors[2]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[2]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[3]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[3]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[3]->name, "input0_requantized"); EXPECT_EQ(subgraph->tensors[3]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[3]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->zero_point[0], -128); + EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->zero_point[0], + zero_point_control); // The connection should be what is described in the comment. EXPECT_EQ(requant->inputs.size(), 1); @@ -419,7 +468,9 @@ TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_QUANTIZE); EXPECT_EQ(model_.operator_codes[1]->version, 2); } - +INSTANTIATE_TEST_SUITE_P(QuantizeConcatModelInst, QuantizeConcatModelTest, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); class QuantizeSplitModelTest : public QuantizeModelTest { protected: QuantizeSplitModelTest() { @@ -432,8 +483,9 @@ class QuantizeSplitModelTest : public QuantizeModelTest { // There are two outputs for split with different scales, the resulting model // should have the scales be hardcodes to the input scale value. 
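// Editorial note, not from the patch: unlike concat, SPLIT has a single real
// input, so its same-scale requirement is enforced in QuantizeOpOutput (via
// restrict_same_input_output_scale) rather than by ApplyConstraints; both
// outputs are expected to simply copy the input tensor's scale and zero point
// instead of receiving a requantize op.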
TEST_F(QuantizeSplitModelTest, QuantizeSplit) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. @@ -496,8 +548,9 @@ class QuantizeConvModel1Test : public QuantizeModelTest { }; TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -587,18 +640,25 @@ TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { EXPECT_EQ(model_.operator_codes[0]->version, 3); } -class QuantizeConvModel2Test : public QuantizeModelTest { +class QuantizeConvModel2Test : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConvModel2Test() { + tensor_type_ = GetParam(); input_model_ = ReadModel(internal::kConvModelWith0Plus10Weights); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } -}; -TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + TensorType tensor_type_; +}; +INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); + +TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -615,8 +675,10 @@ TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { const auto output_tensor = subgraph->tensors[conv_op->outputs[output_tensor_idx]].get(); - EXPECT_EQ(bias_tensor->type, TensorType_INT32); - EXPECT_EQ(input_tensor->type, TensorType_INT8); + EXPECT_EQ(bias_tensor->type, tensor_type_ == TensorType_INT8 + ? TensorType_INT32 + : TensorType_INT64); + EXPECT_EQ(input_tensor->type, tensor_type_); EXPECT_EQ(weights_tensor->type, TensorType_INT8); ASSERT_TRUE(weights_tensor->quantization); @@ -644,17 +706,28 @@ TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { } const auto bias_buffer = model_.buffers[bias_tensor->buffer].get(); - ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]); - const int32_t* bias_values = - reinterpret_cast(bias_buffer->data.data()); + auto control_size = tensor_type_ == TensorType_INT8 + ? 
sizeof(int32_t) * bias_tensor->shape[0] + : sizeof(int64_t) * bias_tensor->shape[0]; + + ASSERT_EQ(bias_buffer->data.size(), control_size); const auto original_bias_buffer = readonly_model_->buffers()->Get(bias_tensor->buffer); const float* bias_float_buffer = reinterpret_cast(original_bias_buffer->data()->data()); - for (size_t i = 0; i < out_channel_size; i++) { - auto dequantized_value = bias_values[i] * bias_scales[i]; - EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + if (tensor_type_ == TensorType_INT8) { + int32_t* bias_values = reinterpret_cast(bias_buffer->data.data()); + for (size_t i = 0; i < out_channel_size; i++) { + auto dequantized_value = bias_values[i] * bias_scales[i]; + EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + } + } else if (tensor_type_ == TensorType_INT16) { + int64_t* bias_values = reinterpret_cast(bias_buffer->data.data()); + for (size_t i = 0; i < out_channel_size; i++) { + auto dequantized_value = bias_values[i] * bias_scales[i]; + EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + } } const auto weights_buffer = model_.buffers[weights_tensor->buffer].get(); @@ -695,8 +768,9 @@ class QuantizeSoftmaxTest : public QuantizeModelTest { }; TEST_F(QuantizeSoftmaxTest, VerifySoftmaxQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -755,8 +829,9 @@ class QuantizeAvgPoolTest : public QuantizeModelTest { }; TEST_F(QuantizeAvgPoolTest, VerifyAvgPoolQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -816,8 +891,9 @@ class QuantizeMultiInputAddWithReshapeTest : public QuantizeModelTest { }; TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyReshapeQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify Reshape is quantized. @@ -863,8 +939,9 @@ TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyReshapeQuantization) { } TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyAddQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify ADD is quantized. @@ -923,8 +1000,9 @@ class QuantizeConstInputTest : public QuantizeModelTest { }; TEST_F(QuantizeConstInputTest, VerifyConstOpInput) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify ConstOp is quantized. 
@@ -965,8 +1043,9 @@ class QuantizeArgMaxTest : public QuantizeModelTest { }; TEST_F(QuantizeArgMaxTest, VerifyArgMax) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1008,8 +1087,9 @@ class QuantizeLSTMTest : public QuantizeModelTest { TEST_F(QuantizeLSTMTest, VerifyLSTM) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, - TensorType_FLOAT32, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1067,8 +1147,9 @@ class QuantizeLSTM2Test : public QuantizeModelTest { TEST_F(QuantizeLSTM2Test, VerifyLSTM) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, - TensorType_FLOAT32, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1126,8 +1207,9 @@ class QuantizeSVDFTest : public QuantizeModelTest { TEST_F(QuantizeSVDFTest, VerifySVDF) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1184,8 +1266,9 @@ class QuantizeFCTest : public QuantizeModelTest { }; TEST_F(QuantizeFCTest, VerifyFC) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1236,7 +1319,7 @@ class QuantizeCustomOpTest : public QuantizeModelTest { TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, - /*allow_float=*/true, &error_reporter_); + /*allow_float=*/true, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1270,7 +1353,8 @@ class QuantizePackTest : public QuantizeModelTest { }; TEST_F(QuantizePackTest, VerifyPack) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); @@ -1334,7 +1418,8 @@ class QuantizeMinimumMaximumTest }; TEST_P(QuantizeMinimumMaximumTest, VerifyMinimumMaximum) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1415,7 +1500,8 @@ class QuantizeUnpackTest : public QuantizeModelTest { } }; TEST_F(QuantizeUnpackTest, VerifyUnpack) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); From 
3130f64c4783d987ef32ae6853f61d1c60983d34 Mon Sep 17 00:00:00 2001 From: jmsmdy Date: Fri, 7 Feb 2020 21:11:21 -0500 Subject: [PATCH 0015/1390] Corrected docstring for tf.signal.frame --- tensorflow/python/ops/signal/shape_ops.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py index b0e8537c2be..bb4ac7ec33c 100644 --- a/tensorflow/python/ops/signal/shape_ops.py +++ b/tensorflow/python/ops/signal/shape_ops.py @@ -73,15 +73,25 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1, audio = tf.random.normal([3, 9152]) # Compute overlapping frames of length 512 with a step of 180 (frames overlap - # by 332 samples). By default, only 50 frames are generated since the last - # 152 samples do not form a full frame. + # by 332 samples). By default, only 49 frames are generated since a frame + # with start position j*180 for j > 48 would overhang the end. frames = tf.signal.frame(audio, 512, 180) - frames.shape.assert_is_compatible_with([3, 50, 512]) + frames.shape.assert_is_compatible_with([3, 49, 512]) - # When pad_end is enabled, the final frame is kept (padded with zeros). + # When pad_end is enabled, the final two frames are kept (padded with zeros). frames = tf.signal.frame(audio, 512, 180, pad_end=True) frames.shape.assert_is_compatible_with([3, 51, 512]) ``` + + If the dimension along `axis` is N, and `pad_end=False`, the number of frames + can be computed by: + ```python + frames = 1 + (N - frame_size) // frame_step + ``` + If `pad_end=True`, the number of frames can be computed by: + ```python + frames = -(-N // frame_step) # ceiling division + ``` Args: signal: A `[..., samples, ...]` `Tensor`. The rank and dimensions From de6afc5d6b509c8f3d709bf1e275373864ec0936 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 12 Feb 2020 13:51:24 +0000 Subject: [PATCH 0016/1390] Changed per reviewer comments. 
--- .../python/optimize/calibration_wrapper.cc | 3 +- .../lite/tools/optimize/quantization_utils.cc | 10 ++----- .../lite/tools/optimize/quantize_model.cc | 19 +++++++++---- .../lite/tools/optimize/quantize_model.h | 25 +++++++++++++---- .../tools/optimize/quantize_model_test.cc | 28 +++++++++---------- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 88995136726..cdc8adaaf2b 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -266,7 +266,8 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - TensorType_INT8, error_reporter_.get()); + TensorType_INT8, + error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index 4bc9686ec2c..ba43416cf04 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -108,14 +108,10 @@ TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, std::numeric_limits::min(), std::numeric_limits::max(), quantization_params); } else if (activations_type == TensorType_INT16) { - float range = std::max(std::abs(tensor->quantization->min[0]), - std::abs(tensor->quantization->max[0])); const float quantized_range = 32767.0; - const float scale = range / quantized_range; - quantization_params->min = std::vector(1, -range); - quantization_params->max = std::vector(1, range); - quantization_params->scale = std::vector(1, scale); - quantization_params->zero_point = std::vector(1, 0); + GetSymmetricQuantizationParams(tensor->quantization->min[0], + tensor->quantization->max[0], + quantized_range, quantization_params); } else { error_reporter->Report( "Unsupported activation type for quantize-activation: %s", diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index ee562fe9c4c..bbb40080fbc 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -1210,18 +1210,25 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, - const TensorType& output_type, - const TensorType& activations_type, + const TensorType& output_type, bool allow_float, ErrorReporter* error_reporter) { - return QuantizeModel(builder, model, input_type, output_type, - /*allow_float=*/false, activations_type, error_reporter); + return QuantizeModel(builder, model, input_type, output_type, allow_float, + GetAllOperatorOutputs(model), TensorType_INT8, + error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& activations_type, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, ErrorReporter* error_reporter) { + return QuantizeModel(builder, model, input_type, output_type, + /*allow_float=*/false, error_reporter); +} + +TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, ErrorReporter* 
error_reporter) { return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/false, activations_type, error_reporter); + /*allow_float=*/false, error_reporter); } } // namespace optimize diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index cc801ec9870..06c30b88fd0 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -35,9 +35,7 @@ namespace optimize { // // Note: This is a private API, subject to change. TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* input_model, - const TensorType& activations_type, - ErrorReporter* error_reporter); + ModelT* input_model, ErrorReporter* error_reporter); // Same as above, but the types of quantized inputs and outputs are // configurable. @@ -46,7 +44,6 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, - const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but can enable allowing float intermediate operations for ops @@ -56,7 +53,6 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, - const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but enables only quantizing a whitelist of operations, @@ -67,6 +63,25 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + ErrorReporter* error_reporter); + +// Same as above, but enables to provide activation type, which +// could be TensorType_INT16 or TensorType_INT8. +// +// Note: This is a private API, subject to change. +TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter); + +// Quantizes input_model and populates the provided builder with the new model +// with all possible input parameters. +// All functions above call this function underneath. 
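As a usage illustration (not code from this patch), a caller requesting full 16x8 quantization through the activation-type overload declared below might look like the following; `model` is assumed to be a `tflite::ModelT` unpacked from a calibrated flatbuffer elsewhere:

```cpp
flatbuffers::FlatBufferBuilder builder;
tflite::StderrReporter error_reporter;
// 16-bit activations, 8-bit weights, float32 model inputs/outputs.
if (tflite::optimize::QuantizeModel(
        &builder, &model, tflite::TensorType_FLOAT32,
        tflite::TensorType_FLOAT32, /*allow_float=*/false,
        tflite::TensorType_INT16, &error_reporter) != kTfLiteOk) {
  // Handle quantization failure.
}
```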
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, bool allow_float, + const std::unordered_set& operator_names, const TensorType& activations_type, ErrorReporter* error_reporter); diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 166d60ecc66..ef46b3fbd5d 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -106,9 +106,9 @@ TEST_P(QuantizeConvModelTest, QuantizationSucceeds) { } TEST_P(QuantizeConvModelTest, SkipUnspecifiedLayer) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/true, {}, tensor_type_, &error_reporter_); + auto status = QuantizeModel( + &builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + /*allow_float=*/true, {}, TensorType_FLOAT32, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size()); // The resulting model should be the same. @@ -190,8 +190,9 @@ TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { } TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { @@ -210,7 +211,7 @@ TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - tensor_type_, &error_reporter_); + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -384,7 +385,7 @@ class QuantizeConcatModelTest : public QuantizeModelTest, // input1 / TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. 
@@ -549,7 +550,7 @@ class QuantizeConvModel1Test : public QuantizeModelTest { TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, false, TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -658,7 +659,7 @@ INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + false, tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -1353,8 +1354,7 @@ class QuantizePackTest : public QuantizeModelTest { }; TEST_F(QuantizePackTest, VerifyPack) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); @@ -1418,8 +1418,7 @@ class QuantizeMinimumMaximumTest }; TEST_P(QuantizeMinimumMaximumTest, VerifyMinimumMaximum) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1500,8 +1499,7 @@ class QuantizeUnpackTest : public QuantizeModelTest { } }; TEST_F(QuantizeUnpackTest, VerifyUnpack) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); From 792f553fd078a425d66c81567ca8f3588d44fcdc Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 5 Feb 2020 11:55:27 +0000 Subject: [PATCH 0017/1390] Added non-strict mode for 16x8 quantization --- tensorflow/lite/python/lite.py | 16 ++++-- tensorflow/lite/tools/optimize/BUILD | 1 + .../lite/tools/optimize/operator_property.cc | 16 ++++++ .../lite/tools/optimize/operator_property.h | 3 +- .../lite/tools/optimize/quantize_model.cc | 50 +++++++++++------- .../tools/optimize/quantize_model_test.cc | 50 +++++++++++++++--- tensorflow/lite/tools/optimize/test_util.cc | 1 + tensorflow/lite/tools/optimize/test_util.h | 5 ++ .../tools/optimize/testdata/mixed16x8.bin | Bin 0 -> 1184 bytes 9 files changed, 111 insertions(+), 31 deletions(-) create mode 100644 tensorflow/lite/tools/optimize/testdata/mixed16x8.bin diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index fc9c064faf0..1e0c89d3aa5 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -220,13 +220,16 @@ class TFLiteConverterBase(object): "type to be INT8.") def _is_int8_target_required(self): - return (set([OpsSet.TFLITE_BUILTINS_INT8]) == set( + return ((set([OpsSet.TFLITE_BUILTINS_INT8]) == set( self.target_spec.supported_ops) or - self._smallest_supported_type() == constants.INT8) + self._smallest_supported_type() == constants.INT8) and + not self._is_int16x8_target_required()) def _is_int16x8_target_required(self): - return (set([OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]) == - set(self.target_spec.supported_ops)) + return bool( + set(self.target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) def _smallest_supported_type(self): if 
self.target_spec.supported_types: @@ -262,6 +265,11 @@ class TFLiteConverterBase(object): def _calibrate_quantize_model(self, result, inference_input_type, inference_output_type, enable_mlir_quantizer): allow_float = not self._is_int8_target_required() and not self._is_int16x8_target_required() + if (self._is_int16x8_target_required()): + allow_float = bool( + set(self.target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) calibrate_quantize = _calibrator.Calibrator(result) activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8 return calibrate_quantize.calibrate_and_quantize( diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD index 27be0f829ba..ee5e845b96b 100644 --- a/tensorflow/lite/tools/optimize/BUILD +++ b/tensorflow/lite/tools/optimize/BUILD @@ -245,6 +245,7 @@ tf_cc_test( "//tensorflow/lite/tools/optimize:testdata/maximum.bin", "//tensorflow/lite/tools/optimize:testdata/minimum.bin", "//tensorflow/lite/tools/optimize:testdata/mixed.bin", + "//tensorflow/lite/tools/optimize:testdata/mixed16x8.bin", "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin", "//tensorflow/lite/tools/optimize:testdata/pack.bin", "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin", diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 1f2d8bb4a4d..c31ad9dbb1e 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -70,6 +70,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; // ArgMax has no quantizable output. property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_AVERAGE_POOL_2D: property.inputs = {{0, {}}}; @@ -85,6 +86,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_SPLIT: // We skip input 0 since it is the split dim which is not real valued. @@ -143,6 +145,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}, {1, {}}}; // Comparisons have no quantizable outputs. 
property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_EXPAND_DIMS: // We skip input 1 as it is not real valued (it's the index of axis) and @@ -165,11 +168,13 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_HARD_SWISH: { property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 1; + property.quantizable_int16 = false; break; } case BuiltinOperator_LOG_SOFTMAX: { @@ -180,6 +185,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, tensor_property.restricted_value_int8 = {16.0 / 256.0, 127}; property.outputs = {{0, tensor_property}}; property.version = 2; + property.quantizable_int16 = false; break; } case BuiltinOperator_LOGISTIC: { @@ -736,6 +742,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.restrict_scale = {{18, 0}}; property.version = 2; } + property.quantizable_int16 = false; break; } case BuiltinOperator_L2_NORMALIZATION: { @@ -746,6 +753,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, tensor_property.restricted_value_int8 = {1 / 128.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; + property.quantizable_int16 = false; break; } case BuiltinOperator_MAX_POOL_2D: @@ -765,6 +773,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_MINIMUM: property.arbitrary_inputs = true; @@ -791,6 +800,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_QUANTIZE: property.inputs = {{0, {}}}; @@ -802,11 +812,13 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_RELU_N1_TO_1: property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 1; + property.quantizable_int16 = false; break; case BuiltinOperator_RESHAPE: property.inputs = {{0, {}}}; @@ -820,6 +832,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_SHAPE: property.inputs = {{0, {}}}; @@ -866,6 +879,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_TANH: { property.inputs = {{0, {}}}; @@ -899,6 +913,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, {3, tensor_property_bias}}; property.outputs = {{0, {}}}; property.version = 3; + property.quantizable_int16 = false; break; } case BuiltinOperator_TRANSPOSE: @@ -916,6 +931,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, default: // No quantized implementation exists for this operation. 
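// Illustrative note, not from the patch: quantizable_int16 = false (set on
// many ops in this file) marks operators that have an int8 kernel but no 16x8
// one yet. When activations are int16, GetOperatorProperty() in
// quantize_model.cc then clears `quantizable`, so such ops either stay float
// (when the non-strict 16x8 mode allows float fallback) or make
// QuantizeWeightsInputOutput fail with "Quantization to 16x8-bit not yet
// supported".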
property.quantizable = false; + property.quantizable_int16 = false; } return property; } diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h index 23052308568..151e314f335 100644 --- a/tensorflow/lite/tools/optimize/operator_property.h +++ b/tensorflow/lite/tools/optimize/operator_property.h @@ -65,7 +65,8 @@ struct TensorProperty { struct OperatorProperty { // Is a quantized operations currently supported. bool quantizable = true; - + // Is a quantized operations currently supported for 16x8 + bool quantizable_int16 = true; // Op has arbitrary number of inputs, such as concat. bool arbitrary_inputs = false; // Op has arbitrary number of outputs, such as slice. diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index bbb40080fbc..ceae3c29d9e 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -43,13 +43,17 @@ namespace { // operator_names. operator_property::OperatorProperty GetOperatorProperty( const std::unordered_set& operator_names, const ModelT* model, - int subgraph_index, int op_idx, const string& operator_name) { + int subgraph_index, int op_idx, const string& operator_name, + const TensorType& activations_type) { operator_property::OperatorProperty property = operator_property::GetOperatorProperty(model, subgraph_index, op_idx); const OperatorT* op = model->subgraphs[subgraph_index]->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; + if (activations_type == TensorType_INT16 && !property.quantizable_int16) { + property.quantizable = false; + } // The algorithm adds Dequantize and Quantize, so we don't require them to be // in the operator_names. if (op_code != BuiltinOperator_DEQUANTIZE && @@ -320,9 +324,9 @@ TfLiteStatus ApplyConstraints(ModelT* model, // Iterate backward to avoid messing with index. 
for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); if (!property.quantizable) { continue; } @@ -840,11 +844,17 @@ TfLiteStatus QuantizeWeightsInputOutput( OperatorT* op = subgraph->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); - if (!property.quantizable && !allow_float) { + if (activations_type == TensorType_INT16 && !property.quantizable && + !allow_float) { + error_reporter->Report( + "Quantization to 16x8-bit not yet supported for op: %s", + EnumNameBuiltinOperator(op_code)); + return kTfLiteError; + } else if (!property.quantizable && !allow_float) { error_reporter->Report("Quantization not yet supported for op: %s", EnumNameBuiltinOperator(op_code)); return kTfLiteError; @@ -882,9 +892,9 @@ TfLiteStatus QuantizeBiases(ModelT* model, OperatorT* op = subgraph->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); if (!property.quantizable) { continue; } @@ -951,15 +961,15 @@ std::unordered_set GetAllOperatorOutputs(ModelT* model) { // will not be filled by this function. TfLiteStatus FillQuantizationParams( ModelT* model, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); // Populate max, min for each input tensor. 
for (const std::pair& input : @@ -1048,9 +1058,9 @@ TfLiteStatus EnsureBiasScaleCompatibility( SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); // Loop over all bias tensors. for (const int bias_idx : property.biases) { @@ -1174,8 +1184,8 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, const std::unordered_set& operator_names, const TensorType& activations_type, ErrorReporter* error_reporter) { - TF_LITE_ENSURE_STATUS( - FillQuantizationParams(model, operator_names, error_reporter)); + TF_LITE_ENSURE_STATUS(FillQuantizationParams( + model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility( model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index ef46b3fbd5d..b73cb9a79ca 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -1308,7 +1308,8 @@ TEST_F(QuantizeFCTest, VerifyFC) { EXPECT_EQ(model_.operator_codes[1]->version, 1); } -class QuantizeCustomOpTest : public QuantizeModelTest { +class QuantizeCustomOpTest : public QuantizeModelTest, + public ::testing::WithParamInterface { protected: QuantizeCustomOpTest() { input_model_ = ReadModel(internal::kModelMixed); @@ -1317,10 +1318,10 @@ class QuantizeCustomOpTest : public QuantizeModelTest { } }; -TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { +TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, - /*allow_float=*/true, TensorType_INT8, &error_reporter_); + QuantizeModel(&builder_, &model_, GetParam(), GetParam(), + /*allow_float=*/true, GetParam(), &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1334,8 +1335,45 @@ TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { BuiltinOperator_CUSTOM, BuiltinOperator_CUSTOM, BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE}; const std::vector op_input_types = { - TensorType_INT8, TensorType_INT8, TensorType_FLOAT32, - TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8}; + GetParam(), GetParam(), TensorType_FLOAT32, + TensorType_FLOAT32, TensorType_FLOAT32, GetParam()}; + for (int i = 0; i < subgraph->operators.size(); ++i) { + OperatorT* op = subgraph->operators[i].get(); + ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, + op_codes[i]); + ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]); + } +} + +INSTANTIATE_TEST_SUITE_P(QuantizeCustomOpTest, QuantizeCustomOpTest, + ::testing::Values(TensorType_INT8, TensorType_INT16)); + +class QuantizeOp16x8Test : public QuantizeModelTest { + protected: + QuantizeOp16x8Test() { + input_model_ = ReadModel(internal::kModelMixed16x8); + readonly_model_ = input_model_->GetModel(); + readonly_model_->UnPackTo(&model_); + } +}; + +TEST_F(QuantizeOp16x8Test, 
VerifyMixedQuantization16x8) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT16, TensorType_FLOAT32, + /*allow_float=*/true, TensorType_INT16, &error_reporter_); + ASSERT_EQ(kTfLiteOk, status); + const auto& subgraph = model_.subgraphs[0]; + auto float_graph = readonly_model_->subgraphs()->Get(0); + // The original model conv_2d->log_softmax + ASSERT_EQ(float_graph->operators()->size(), 2); + // The resulting model should be: + // conv_2d->dequantize->log_softmax + ASSERT_EQ(subgraph->operators.size(), 3); + const std::vector op_codes = { + BuiltinOperator_CONV_2D, BuiltinOperator_DEQUANTIZE, + BuiltinOperator_LOG_SOFTMAX}; + const std::vector op_input_types = { + TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc index 7d5e9d65f06..379be64059f 100644 --- a/tensorflow/lite/tools/optimize/test_util.cc +++ b/tensorflow/lite/tools/optimize/test_util.cc @@ -48,6 +48,7 @@ const char* kModelWithArgMaxOp = "argmax.bin"; const char* kModelWithFCOp = "fc.bin"; const char* kModelMixed = "mixed.bin"; +const char* kModelMixed16x8 = "mixed16x8.bin"; const char* kModelSplit = "split.bin"; diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h index abcdbc21d36..a49f3500288 100644 --- a/tensorflow/lite/tools/optimize/test_util.h +++ b/tensorflow/lite/tools/optimize/test_util.h @@ -76,6 +76,11 @@ extern const char* kModelWithFCOp; // reshape->custom->custom->squeeze. extern const char* kModelMixed; +// Test model with mixed quantizable and +// and un-quantizable ops for +// activations in 16-bit. +extern const char* kModelMixed16x8; + // Test model with split op. extern const char* kModelSplit; diff --git a/tensorflow/lite/tools/optimize/testdata/mixed16x8.bin b/tensorflow/lite/tools/optimize/testdata/mixed16x8.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1f615e966eb164341d8ba4d56e1e359ef516388 GIT binary patch literal 1184 zcmb1PU|hJk@Whk=1Xg@J(qWH-pHr2qf_gJ_T$hI)o3dIk&(3IG59Px=4cJI#2x6#}#XYV(I z+0J^akG-Ncll|PMa(3Mu8TRL0SnTgR%h)@mnb}(lm)YNIw6YgxVzg)bKG|+RlZpKt zJw5xz<5l)73~XS3f_%oo0OCRX4RR*~*zYW0d45p%_&fV6IOpe;r52T>rs#pgMu35V zfrmi^9-=%93=C%&7#I#PFfeQZ>t|qKVqjp10EaXKg9uzL$aD~f!~!UsCBSAu><7t! z#9%atEy4g&0g5?x29UiF(?BX17%iaT4N?JfI|Bn3g8&0NIIg+C$`}|y<{)EGyo1z( z+z4WW zD#KjsMVOIGd{Sm&G1PpJ-Aw=g{|BWtklir* zzTmOXnne5bgHjVy!V-%z6O(dM{R`lJ2H6M0ZU6uOPhenR0EGd}4Hf@!hao6Gk>Y1i zJi_7vW+%w+3=DkWRLB5M83GJ^3=9k|43KmRvICTcL8%N>N`Ue_t~dqBd8VZ1m1LGw i;<5{ru0Z)8WF{z&u`#fM%Mb=5HU Date: Tue, 25 Feb 2020 16:20:43 -0500 Subject: [PATCH 0018/1390] added doctest to tf.signal.frame --- tensorflow/python/ops/signal/shape_ops.py | 32 +++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py index bb4ac7ec33c..89b35014005 100644 --- a/tensorflow/python/ops/signal/shape_ops.py +++ b/tensorflow/python/ops/signal/shape_ops.py @@ -68,29 +68,27 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1, For example: - ```python - # A batch size 3 tensor of 9152 audio samples. - audio = tf.random.normal([3, 9152]) - - # Compute overlapping frames of length 512 with a step of 180 (frames overlap - # by 332 samples). 
By default, only 49 frames are generated since a frame - # with start position j*180 for j > 48 would overhang the end. - frames = tf.signal.frame(audio, 512, 180) - frames.shape.assert_is_compatible_with([3, 49, 512]) - - # When pad_end is enabled, the final two frames are kept (padded with zeros). - frames = tf.signal.frame(audio, 512, 180, pad_end=True) - frames.shape.assert_is_compatible_with([3, 51, 512]) - ``` + >>> # A batch size 3 tensor of 9152 audio samples. + >>> audio = tf.random.normal([3, 9152]) + >>> + >>> # Compute overlapping frames of length 512 with a step of 180 (frames overlap + >>> # by 332 samples). By default, only 49 frames are generated since a frame + >>> # with start position j*180 for j > 48 would overhang the end. + >>> frames = tf.signal.frame(audio, 512, 180) + >>> frames.shape.assert_is_compatible_with([3, 49, 512]) + >>> + >>> # When pad_end is enabled, the final two frames are kept (padded with zeros). + >>> frames = tf.signal.frame(audio, 512, 180, pad_end=True) + >>> frames.shape.assert_is_compatible_with([3, 51, 512]) If the dimension along `axis` is N, and `pad_end=False`, the number of frames can be computed by: ```python - frames = 1 + (N - frame_size) // frame_step + num_frames = 1 + (N - frame_size) // frame_step ``` If `pad_end=True`, the number of frames can be computed by: ```python - frames = -(-N // frame_step) # ceiling division + num_frames = -(-N // frame_step) # ceiling division ``` Args: @@ -106,7 +104,7 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1, name: An optional name for the operation. Returns: - A `Tensor` of frames with shape `[..., frames, frame_length, ...]`. + A `Tensor` of frames with shape `[..., num_frames, frame_length, ...]`. Raises: ValueError: If `frame_length`, `frame_step`, `pad_value`, or `axis` are not From db0d4681210c831f505a455a0951fa37482184d6 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 17 Mar 2020 16:30:32 +0000 Subject: [PATCH 0019/1390] Corrected after merge with master. Tested: strict mode and non-strict mode. 
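For illustration only (not part of this change): a rough sketch of how a
converter user is expected to opt in to the 16x8 mode wired up here. The
OpsSet name follows this patch series, and saved_model_dir /
representative_dataset_gen are placeholders:

    import tensorflow as tf

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset_gen
    # 16-bit activations with 8-bit weights; also listing TFLITE_BUILTINS
    # lets unsupported ops fall back to float instead of failing.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8,
        tf.lite.OpsSet.TFLITE_BUILTINS,
    ]
    tflite_model = converter.convert()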
Change-Id: I7e03d08133f39cc65a18875e65ce5cdddaf2d6a4 --- tensorflow/lite/python/lite.py | 62 ++++++++++++++++--- .../optimize/calibration_wrapper_pybind11.cc | 8 +-- 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 7e5f8ce704f..900398d7a6f 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -180,11 +180,13 @@ class QuantizationMode(object): def post_training_int8_no_float(self): """Post training int8 quantize, disallow float fallback.""" return (self._is_int8_target_required() and + not self._is_int16x8_target_required() and self._representative_dataset is not None) def post_training_int8_allow_float(self): """Post training int8 quantize, allow float fallback.""" return (self._any_optimization_enabled() and + not self._is_int16x8_target_required() and self._representative_dataset is not None and self._smallest_supported_type() == constants.INT8) @@ -193,6 +195,18 @@ class QuantizationMode(object): return (self._any_optimization_enabled() and self._contains_training_quant_op()) + def post_training_int16x8_no_float(self): + """Post training int16x8 quantize, disallow float fallback.""" + return (not self._is_int8_target_required() and + self._is_int16x8_target_required() and + not self._is_allow_float() and + self._representative_dataset is not None) + + def post_training_int16x8_allow_float(self): + """Post training int16x8 quantize, allow float fallback.""" + return (self._is_int16x8_target_required() and + self._is_allow_float()) + def post_training_dynamic_range_int8(self): """Post training int8 const, on-the-fly int8 quantize of dynamic tensors.""" # Post-training dynamic range quantization is only enabled if post-training @@ -212,9 +226,14 @@ class QuantizationMode(object): return not (self.post_training_int8_no_float() or self.post_training_int8_allow_float() or self.training_time_int8_allow_float() or + self.post_training_int16x8_no_float() or + self.post_training_int16x8_allow_float() or self.post_training_dynamic_range_int8() or self.post_training_fp16()) + def activations_type(self): + return constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + # Below are helpers for the above functions. 
def _validate_int8_required(self): @@ -244,6 +263,18 @@ class QuantizationMode(object): self._target_spec.supported_ops) or set(self._target_spec.supported_types) == set([constants.INT8])) + def _is_int16x8_target_required(self): + return bool( + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) + + def _is_allow_float(self): + return bool( + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) + def _any_optimization_enabled(self): return bool( set(self._optimizations).intersection([ @@ -309,13 +340,13 @@ class TFLiteConverterBase(object): return _get_grappler_config(optimizers) def _calibrate_quantize_model(self, result, inference_input_type, - inference_output_type, allow_float): + inference_output_type, activations_type, allow_float): if not isinstance(self.representative_dataset, RepresentativeDataset): self.representative_dataset = RepresentativeDataset( self.representative_dataset) calibrate_quantize = _calibrator.Calibrator(result) - activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + if (self.experimental_calibrate_only: return calibrate_quantize.calibrate(self.representative_dataset.input_gen) else: @@ -608,12 +639,20 @@ class TFLiteConverterV2(TFLiteConverterBase): output_tensors=output_tensors, **converter_kwargs) + activations_type = quant_mode.activations_type() + if quant_mode.post_training_int8_no_float(): result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, False) + constants.FLOAT, activations_type, False) elif quant_mode.post_training_int8_allow_float(): result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, True) + constants.FLOAT, activations_type, True) + elif quant_mode.post_training_int16x8_no_float(): + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, activations_type, False) + elif quant_mode.post_training_int16x8_allow_float(): + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, activations_type, True) return result @@ -1114,6 +1153,8 @@ class TFLiteConverter(TFLiteConverterBase): quant_mode.post_training_int8_no_float() or quant_mode.post_training_int8_allow_float() or quant_mode.post_training_dynamic_range_int8() or + quant_mode.post_training_int16x8_no_float() or + quant_mode.post_training_int16x8_allow_float() or quant_mode.post_training_fp16()) if post_training_optimize: # Post training optimizations require that TOCO outputs a float model. 
@@ -1223,12 +1264,20 @@ class TFLiteConverter(TFLiteConverterBase): output_arrays=self._output_arrays, **converter_kwargs) + activations_type = quant_mode.activations_type() + if quant_mode.post_training_int8_no_float(): result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, False) + inference_output_type, activations_type, False) elif quant_mode.post_training_int8_allow_float(): result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, True) + inference_output_type, activations_type, True) + elif quant_mode.post_training_int16x8_no_float(): + result = self._calibrate_quantize_model(result, inference_input_type, + inference_output_type, activations_type, False) + elif quant_mode.post_training_int16x8_allow_float(): + result = self._calibrate_quantize_model(result, inference_input_type, + inference_output_type, activations_type, True) return result @@ -1334,7 +1383,6 @@ class TocoConverter(object): @classmethod @_deprecation.deprecated( - None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") def from_keras_model_file(cls, model_file, input_arrays=None, diff --git a/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc b/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc index f56b23090b9..9a8fea5d1f6 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc @@ -40,17 +40,17 @@ PYBIND11_MODULE(_pywrap_tensorflow_lite_calibration_wrapper, m) { }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, - bool allow_float, bool enable_mlir_quantizer) { + bool allow_float, int activations_py_type, bool enable_mlir_quantizer) { return tensorflow::pyo_or_throw( self.QuantizeModel(input_py_type, output_py_type, allow_float, - enable_mlir_quantizer)); + activations_py_type, enable_mlir_quantizer)); }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, - bool allow_float) { + bool allow_float, int activations_py_type) { return tensorflow::pyo_or_throw( self.QuantizeModel(input_py_type, output_py_type, allow_float, - /*enable_mlir_quantizer=*/false)); + activations_py_type, /*enable_mlir_quantizer=*/false)); }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, bool allow_float, From fea3433cfc42d1a7ed050779092f3c92b077cd48 Mon Sep 17 00:00:00 2001 From: Tomohiro Ubukata Date: Sun, 15 Mar 2020 05:22:41 +0000 Subject: [PATCH 0020/1390] Add error checks --- .../core/platform/cloud/curl_http_request.cc | 8 +++-- .../platform/default/posix_file_system.cc | 14 ++++++-- .../core/platform/default/subprocess.cc | 36 ++++++++++++++----- tensorflow/core/platform/path.cc | 4 ++- tensorflow/core/platform/platform_strings.cc | 4 ++- .../android_armv7a_cpu_utils_helper.cc | 8 +++-- 6 files changed, 56 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc index a227edb1fb0..39a1f7a35b2 100644 --- a/tensorflow/core/platform/cloud/curl_http_request.cc +++ b/tensorflow/core/platform/cloud/curl_http_request.cc @@ -166,7 +166,9 @@ CurlHttpRequest::~CurlHttpRequest() { libcurl_->curl_slist_free_all(resolve_list_); } if (put_body_) { - fclose(put_body_); + if (fclose(put_body_) != 0) { + LOG(ERROR) << "fclose() failed: " << strerror(errno); + } } if (curl_) { libcurl_->curl_easy_cleanup(curl_); @@ -237,7 +239,9 @@ Status 
CurlHttpRequest::SetPutFromFile(const string& body_filepath, is_method_set_ = true; method_ = RequestMethod::kPut; if (put_body_) { - fclose(put_body_); + if (fclose(put_body_) != 0) { + LOG(ERROR) << "fclose() failed: " << strerror(errno); + } } put_body_ = fopen(body_filepath.c_str(), "r"); if (!put_body_) { diff --git a/tensorflow/core/platform/default/posix_file_system.cc b/tensorflow/core/platform/default/posix_file_system.cc index 106a0412fb7..05c2b2762d4 100644 --- a/tensorflow/core/platform/default/posix_file_system.cc +++ b/tensorflow/core/platform/default/posix_file_system.cc @@ -51,7 +51,11 @@ class PosixRandomAccessFile : public RandomAccessFile { public: PosixRandomAccessFile(const string& fname, int fd) : filename_(fname), fd_(fd) {} - ~PosixRandomAccessFile() override { close(fd_); } + ~PosixRandomAccessFile() override { + if (close(fd_) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } + } Status Name(StringPiece* result) const override { *result = filename_; @@ -229,7 +233,9 @@ Status PosixFileSystem::NewReadOnlyMemoryRegionFromFile( } else { result->reset(new PosixReadOnlyMemoryRegion(address, st.st_size)); } - close(fd); + if (close(fd) < 0) { + s = IOError(fname, errno); + } } return s; } @@ -256,7 +262,9 @@ Status PosixFileSystem::GetChildren(const string& dir, result->push_back(entry->d_name); } } - closedir(d); + if (closedir(d) < 0) { + return IOError(dir, errno); + } return Status::OK(); } diff --git a/tensorflow/core/platform/default/subprocess.cc b/tensorflow/core/platform/default/subprocess.cc index 562f4cd2d0c..acf7073b9a4 100644 --- a/tensorflow/core/platform/default/subprocess.cc +++ b/tensorflow/core/platform/default/subprocess.cc @@ -102,11 +102,15 @@ void SubProcess::FreeArgs() { void SubProcess::ClosePipes() { for (int i = 0; i < kNFds; i++) { if (parent_pipe_[i] >= 0) { - close(parent_pipe_[i]); + if (close(parent_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } parent_pipe_[i] = -1; } if (child_pipe_[i] >= 0) { - close(child_pipe_[i]); + if (close(child_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } child_pipe_[i] = -1; } } @@ -215,7 +219,9 @@ bool SubProcess::Start() { running_ = true; for (int i = 0; i < kNFds; i++) { if (child_pipe_[i] >= 0) { - close(child_pipe_[i]); + if (close(child_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } child_pipe_[i] = -1; } } @@ -227,7 +233,9 @@ bool SubProcess::Start() { int devnull_fd = -1; for (int i = 0; i < kNFds; i++) { if (parent_pipe_[i] >= 0) { - close(parent_pipe_[i]); + if (close(parent_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } parent_pipe_[i] = -1; } @@ -242,7 +250,9 @@ bool SubProcess::Start() { _exit(1); } } - close(child_pipe_[i]); + if (close(child_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } child_pipe_[i] = -1; break; @@ -264,14 +274,18 @@ bool SubProcess::Start() { } } } else { - close(i); + if (close(i) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } } break; } } if (devnull_fd >= 0) { - close(devnull_fd); + if (close(devnull_fd) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } } // Execute the child program. @@ -379,7 +393,9 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output, // Special case: if no data is given to send to the child process, // close the pipe to unblock the child, and skip the file descriptor. 
if (stdin_input == nullptr) { - close(parent_pipe_[i]); + if (close(parent_pipe_[i]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } parent_pipe_[i] = -1; continue; } @@ -441,7 +457,9 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output, fds[i].fd = -1; fd_remain--; // Close the child's stdin pipe to unblock the process. - close(parent_pipe_[CHAN_STDIN]); + if (close(parent_pipe_[CHAN_STDIN]) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } parent_pipe_[CHAN_STDIN] = -1; } } else if (!retry(errno)) { diff --git a/tensorflow/core/platform/path.cc b/tensorflow/core/platform/path.cc index 1e88328aace..a041ac67d72 100644 --- a/tensorflow/core/platform/path.cc +++ b/tensorflow/core/platform/path.cc @@ -320,7 +320,9 @@ string GetTempFilename(const string& extension) { if (fd < 0) { LOG(FATAL) << "Failed to create temp file."; } else { - close(fd); + if (close(fd) < 0) { + LOG(ERROR) << "close() failed: " << strerror(errno); + } return tmp_filepath; } } diff --git a/tensorflow/core/platform/platform_strings.cc b/tensorflow/core/platform/platform_strings.cc index 489a211ccf7..af8787f4fbc 100644 --- a/tensorflow/core/platform/platform_strings.cc +++ b/tensorflow/core/platform/platform_strings.cc @@ -52,7 +52,9 @@ int GetPlatformStrings(const std::string& path, } result = (ferror(ifp) == 0) ? 0 : errno; - fclose(ifp); + if (fclose(ifp) != 0) { + result = errno; + } } else { result = errno; } diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc index 0534443d17c..f75d3533d17 100644 --- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc +++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc @@ -118,10 +118,14 @@ int64 AndroidArmV7ACpuUtilsHelper::ReadCpuFrequencyFile( const int retval = fscanf(fp, "%lld", &freq_in_khz); if (retval < 0) { LOG(WARNING) << "Failed to \"" << file_path << "\""; - fclose(fp); + if (fclose(fp) != 0) { + LOG(WARNING) << "fclose() failed: " << strerror(errno); + } return INVALID_CPU_FREQUENCY; } - fclose(fp); + if (fclose(fp) != 0) { + LOG(WARNING) << "fclose() failed: " << strerror(errno); + } return freq_in_khz * 1000; // The file contains cpu frequency in khz } From e5e81a3b52cabd767ce504a209f3b1f1bf9fa58b Mon Sep 17 00:00:00 2001 From: khaled besrour Date: Sat, 21 Mar 2020 18:13:50 +0100 Subject: [PATCH 0021/1390] Bug Fix : resolve cast error by adding cleaner type for MKL --- tensorflow/core/kernels/mkl_conv_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index 7d1e19566ee..2ee2a621067 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -51,7 +51,7 @@ using mkldnn::stream; namespace tensorflow { #ifdef ENABLE_MKLDNN_V1 -#define MKLDNN_SIZE_DTYPE long int +#define MKLDNN_SIZE_DTYPE memory::dim #else #define MKLDNN_SIZE_DTYPE int #endif // ENABLE_MKLDNN_V1 From ce81212fdfe7c5e84e354b1a52b24299a20899e9 Mon Sep 17 00:00:00 2001 From: khaled besrour Date: Sat, 21 Mar 2020 18:19:39 +0100 Subject: [PATCH 0022/1390] Bug Fix : correct compilation erreur by using dynamic allocation --- tensorflow/core/util/mkl_util.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index e0a399f2d6c..a1b6cc758f8 100644 --- 
a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1260,8 +1260,8 @@ inline Status CreateBlockedMemDescHelper(const memory::dims& dim, DCHECK_EQ(dim.size(), strides.size()); #ifdef ENABLE_MKLDNN_V1 const int kNumDims = dim.size(); - mkldnn_dim_t input_dims[kNumDims]; - mkldnn_dim_t input_strides[kNumDims]; + mkldnn_dim_t* input_dims = new mkldnn_dim_t[kNumDims]; + mkldnn_dim_t* input_strides = new mkldnn_dim_t[kNumDims]; for (int i = 0; i < kNumDims; ++i) { input_dims[i] = dim[i]; input_strides[i] = strides[i]; @@ -1270,7 +1270,11 @@ inline Status CreateBlockedMemDescHelper(const memory::dims& dim, mkldnn_memory_desc_init_by_strides(blocked_md, kNumDims, input_dims, memory::convert_to_c(dtype), input_strides); + delete[] input_dims; + delete[] input_strides; } catch (mkldnn::error& e) { + delete[] input_dims; + delete[] input_strides; return Status(error::Code::INTERNAL, tensorflow::strings::StrCat( "Failed to create blocked memory descriptor.", From 6da9fa6111ab1be251b1138a0a591bc022b58cdf Mon Sep 17 00:00:00 2001 From: Peng Meng Date: Thu, 26 Mar 2020 00:05:02 +0800 Subject: [PATCH 0023/1390] fixOpFuse --- .../core/grappler/optimizers/remapper.cc | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 5b41ad38089..b47c71fd5c8 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1476,6 +1476,13 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return true; }; + const auto is_conv2d_candidate = [&]() -> bool { + if (!IsConv2D(*node_def)) return false; + if (GetDataTypeFromAttr(*node_def, "T") != DT_FLOAT) return false; + + return true; + }; + // Candidate for a FusedBatchNorm fusion. const auto is_batch_norm_fusion_candidate = [&]() -> bool { if (!IsRelu(*node_def)) return false; @@ -1506,7 +1513,7 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return false; }; - return is_batch_norm_candidate() || is_batch_norm_fusion_candidate(); + return is_conv2d_candidate() || is_batch_norm_candidate() || is_batch_norm_fusion_candidate(); } } // namespace @@ -1564,6 +1571,17 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, } #endif //! INTEL_MKL + // Infer properties lazily in case they are not needed. + if (!ctx.inferred_graph_properties && RequiresInferredShapes(ctx, i)) { + const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE; + TF_RETURN_IF_ERROR(ctx.graph_properties.InferStatically( + assume_valid_feeds, + /*aggressive_shape_inference=*/false, + /*include_input_tensor_values=*/true, + /*include_output_tensor_values=*/false)); + ctx.inferred_graph_properties = true; + } + // Remap {Conv2D,MatMul}+BiasAdd into the _Fused{Conv2D,MatMul} ContractionWithBiasAdd contract_with_bias; if (allow_non_differentiable_rewrites && @@ -1592,6 +1610,7 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, // Remove this once TF-MKL supports _FusedConv2D with these operations. #ifndef INTEL_MKL // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze. 
+ ContractionWithSqueezeAndBiasAdd contract_with_squeeze_and_bias; if (allow_non_differentiable_rewrites && FindConv2DWithSqueezeAndBias(ctx, i, &contract_with_squeeze_and_bias)) { @@ -1624,16 +1643,6 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, } #endif // !INTEL_MKL - // Infer properties lazily in case they are not needed. - if (!ctx.inferred_graph_properties && RequiresInferredShapes(ctx, i)) { - const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE; - TF_RETURN_IF_ERROR(ctx.graph_properties.InferStatically( - assume_valid_feeds, - /*aggressive_shape_inference=*/false, - /*include_input_tensor_values=*/true, - /*include_output_tensor_values=*/false)); - ctx.inferred_graph_properties = true; - } // Remap FusedBatchNorm++ into the _FusedBatchNormEx. FusedBatchNormEx fused_batch_norm_ex; From 5673a09ff777ee08660ce9f71d90958e18bd995c Mon Sep 17 00:00:00 2001 From: Peng Meng Date: Thu, 26 Mar 2020 00:30:54 +0800 Subject: [PATCH 0024/1390] fix code format --- tensorflow/core/grappler/optimizers/remapper.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index b47c71fd5c8..fdfb8e379b2 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1513,7 +1513,8 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return false; }; - return is_conv2d_candidate() || is_batch_norm_candidate() || is_batch_norm_fusion_candidate(); + return is_conv2d_candidate() || is_batch_norm_candidate() || + is_batch_norm_fusion_candidate(); } } // namespace From 69ee4de053a14bdf883a0e6726bb2b374b71c973 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Fri, 27 Mar 2020 15:59:37 +0000 Subject: [PATCH 0025/1390] Fix for the broken 16-bit interface after latest changes to master. 
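For context only (not part of this change): a rough sketch of how the Python
calibration wrapper is expected to be driven with 16-bit activations once this
fix is in. Module paths follow this patch series, the argument order is
illustrative, and float_model_bytes / representative_dataset_gen are
placeholders:

    from tensorflow.lite.python import lite_constants as constants
    from tensorflow.lite.python.optimize import calibrator as _calibrator

    quantizer = _calibrator.Calibrator(float_model_bytes)
    # Float model I/O, float fallback for unsupported ops, and 16-bit
    # activations (weights stay 8-bit).
    quantized_model = quantizer.calibrate_and_quantize(
        representative_dataset_gen,
        constants.FLOAT,   # inference_input_type
        constants.FLOAT,   # inference_output_type
        True,              # allow_float
        constants.INT16)   # activations_type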
--- .../python/optimize/calibration_wrapper.cc | 14 ++--- .../lite/tools/optimize/quantize_model.cc | 12 ++-- .../lite/tools/optimize/quantize_model.h | 12 ++-- .../tools/optimize/quantize_model_test.cc | 55 ++++++++++--------- 4 files changed, 49 insertions(+), 44 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 660ea8d2d1b..ad82581bcba 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -233,12 +233,11 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, reader_->AddCalibrationToModel(tflite_model.get(), /*update=*/false); flatbuffers::FlatBufferBuilder builder; auto status = kTfLiteOk; - - status = tflite::optimize::QuantizeModel( - &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), - TfLiteTypeToSchemaType(output_type), allow_float, - TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); - } + + status = tflite::optimize::QuantizeModelAllOperators( + &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), + TfLiteTypeToSchemaType(output_type), allow_float, + TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); @@ -269,8 +268,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - TensorType_INT8, - error_reporter_.get()); + TensorType_INT8, error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 32cd2b8c25a..0892e7ae52a 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -1240,11 +1240,13 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, return kTfLiteOk; } -TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& input_type, - const TensorType& output_type, bool allow_float, - const TensorType& activations_type, - ErrorReporter* error_reporter) { +TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, + const TensorType& input_type, + const TensorType& output_type, + bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, allow_float, GetAllOperatorOutputs(model), activations_type, error_reporter); diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index 06c30b88fd0..29f581d2b35 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -69,11 +69,13 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, // could be TensorType_INT16 or TensorType_INT8. // // Note: This is a private API, subject to change. 
-TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& input_type, - const TensorType& output_type, bool allow_float, - const TensorType& activations_type, - ErrorReporter* error_reporter); +TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, + const TensorType& input_type, + const TensorType& output_type, + bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter); // Quantizes input_model and populates the provided builder with the new model // with all possible input parameters. diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 0f780e4d3da..885fa98992c 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -190,9 +190,9 @@ TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { } TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = - QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - /*allow_float*/ false, tensor_type_, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, tensor_type_, tensor_type_, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { @@ -209,9 +209,9 @@ TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { } TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float*/ false, tensor_type_, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -384,8 +384,9 @@ class QuantizeConcatModelTest : public QuantizeModelTest, // concat - output // input1 / TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - false, tensor_type_, &error_reporter_); + auto status = + QuantizeModelAllOperators(&builder_, &model_, tensor_type_, tensor_type_, + false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. 
@@ -549,9 +550,9 @@ class QuantizeConvModel1Test : public QuantizeModelTest { }; TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, false, - TensorType_INT8, &error_reporter_); + auto status = QuantizeModelAllOperators(&builder_, &model_, TensorType_INT8, + TensorType_INT8, false, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -658,8 +659,9 @@ INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, TensorType_INT16})); TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - false, tensor_type_, &error_reporter_); + auto status = + QuantizeModelAllOperators(&builder_, &model_, tensor_type_, tensor_type_, + false, tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -1308,8 +1310,9 @@ TEST_F(QuantizeFCTest, VerifyFC) { EXPECT_EQ(model_.operator_codes[1]->version, 1); } -class QuantizeCustomOpTest : public QuantizeModelTest, - public ::testing::WithParamInterface { +class QuantizeCustomOpTest + : public QuantizeModelTest, + public ::testing::WithParamInterface { protected: QuantizeCustomOpTest() { input_model_ = ReadModel(internal::kModelMixed); @@ -1319,9 +1322,9 @@ class QuantizeCustomOpTest : public QuantizeModelTest, }; TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { - auto status = - QuantizeModel(&builder_, &model_, GetParam(), GetParam(), - /*allow_float=*/true, GetParam(), &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, GetParam(), GetParam(), + /*allow_float=*/true, GetParam(), &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1335,7 +1338,7 @@ TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { BuiltinOperator_CUSTOM, BuiltinOperator_CUSTOM, BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE}; const std::vector op_input_types = { - GetParam(), GetParam(), TensorType_FLOAT32, + GetParam(), GetParam(), TensorType_FLOAT32, TensorType_FLOAT32, TensorType_FLOAT32, GetParam()}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); @@ -1358,9 +1361,9 @@ class QuantizeOp16x8Test : public QuantizeModelTest { }; TEST_F(QuantizeOp16x8Test, VerifyMixedQuantization16x8) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT16, TensorType_FLOAT32, - /*allow_float=*/true, TensorType_INT16, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, TensorType_INT16, TensorType_FLOAT32, + /*allow_float=*/true, TensorType_INT16, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1369,11 +1372,11 @@ TEST_F(QuantizeOp16x8Test, VerifyMixedQuantization16x8) { // The resulting model should be: // conv_2d->dequantize->log_softmax ASSERT_EQ(subgraph->operators.size(), 3); - const std::vector op_codes = { - BuiltinOperator_CONV_2D, BuiltinOperator_DEQUANTIZE, - BuiltinOperator_LOG_SOFTMAX}; + const std::vector op_codes = {BuiltinOperator_CONV_2D, + BuiltinOperator_DEQUANTIZE, + BuiltinOperator_LOG_SOFTMAX}; const std::vector op_input_types = { - TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; + 
TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, From e6fd34c57c4ad7402c2c127a3380cfba65ecdb55 Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Wed, 11 Sep 2019 09:43:52 +0200 Subject: [PATCH 0026/1390] Add support for offline planned tensor allocations By adding metadata to the model, it is possible to set arena offset for each tensor. Change-Id: Idd646c00a6e34e0c2603896d748cd5680a57f015 --- .../memory_planner/greedy_memory_planner.cc | 181 +++++++++++------- .../memory_planner/greedy_memory_planner.h | 19 +- tensorflow/lite/micro/micro_allocator.cc | 82 +++++++- 3 files changed, 210 insertions(+), 72 deletions(-) diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc index faea73e9169..7763afa2075 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc @@ -42,8 +42,8 @@ GreedyMemoryPlanner::GreedyMemoryPlanner(unsigned char* scratch_buffer, int scratch_buffer_size) : buffer_count_(0), need_to_calculate_offsets_(true) { const int per_buffer_size = sizeof(BufferRequirements) + // requirements_ - sizeof(int) + // buffer_sizes_sorted_by_size_ - sizeof(int) + // buffer_ids_sorted_by_size_ + sizeof(int) + // buffer_sizes_sorted_ + sizeof(int) + // buffer_ids_sorted_ sizeof(ListEntry) + // buffers_sorted_by_offset_ sizeof(int); // buffer_offsets_; // Allocate the arrays we need within the scratch buffer arena. @@ -53,10 +53,10 @@ GreedyMemoryPlanner::GreedyMemoryPlanner(unsigned char* scratch_buffer, requirements_ = reinterpret_cast(next_free); next_free += sizeof(BufferRequirements) * max_buffer_count_; - buffer_sizes_sorted_by_size_ = reinterpret_cast(next_free); + buffer_sizes_sorted_ = reinterpret_cast(next_free); next_free += sizeof(int) * max_buffer_count_; - buffer_ids_sorted_by_size_ = reinterpret_cast(next_free); + buffer_ids_sorted_ = reinterpret_cast(next_free); next_free += sizeof(int) * max_buffer_count_; buffers_sorted_by_offset_ = reinterpret_cast(next_free); @@ -81,11 +81,24 @@ TfLiteStatus GreedyMemoryPlanner::AddBuffer( current->size = size; current->first_time_used = first_time_used; current->last_time_used = last_time_used; + current->offline_offset = kOnlinePlannedBuffer; ++buffer_count_; need_to_calculate_offsets_ = true; return kTfLiteOk; } +TfLiteStatus GreedyMemoryPlanner::AddBuffer( + tflite::ErrorReporter* error_reporter, int size, int first_time_used, + int last_time_used, int offline_offset) { + BufferRequirements* current = &requirements_[buffer_count_]; + if (AddBuffer(error_reporter, size, first_time_used, last_time_used) != + kTfLiteOk) { + return kTfLiteError; + } + current->offline_offset = offline_offset; + return kTfLiteOk; +} + bool GreedyMemoryPlanner::DoesEntryOverlapInTime( const GreedyMemoryPlanner::ListEntry* entry, const int first_time_used, const int last_time_used) const { @@ -107,7 +120,7 @@ GreedyMemoryPlanner::NextSimultaneouslyActiveBuffer( ListEntry* result = nullptr; ListEntry* candidate_next_entry; if (start == nullptr) { - candidate_next_entry = &buffers_sorted_by_offset_[0]; + candidate_next_entry = &buffers_sorted_by_offset_[first_entry_index_]; } else { if (start->next_entry_index == -1) { return nullptr; @@ -139,29 +152,51 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() { // This helps 
find a more compact layout. Intuitively, you can think // about putting the large buffers in place first, and then the // smaller buffers can fit in the gaps, rather than fragmenting the - // gaps with small buffers at the beginning. + // gaps with small buffers at the beginning. Add offline planned offsets + // first in the list, since they have a predetermined offset. + int idx_from_tail = buffer_count_; + int idx_from_head = 0; for (int i = 0; i < buffer_count_; ++i) { - buffer_sizes_sorted_by_size_[i] = requirements_[i].size; - buffer_ids_sorted_by_size_[i] = i; - buffer_offsets_[i] = -1; + if (requirements_[i].offline_offset == kOnlinePlannedBuffer) { + idx_from_tail--; + buffer_sizes_sorted_[idx_from_tail] = requirements_[i].size; + buffer_ids_sorted_[idx_from_tail] = i; + buffer_offsets_[i] = -1; + } else { + buffer_sizes_sorted_[idx_from_head] = requirements_[i].size; + buffer_ids_sorted_[idx_from_head] = i; + buffer_offsets_[i] = requirements_[i].offline_offset; + idx_from_head++; + } } - // This sorting algorithm is naive, and may end up taking a very long time - // with hundreds of buffers. - ReverseSortInPlace(buffer_sizes_sorted_by_size_, buffer_ids_sorted_by_size_, - buffer_count_); - // Put the largest buffer at offset zero to start the process. - ListEntry* first_entry = &buffers_sorted_by_offset_[0]; - first_entry->offset = 0; - first_entry->requirements_index = buffer_ids_sorted_by_size_[0]; - first_entry->next_entry_index = -1; + // This sorting algorithm is naive, and may end up taking a very long time + // with hundreds of buffers. Do not sort the offline planned offsets. + ReverseSortInPlace(&buffer_sizes_sorted_[idx_from_head], + &buffer_ids_sorted_[idx_from_head], + buffer_count_ - idx_from_head); + + // Initialize the first entry to the first buffer in + // buffer_ids_sorted_. + // - If there are no offline planned offsets, the largest buffer will be + // first, and the buffers will be handled in size order. + // - If offline offsets are present, these will be handled first in order + // for the greedy algorithm to utilized gaps in the offline plan. + first_entry_index_ = 0; next_free_entry_ = 1; - buffer_offsets_[buffer_ids_sorted_by_size_[0]] = 0; + ListEntry* first_entry = &buffers_sorted_by_offset_[first_entry_index_]; + first_entry->next_entry_index = -1; // to mark the entry as end of list + int buffer_id = buffer_ids_sorted_[0]; + first_entry->requirements_index = buffer_id; + if (requirements_[buffer_id].offline_offset == kOnlinePlannedBuffer) { + buffer_offsets_[buffer_id] = 0; + } + first_entry->offset = buffer_offsets_[buffer_id]; // Work through the rest of the buffers to find a good gap to place each one. for (int i = 1; i < buffer_count_; ++i) { // The id is the order the buffer was originally added by the client. - const int buffer_id = buffer_ids_sorted_by_size_[i]; + const int buffer_id = buffer_ids_sorted_[i]; // Look at what size and time range the buffer needs to be active. BufferRequirements* wanted_requirements = &requirements_[buffer_id]; const int wanted_size = wanted_requirements->size; @@ -173,37 +208,43 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() { // so that it's easy to find the next buffer in memory, and so the gap. // The candidate_entry variable holds the buffer that we're considering // placing the current buffer after. - ListEntry* prior_entry = nullptr; + int candidate_offset = 0; // Loop through the offset-ordered list of buffers, looking for gaps. - while (true) { - // Find out what the next active buffer is. 
- ListEntry* next_entry = NextSimultaneouslyActiveBuffer( - prior_entry, wanted_first_time_used, wanted_last_time_used); + if (wanted_requirements->offline_offset == kOnlinePlannedBuffer) { + ListEntry* prior_entry = nullptr; + while (true) { + // Find out what the next active buffer is. + ListEntry* next_entry = NextSimultaneouslyActiveBuffer( + prior_entry, wanted_first_time_used, wanted_last_time_used); - if (prior_entry) { - BufferRequirements* candidate_requirements = - &requirements_[prior_entry->requirements_index]; - const int prior_entry_offset = - prior_entry->offset + candidate_requirements->size; - if (prior_entry_offset > candidate_offset) { - candidate_offset = prior_entry_offset; + if (prior_entry) { + BufferRequirements* candidate_requirements = + &requirements_[prior_entry->requirements_index]; + const int prior_entry_offset = + prior_entry->offset + candidate_requirements->size; + if (prior_entry_offset > candidate_offset) { + candidate_offset = prior_entry_offset; + } } + if (next_entry == nullptr) { + // We're at the end of the list, so we can always append the buffer + // here. + break; + } + // Find out how much space there is between us and the next buffer. + const int gap = next_entry->offset - candidate_offset; + if (gap >= wanted_size) { + // This entry has a big enough gap between it and the next, so + // use it! + break; + } + // The gap wasn't big enough, so move on to another candidate. + prior_entry = next_entry; } - if (next_entry == nullptr) { - // We're at the end of the list, so we can always append the buffer - // here. - break; - } - // Find out how much space there is between us and the next buffer. - const int gap = next_entry->offset - candidate_offset; - if (gap >= wanted_size) { - // This entry has a big enough gap between it and the next, so - // use it! - break; - } - // The gap wasn't big enough, so move on to another candidate. - prior_entry = next_entry; + } else { + // Offline planned offset are to be considered constant + candidate_offset = wanted_requirements->offline_offset; } // At this point, we've either found a gap (possibly at the end of the // list) and want to place the buffer there, or there are no other active @@ -217,26 +258,36 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() { new_entry->requirements_index = buffer_id; const int new_entry_index = next_free_entry_; ++next_free_entry_; - ListEntry* current_entry = first_entry; - // Make sure that we insert the buffer at the correct place in the ordered - // list. - while (true) { - const int next_entry_index = current_entry->next_entry_index; - if (next_entry_index == -1) { - // We're at the end of the list, so just add the new entry here. - current_entry->next_entry_index = new_entry_index; - new_entry->next_entry_index = -1; - break; + + if (first_entry->offset > candidate_offset) { + // The new entry offset is smaller than the first entry offset => + // replace the first entry + first_entry = new_entry; + first_entry->next_entry_index = first_entry_index_; + first_entry_index_ = new_entry_index; + } else { + ListEntry* current_entry = first_entry; + // Make sure that we insert the buffer at the correct place in the + // buffer-offset-ordered list + while (true) { + const int next_entry_index = current_entry->next_entry_index; + if (next_entry_index == -1) { + // We're at the end of the list, so just add the new entry here. 
+ current_entry->next_entry_index = new_entry_index; + new_entry->next_entry_index = -1; + break; + } + // not at the end of the list -> take a look at next entry + ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index]; + if (next_entry->offset > candidate_offset) { + // We're at the right spot to do an insertion and retain the sorting + // order, so place the new entry here. + new_entry->next_entry_index = current_entry->next_entry_index; + current_entry->next_entry_index = new_entry_index; + break; + } + current_entry = next_entry; } - ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index]; - if (next_entry->offset > candidate_offset) { - // We're at the right spot to do an insertion and retain the sorting - // order, so place the new entry here. - new_entry->next_entry_index = current_entry->next_entry_index; - current_entry->next_entry_index = new_entry_index; - break; - } - current_entry = next_entry; } } } diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h index f2c77ed94f3..d874b70e732 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h @@ -21,6 +21,8 @@ limitations under the License. namespace tflite { +constexpr int kOnlinePlannedBuffer = -1; + // A memory planner that uses a greedy algorithm to arrange buffers in memory // to minimize the overall arena size needed. // @@ -59,6 +61,12 @@ class GreedyMemoryPlanner : public MemoryPlanner { TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size, int first_time_used, int last_time_used) override; + // Record details of an offline planned buffer offset we want to place. + // offline_offset is the buffer offset from the start of the arena. + TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size, + int first_time_used, int last_time_used, + int offline_offset); + // Returns the high-water mark of used memory. This is the minimum size of a // memory arena you'd need to allocate to hold these buffers. size_t GetMaximumMemorySize() override; @@ -110,16 +118,23 @@ class GreedyMemoryPlanner : public MemoryPlanner { // Records the client-provided information about each buffer. struct BufferRequirements { int size; + int offline_offset; int first_time_used; int last_time_used; }; // Working arrays used during the layout algorithm. BufferRequirements* requirements_; - int* buffer_sizes_sorted_by_size_; - int* buffer_ids_sorted_by_size_; + // buffer_sizes_sorted_ and buffer_ids_sorted_ are sorted according to: + // { + // offline planned buffers, + // online planned buffers sorted by size + // } + int* buffer_sizes_sorted_; + int* buffer_ids_sorted_; ListEntry* buffers_sorted_by_offset_; int next_free_entry_; + int first_entry_index_; // Stores the outcome of the plan, the location of each buffer in the arena. int* buffer_offsets_; diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index c3044a0351f..46edec2bf43 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -37,6 +37,7 @@ struct AllocationInfo { int last_used; bool needs_allocating; void** output_ptr; + int offline_offset; }; // We align tensor buffers to 16-byte boundaries, since this is a common @@ -112,9 +113,17 @@ class AllocationInfoBuilder { return Allocate(); } + // Check if model contains offline planned buffer offsets. 
+ // - If there's no metadata available, offline_planner_offsets is not set + // - If there's metadata available, offline_planner_offsets will point to the + // first offset in the metadata buffer list. + TfLiteStatus GetOfflinePlannedOffsets(const Model* model, + int** offline_planner_offsets); + // Add allocaiton information for the tensors. - TfLiteStatus AddTensors(const SubGraph* subgraph, + TfLiteStatus AddTensors(const SubGraph* subgraph, int* offline_offsets, TfLiteTensor* runtime_tensors); + // Add allocation information for the scratch buffers. TfLiteStatus AddScratchBuffers(internal::ScratchBufferHandle* buffer_handles); @@ -148,6 +157,7 @@ TfLiteStatus AllocationInfoBuilder::Allocate() { } TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, + int* offline_offsets, TfLiteTensor* runtime_tensors) { // Set up allocation info for all tensors. for (size_t i = 0; i < tensor_count_; ++i) { @@ -159,6 +169,11 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, current->last_used = -1; current->needs_allocating = (runtime_tensors[i].data.raw == nullptr) && (!subgraph->tensors()->Get(i)->is_variable()); + if (offline_offsets) { + current->offline_offset = offline_offsets[i]; + } else { + current->offline_offset = kOnlinePlannedBuffer; + } } for (size_t i = 0; i < subgraph->inputs()->size(); ++i) { @@ -216,6 +231,51 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, return kTfLiteOk; } +// The tensor offsets will be encoded in the metadata:[Metadata] field of the +// Model. The following encoding applies: +// +// | Metadata component | Value | +// | name:string | “OfflineMemoryAllocation” | +// | buffer:unit | Index of buffer containing memory allocation data | +// +// The buffer contents for the memory allocation is a list of 32-bit integers of +// the following format: +// +// | Offset | Value | +// | 0 | Offline allocation format version – set to 0 | +// | 1 | Subgraph index to which this allocation applies | +// | 2 | Number offsets following: n | +// | 3 | Arena byte offset of tensor #0 or -1 to allocate at runtime | +// | 4 | Arena byte offset of tensor #1 or -1 to allocate at runtime | +// | 3+(n-1) | Arena byte offset of tensor #(n-1) or -1 to allocate at runtime | +TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( + const Model* model, int** offline_planner_offsets) { + if (model->metadata()) { + for (int i = 0; i < model->metadata()->size(); ++i) { + auto metadata = model->metadata()->Get(i); + if (strncmp(metadata->name()->c_str(), "OfflineMemoryAllocation", + strlen("OfflineMemoryAllocation")) == 0) { + const flatbuffers::Vector>* buffers = + model->buffers(); + auto* buffer = (*buffers)[metadata->buffer()]; + auto* array = buffer->data(); + const uint32_t* metadata_buffer = (uint32_t*)array->data(); + const int32_t nbr_tensors = metadata_buffer[2]; + *offline_planner_offsets = (int32_t*)&metadata_buffer[3]; + + if (tensor_count_ != nbr_tensors) { + TF_LITE_REPORT_ERROR(reporter_, + "Nbr of offline buffer offsets (%d) in metadata " + "not equal nbr tensors (%d)\n", + nbr_tensors, tensor_count_); + return kTfLiteError; + } + } + } + } + return kTfLiteOk; +} + TfLiteStatus AllocationInfoBuilder::AddScratchBuffers( internal::ScratchBufferHandle* buffer_handles) { // Set up allocation info for buffers. 
@@ -241,9 +301,17 @@ TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MemoryPlanner* planner, if (current->needs_allocating) { size_t aligned_bytes_required = AlignSizeUp(current->bytes, kBufferAlignment); - TF_LITE_ENSURE_STATUS( - planner->AddBuffer(error_reporter, aligned_bytes_required, - current->first_created, current->last_used)); + if (current->offline_offset == kOnlinePlannedBuffer) { + TF_LITE_ENSURE_STATUS( + planner->AddBuffer(error_reporter, aligned_bytes_required, + current->first_created, current->last_used)); + } else { + TF_LITE_ENSURE_STATUS( + (static_cast(planner)) + ->AddBuffer(error_reporter, aligned_bytes_required, + current->first_created, current->last_used, + current->offline_offset)); + } } } return kTfLiteOk; @@ -546,7 +614,11 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { AllocationInfoBuilder builder(error_reporter_, &tmp_allocator); TF_LITE_ENSURE_STATUS( builder.Init(tensors_->size(), scratch_buffer_count_)); - TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph_, context_->tensors)); + int* offline_planner_offsets = nullptr; + TF_LITE_ENSURE_STATUS( + builder.GetOfflinePlannedOffsets(model_, &offline_planner_offsets)); + TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph_, offline_planner_offsets, + context_->tensors)); TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_handles_)); const AllocationInfo* allocation_info = builder.Finish(); From 2ac92c48cd31721ec882b110e2af95edc505dc66 Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Wed, 25 Mar 2020 12:30:25 +0100 Subject: [PATCH 0027/1390] Add helper functions in micro allocator One function to check metadata correctness and another to print model data. Change-Id: I2500fbbac25b376d068e3d9a1d190249da461eef --- tensorflow/lite/micro/micro_allocator.cc | 88 ++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 46edec2bf43..a9324eb35cd 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -96,6 +96,94 @@ TfLiteStatus AllocateVariables( return kTfLiteOk; } +// Helper function to print model flatbuffer data. This function is not called +// by default. Hence it's not linked in to the final binary code. +void PrintModelData(const Model* model, ErrorReporter* error_reporter) { + auto* subgraphs = model->subgraphs(); + const SubGraph* subgraph = (*subgraphs)[0]; + const flatbuffers::Vector>* tensors = + subgraph->tensors(); + const flatbuffers::Vector>* buffers = + model->buffers(); + TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: ====="); + for (int i = 0; i < tensors->size(); ++i) { + const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i); + auto* quantization = flatbuffer_tensor.quantization(); + size_t type_size, tensor_size; + auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]; + auto* array = buffer->data(); + int array_size = 0; + if (array) { + array_size = array->size(); + } + BytesRequiredForTensor(flatbuffer_tensor, &tensor_size, &type_size, + error_reporter); + TF_LITE_REPORT_ERROR( + error_reporter, "Tensor index: %d arena tensor %d size %d", i, + !array_size && !flatbuffer_tensor.is_variable(), tensor_size); + } +} + +// Helper function to check flatbuffer metadata correctness. This function is +// not called by default. Hence it's not linked in to the final binary code. 
+TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, + ErrorReporter* error_reporter) { + if (model->metadata()) { + for (int i = 0; i < model->metadata()->size(); ++i) { + auto metadata = model->metadata()->Get(i); + if (strncmp(metadata->name()->c_str(), "OfflineMemoryAllocation", + strlen("OfflineMemoryAllocation")) == 0) { + auto* subgraphs = model->subgraphs(); + const SubGraph* subgraph = (*subgraphs)[0]; + const flatbuffers::Vector>* tensors = + subgraph->tensors(); + const flatbuffers::Vector>* buffers = + model->buffers(); + int nbr_tflite_tensors = tensors->size(); + auto* buffer = (*buffers)[metadata->buffer()]; + auto* array = buffer->data(); + const uint32_t* metadata_buffer = (uint32_t*)array->data(); + int version = metadata_buffer[0]; + int subgraph_idx = metadata_buffer[1]; + const int nbr_offline_offsets = metadata_buffer[2]; + int* offline_planner_offsets = (int*)&metadata_buffer[3]; + + TF_LITE_REPORT_ERROR(error_reporter, "==== Model metadata info: ====="); + TF_LITE_REPORT_ERROR(error_reporter, + "Offline planner metadata found, version %d, " + "subgraph %d, nbr offline offsets %d", + version, subgraph_idx, nbr_offline_offsets); + for (int i = 0; i < nbr_offline_offsets; ++i) { + TF_LITE_REPORT_ERROR( + error_reporter, + "Offline planner tensor index %d, offline offset: %d", i, + offline_planner_offsets[i]); + } + + if (version != 1) { + TF_LITE_REPORT_ERROR(error_reporter, "Version not supported! (%d)\n", + version); + return kTfLiteError; + } + if (subgraph_idx != 0) { + TF_LITE_REPORT_ERROR(error_reporter, + "Only 1 subgraph supported! Subgraph idx (%d)\n", + subgraph_idx); + return kTfLiteError; + } + if (nbr_tflite_tensors != nbr_offline_offsets) { + TF_LITE_REPORT_ERROR(error_reporter, + "Nbr of offline buffer offsets (%d) in metadata " + "not equal nbr tensors (%d)\n", + nbr_offline_offsets, nbr_tflite_tensors); + return kTfLiteError; + } + } + } + } + return kTfLiteOk; +} + // A helper class to construct AllocationInfo array. This array contains the // lifetime of tensors / scratch_buffer and will be used to calculate the memory // plan. Methods need to be called in order from `Init`, `Add*`, to `Finish`. From dd673e306a0fec5b87f9c979d4c29524c575b766 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Fri, 6 Mar 2020 14:04:46 +0100 Subject: [PATCH 0028/1390] OfflinePlanner: Swap offsets in TestAllocationForModelsWithBranches The offline planner sorts the tensors in reverse order, so the testcase have to be updated accordingly. Change-Id: Ic3a1193489d6ad5f592db1c9a289b01083ad9c62 --- tensorflow/lite/micro/micro_allocator_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 47eefff90b5..765a447c044 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -185,10 +185,10 @@ TF_LITE_MICRO_TEST(TestAllocationForModelsWithBranches) { // bytes = 2 * 2 * 3 * sizeof(float32) = 48, same for other tensors. TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes); // t1 can't reuse any memory, as n0 requires both t0 and t1. - TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[1].data.uint8 - start); // t2 can't reuse any memory, as n1 requires both t0 and t2. Also n2 requires // both t1 and t2. 
- TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start); // t3 reuses the same memory from t0 as t0 is not an input to any node. TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start); } From 7737c6222fafdb6425d0d2395a1be5c6f4205780 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Wed, 11 Mar 2020 12:49:23 +0100 Subject: [PATCH 0029/1390] Testcases for offline planned tensors Added testcases. Expanded ModelBuilder for have metadata support. Change-Id: I7e8bf3b3537d126086aef52bb3d6fe572aa8e7a0 --- tensorflow/lite/micro/micro_allocator_test.cc | 183 ++++++++++++++++++ tensorflow/lite/micro/test_helpers.cc | 106 +++++++++- tensorflow/lite/micro/test_helpers.h | 13 ++ 3 files changed, 292 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 765a447c044..69fb82910b0 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -227,4 +227,187 @@ TF_LITE_MICRO_TEST(TestFinishComplexTensorAllocation) { tflite::testing::EnsureUniqueVariableTensorBuffer(&context, 7); } +TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { + int version = 1; + int subgraph = 0; + int nbr_tensors = 4; + const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + + nbr_tensors] = {version, subgraph, + nbr_tensors, // header + // memory offsets: + -1, -1, -1, -1}; + + // The structure is identical to the one in + // TestAllocationForModelsWithBranches + std::vector node_list = { + { + {0}, // input + {1} // output + }, + { + {0}, // input + {2} // output + }, + { + {1, 2}, // input1, input2 + {3} // output + }}; + + const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( + nbr_tensors, metadata_buffer, node_list); + + TfLiteContext context; + constexpr size_t arena_size = 4096; + uint8_t arena[arena_size]; + tflite::MicroAllocator allocator(&context, model, arena, arena_size, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + + // Since all of the tensors are online planned and the model structure is + // identical to that in TestAllocationForModelsWithBranches, + // the offsets be should identical to that test. 
+ uint8_t* start = context.tensors[0].data.uint8; + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes); + TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start); +} + +TF_LITE_MICRO_TEST(OfflinePlannerBasic) { + int nbr_tensors = 4; + const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + + nbr_tensors] = {1, 0, nbr_tensors, + 0, // t0 + 48, // t1 + 0, // t2 + 48}; // t3 + + int t0 = 0; + int t1 = 1; + int t2 = 2; + int t3 = 3; + + std::vector node_list = {{ + {t0}, // input + {t1} // output + }, + { + {t1}, // input + {t2} // output + }, + { + {t2}, // input + {t3} // output + }}; + + const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( + nbr_tensors, metadata_buffer, node_list); + + TfLiteContext context; + constexpr size_t arena_size = 4096; + uint8_t arena[arena_size]; + tflite::MicroAllocator allocator(&context, model, arena, arena_size, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + + uint8_t* start = context.tensors[0].data.uint8; + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[3].data.uint8 - start); +} + +TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { + int nbr_tensors = 4; + const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + + nbr_tensors] = { + 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors + // memory offsets: + 0, // t0 + 0, // t1 + 48, // t2 + -1}; // t3 + + int t0 = 0; + int t1 = 1; + int t2 = 2; + int t3 = 3; + + std::vector node_list = { + { + {t0, t1}, // input, scratch + {t2} // output + }, + { + {t2}, // input + {t3} // output + }, + }; + + const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( + nbr_tensors, metadata_buffer, node_list); + + TfLiteContext context; + constexpr size_t arena_size = 4096; + uint8_t arena[arena_size]; + tflite::MicroAllocator allocator(&context, model, arena, arena_size, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + + uint8_t* start = context.tensors[0].data.uint8; + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes); +} + +TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { + int nbr_tensors = 5; + const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + + nbr_tensors] = { + 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors + // memory offsets: + 0, // t0 + 48, // t1 + -1, // t2 + 0, // t3 + -1}; // t4 + + int t0 = 0; + int t1 = 1; + int t2 = 2; + int t3 = 3; + int t4 = 4; + + std::vector node_list = { + { + {t0, t1}, // input, scratch + {t2}, // output + }, + { + {t2}, // input + {t3, t4}, // output1, output2 + }, + }; + + const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( + nbr_tensors, metadata_buffer, node_list); + + TfLiteContext context; + constexpr size_t arena_size = 
4096; + uint8_t arena[arena_size]; + tflite::MicroAllocator allocator(&context, model, arena, arena_size, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + + uint8_t* start = context.tensors[0].data.uint8; + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[4].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start); +} + TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 77a1cc82f3b..b39d3b2916f 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -48,7 +48,7 @@ class StackAllocator : public flatbuffers::Allocator { return *inst; } - static constexpr size_t kStackAllocatorSize = 4096; + static constexpr size_t kStackAllocatorSize = 8192; private: uint8_t data_backing_[kStackAllocatorSize]; @@ -94,6 +94,10 @@ class ModelBuilder { Node AddNode(Operator op, std::initializer_list inputs, std::initializer_list outputs); + void AddMetadata(const char* description_string, + const int32_t* metadata_buffer_data, + size_t num_elements); + // Constructs the flatbuffer model using `builder_` and return a pointer to // it. The returned model has the same lifetime as `builder_`. const Model* BuildModel(std::initializer_list inputs, @@ -116,6 +120,16 @@ class ModelBuilder { static constexpr int kMaxTensors = 50; flatbuffers::Offset tensors_[kMaxTensors]; + + static constexpr int kMaxMetadataBuffers = 10; + + static constexpr int kMaxMetadatas = 10; + flatbuffers::Offset metadata_[kMaxMetadatas]; + + flatbuffers::Offset metadata_buffers_[kMaxMetadataBuffers]; + + int nbr_of_metadata_buffers_ = 0; + int next_tensor_id_ = 0; }; @@ -142,13 +156,34 @@ ModelBuilder::Node ModelBuilder::AddNode( return next_operator_id_ - 1; } +void ModelBuilder::AddMetadata(const char* description_string, + const int32_t* metadata_buffer_data, + size_t num_elements) { + metadata_[ModelBuilder::nbr_of_metadata_buffers_] = + CreateMetadata(*builder_, + builder_->CreateString(description_string), + 1 + ModelBuilder::nbr_of_metadata_buffers_); + + metadata_buffers_[nbr_of_metadata_buffers_] = tflite::CreateBuffer(*builder_, + builder_->CreateVector((uint8_t*)metadata_buffer_data, + sizeof(uint32_t) * num_elements)); + + ModelBuilder::nbr_of_metadata_buffers_++; +} + const Model* ModelBuilder::BuildModel( std::initializer_list inputs, std::initializer_list outputs) { // Model schema requires an empty buffer at idx 0. - constexpr size_t kBufferSize = 1; - const flatbuffers::Offset buffers[kBufferSize] = { - tflite::CreateBuffer(*builder_)}; + size_t kBufferSize = 1 + ModelBuilder::nbr_of_metadata_buffers_; + flatbuffers::Offset buffers[kBufferSize]; + buffers[0] = tflite::CreateBuffer(*builder_); + + // Place the metadata buffers first in the buffer since the indices for them + // have already been set in AddMetadata() + for (int i = 1; i < ModelBuilder::nbr_of_metadata_buffers_ + 1; ++i) { + buffers[i] = metadata_buffers_[i - 1]; + } // TFLM only supports single subgraph. 
constexpr size_t subgraphs_size = 1; @@ -159,12 +194,26 @@ const Model* ModelBuilder::BuildModel( builder_->CreateVector(outputs.begin(), outputs.size()), builder_->CreateVector(operators_, next_operator_id_), builder_->CreateString("test_subgraph"))}; - const flatbuffers::Offset model_offset = tflite::CreateModel( - *builder_, 0, - builder_->CreateVector(operator_codes_, next_operator_code_id_), - builder_->CreateVector(subgraphs, subgraphs_size), - builder_->CreateString("teset_model"), - builder_->CreateVector(buffers, kBufferSize)); + + flatbuffers::Offset model_offset; + if (ModelBuilder::nbr_of_metadata_buffers_ > 0) { + model_offset = tflite::CreateModel( + *builder_, 0, + builder_->CreateVector(operator_codes_, next_operator_code_id_), + builder_->CreateVector(subgraphs, subgraphs_size), + builder_->CreateString("teset_model"), + builder_->CreateVector(buffers, kBufferSize), + 0, + builder_->CreateVector(metadata_, ModelBuilder::nbr_of_metadata_buffers_)); + } else { + model_offset = tflite::CreateModel( + *builder_, 0, + builder_->CreateVector(operator_codes_, next_operator_code_id_), + builder_->CreateVector(subgraphs, subgraphs_size), + builder_->CreateString("teset_model"), + builder_->CreateVector(buffers, kBufferSize)); + } + tflite::FinishModelBuffer(*builder_, model_offset); void* model_pointer = builder_->GetBufferPointer(); const Model* model = flatbuffers::GetRoot(model_pointer); @@ -243,6 +292,35 @@ const Model* BuildSimpleModelWithBranch() { return model_builder.BuildModel({t0}, {t3}); } +const Model* BuildModelWithOfflinePlanning(int number_of_tensors, + const int32_t* metadata_buffer, + std::vector node_conn) { + using flatbuffers::Offset; + flatbuffers::FlatBufferBuilder* fb_builder = BuilderInstance(); + + ModelBuilder model_builder(fb_builder); + + const int op_id = + model_builder.RegisterOp(BuiltinOperator_CUSTOM, "mock_custom", + /* version= */ 0); + + int tensors[number_of_tensors]; + + for (int i = 0; i < number_of_tensors; ++i) { + tensors[i] = model_builder.AddTensor(TensorType_FLOAT32, {2, 2, 3}); + } + + for (int i = 0; i < node_conn.size(); i++) { + model_builder.AddNode(op_id, node_conn[i].input, node_conn[i].output); + } + + model_builder.AddMetadata("OfflineMemoryAllocation", + metadata_buffer, number_of_tensors + tflite::testing::kOfflinePlannerHeaderSize); + + return model_builder.BuildModel(node_conn[0].input, + node_conn[node_conn.size() - 1].output); +} + const Model* BuildSimpleMockModel() { using flatbuffers::Offset; flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); @@ -496,6 +574,14 @@ const Model* GetSimpleModelWithBranch() { return model; } +const Model* GetModelWithOfflinePlanning(int num_tensors, + const int32_t* metadata_buffer, + std::vector node_conn) { + const Model* model = + BuildModelWithOfflinePlanning(num_tensors, metadata_buffer, node_conn); + return model; +} + const Model* GetSimpleStatefulModel() { static Model* model = nullptr; if (!model) { diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index f4e7fa8dfba..26aeeb086ef 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -27,6 +27,14 @@ limitations under the License. namespace tflite { namespace testing { +constexpr int kOfflinePlannerHeaderSize = 3; + +struct NodeConnection_ { + std::initializer_list input; + std::initializer_list output; +}; +typedef struct NodeConnection_ NodeConnection; + // Returns a simple example flatbuffer TensorFlow Lite model. 
Contains 1 input, // 1 layer of weights, 1 output Tensor, and 1 operator. const Model* GetSimpleMockModel(); @@ -38,6 +46,11 @@ const Model* GetComplexMockModel(); // Returns a simple flatbuffer model with two branches. const Model* GetSimpleModelWithBranch(); +// Returns a simple flatbuffer model with offline planned tensors +const Model* GetModelWithOfflinePlanning(int num_tensors, + const int32_t* metadata_buffer, + std::vector node_conn); + // Returns a flatbuffer model with `simple_stateful_op` const Model* GetSimpleStatefulModel(); From b8571e365d2e907f66693551b488ce2f72e3507b Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Thu, 2 Apr 2020 14:31:17 +0200 Subject: [PATCH 0030/1390] Fix compile error when building for ARM Cortex-M4 etc. --- tensorflow/lite/micro/micro_allocator.cc | 12 ++++++------ tensorflow/lite/micro/test_helpers.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index a9324eb35cd..fdfeb9d409c 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -37,7 +37,7 @@ struct AllocationInfo { int last_used; bool needs_allocating; void** output_ptr; - int offline_offset; + int32_t offline_offset; }; // We align tensor buffers to 16-byte boundaries, since this is a common @@ -206,10 +206,10 @@ class AllocationInfoBuilder { // - If there's metadata available, offline_planner_offsets will point to the // first offset in the metadata buffer list. TfLiteStatus GetOfflinePlannedOffsets(const Model* model, - int** offline_planner_offsets); + int32_t** offline_planner_offsets); // Add allocaiton information for the tensors. - TfLiteStatus AddTensors(const SubGraph* subgraph, int* offline_offsets, + TfLiteStatus AddTensors(const SubGraph* subgraph, int32_t* offline_offsets, TfLiteTensor* runtime_tensors); // Add allocation information for the scratch buffers. @@ -245,7 +245,7 @@ TfLiteStatus AllocationInfoBuilder::Allocate() { } TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, - int* offline_offsets, + int32_t* offline_offsets, TfLiteTensor* runtime_tensors) { // Set up allocation info for all tensors. 
for (size_t i = 0; i < tensor_count_; ++i) { @@ -337,7 +337,7 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, // | 4 | Arena byte offset of tensor #1 or -1 to allocate at runtime | // | 3+(n-1) | Arena byte offset of tensor #(n-1) or -1 to allocate at runtime | TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( - const Model* model, int** offline_planner_offsets) { + const Model* model, int32_t** offline_planner_offsets) { if (model->metadata()) { for (int i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); @@ -702,7 +702,7 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { AllocationInfoBuilder builder(error_reporter_, &tmp_allocator); TF_LITE_ENSURE_STATUS( builder.Init(tensors_->size(), scratch_buffer_count_)); - int* offline_planner_offsets = nullptr; + int32_t* offline_planner_offsets = nullptr; TF_LITE_ENSURE_STATUS( builder.GetOfflinePlannedOffsets(model_, &offline_planner_offsets)); TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph_, offline_planner_offsets, diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index 26aeeb086ef..81416e06dcc 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -30,8 +30,8 @@ namespace testing { constexpr int kOfflinePlannerHeaderSize = 3; struct NodeConnection_ { - std::initializer_list input; - std::initializer_list output; + std::initializer_list input; + std::initializer_list output; }; typedef struct NodeConnection_ NodeConnection; From 26e4ac1d76b43d9cf5288c914985e42fd0bfdbbf Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 6 Apr 2020 16:50:17 +0000 Subject: [PATCH 0031/1390] Update tensorflow/python/ops/array_ops.py --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 9367374717e..fc29aafb7de 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3507,7 +3507,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): For example: Given the following input, - * `hypothesis` is a `tf.SparseTensor` of shape `(2, 1, 1)` + * `hypothesis` is a `tf.SparseTensor` of shape `[2, 1, 1]` * `truth` is a `tf.SparseTensor` of shape `(2, 2, 2)` >>> hypothesis = tf.SparseTensor( From 05edf7e11bb3d91f0e2b75e67914c2379cf0cd89 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 6 Apr 2020 16:50:26 +0000 Subject: [PATCH 0032/1390] Update tensorflow/python/ops/array_ops.py --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fc29aafb7de..004bbeac7f6 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3527,7 +3527,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): array([[inf, 1. ], [0.5, 1. ]], dtype=float32)> - The operaton returns a dense Tensor of shape `(2, 2)` with + The operaton returns a dense Tensor of shape `[2, 2]` with edit distances normalized by `truth` lengths. 
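
A minimal, self-contained sketch of the `normalize` behaviour described above, using made-up two-token inputs rather than the docstring's own example (all values below are illustrative only):

```python
import tensorflow as tf

# Made-up inputs: one hypothesis sequence [1] and one truth sequence [1, 2]
# for a single batch element, encoded as rank-3 sparse tensors.
hypothesis = tf.SparseTensor(
    indices=[[0, 0, 0]],
    values=tf.constant([1], dtype=tf.int64),
    dense_shape=[1, 1, 1])
truth = tf.SparseTensor(
    indices=[[0, 0, 0], [0, 0, 1]],
    values=tf.constant([1, 2], dtype=tf.int64),
    dense_shape=[1, 1, 2])

# Raw edit distance: one insertion turns [1] into [1, 2].
print(tf.edit_distance(hypothesis, truth, normalize=False))  # [[1.]]
# Normalized: the raw distance is divided by the truth length (2).
print(tf.edit_distance(hypothesis, truth, normalize=True))   # [[0.5]]
```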
**Note**: It is possible to calculate edit distance between two From 28769b2b746570883ae8a57e042f50c4defba2b5 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 6 Apr 2020 16:50:35 +0000 Subject: [PATCH 0033/1390] Update tensorflow/python/ops/array_ops.py --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 004bbeac7f6..e508165a820 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3508,7 +3508,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): Given the following input, * `hypothesis` is a `tf.SparseTensor` of shape `[2, 1, 1]` - * `truth` is a `tf.SparseTensor` of shape `(2, 2, 2)` + * `truth` is a `tf.SparseTensor` of shape `[2, 2, 2]` >>> hypothesis = tf.SparseTensor( ... [[0, 0, 0], From 2759cdca671dee4b7a2035710cf08725a97ce73c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 7 Apr 2020 16:25:36 +0100 Subject: [PATCH 0034/1390] Fix after merging with master. --- tensorflow/lite/python/optimize/calibrator.py | 4 ++-- tensorflow/lite/python/optimize/calibrator_test.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index e31983b834e..90c43fcddfa 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -60,8 +60,8 @@ class Calibrator(object): input_type, output_type, allow_float, - resize_input=True, - activations_type = lite_constants.INT8): + activations_type=lite_constants.INT8, + resize_input=True): """Calibrates the model with specified generator and then quantizes it. The input shapes of the calibrator are resized with the calibration data if diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index f8a1171a629..f778c8a555d 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -148,7 +148,9 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaisesRegex(ValueError, 'Size mismatch'): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, False) + constants.FLOAT, False, + constants.INT8, + False) def test_invalid_type_calibrator_gen(self): model_path = resource_loader.get_path_to_datafile( From cc08b5dff7346b2e2cd9fb80409c8f09bfebc089 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Tue, 7 Apr 2020 11:20:02 +0200 Subject: [PATCH 0035/1390] Update network_tester. 
New features: Multiple inputs Multiple outputs Output in json format Can call invoke() more than once Updated README --- .../examples/network_tester/Makefile.inc | 4 + .../micro/examples/network_tester/README.md | 34 ++++++- .../network_tester/expected_output_data.h | 2 +- .../examples/network_tester/input_data.h | 4 +- .../examples/network_tester/network_model.h | 5 +- .../network_tester/network_tester_test.cc | 88 +++++++++++++------ 6 files changed, 104 insertions(+), 33 deletions(-) diff --git a/tensorflow/lite/micro/examples/network_tester/Makefile.inc b/tensorflow/lite/micro/examples/network_tester/Makefile.inc index 27f54a66763..a5c911238c8 100644 --- a/tensorflow/lite/micro/examples/network_tester/Makefile.inc +++ b/tensorflow/lite/micro/examples/network_tester/Makefile.inc @@ -33,6 +33,10 @@ ifeq ($(COMPARE_OUTPUT_DATA),no) CXXFLAGS += -DNO_COMPARE_OUTPUT_DATA endif +ifdef NUM_INFERENCES + CXXFLAGS += -DNUM_INFERENCES=$(NUM_INFERENCES) +endif + # Builds a standalone object recognition binary. $(eval $(call microlite_test,network_tester_test,\ $(NETWORK_TESTER_TEST_SRCS),$(NETWORK_TESTER_TEST_HDRS))) diff --git a/tensorflow/lite/micro/examples/network_tester/README.md b/tensorflow/lite/micro/examples/network_tester/README.md index 7c4c48e4eb1..0cb709dce0a 100644 --- a/tensorflow/lite/micro/examples/network_tester/README.md +++ b/tensorflow/lite/micro/examples/network_tester/README.md @@ -34,8 +34,40 @@ make -f tensorflow/lite/micro/tools/make/Makefile network_tester_test \ `ARENA_SIZE`: The size of the memory to be allocated (in bytes) by the interpreter. \ `NUM_BYTES_TO_PRINT`: The number of bytes of the output data to print. \ -Defaults to 0 if not specified. \ +If set to 0, all bytes of the output are printed. \ `COMPARE_OUTPUT_DATA`: If set to "no" the output data is not compared to the expected output data. This could be useful e.g. if the execution time needs to be minimized, or there is no expected output data. If omitted, the output data is compared to the expected output. +`NUM_INFERENCES`: Define how many inferences that are made. Defaults to 1. \ + +The output is printed in JSON format using printf: +``` +num_of_outputs: 1 +output_begin +[ +{ +"dims": [4,1,2,2,1], +"data_address": "0x000000", +"data":"0x06,0x08,0x0e,0x10" +}] +output_end +``` + +If there are multiple output tensors, the output will look like this: +``` +num_of_outputs: 2 +output_begin +[ +{ +"dims": [4,1,2,2,1], +"data_address": "0x000000", +"data":"0x06,0x08,0x0e,0x10" +}, +{ +"dims": [4,1,2,2,1], +"data_address": "0x111111", +"data":"0x06,0x08,0x0e,0x10" +}] +output_end +``` diff --git a/tensorflow/lite/micro/examples/network_tester/expected_output_data.h b/tensorflow/lite/micro/examples/network_tester/expected_output_data.h index 03e21954b7f..934722bad94 100644 --- a/tensorflow/lite/micro/examples/network_tester/expected_output_data.h +++ b/tensorflow/lite/micro/examples/network_tester/expected_output_data.h @@ -17,6 +17,6 @@ limitations under the License. 
#define TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_ static unsigned int expected_output_data_len = 4; -static unsigned char expected_output_data[] = {6, 8, 14, 16}; +static unsigned char expected_output_data[1][4] = {6, 8, 14, 16}; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_ diff --git a/tensorflow/lite/micro/examples/network_tester/input_data.h b/tensorflow/lite/micro/examples/network_tester/input_data.h index b47277cca93..b3710313dd2 100644 --- a/tensorflow/lite/micro/examples/network_tester/input_data.h +++ b/tensorflow/lite/micro/examples/network_tester/input_data.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_INPUT_DATA_H_ static const int input_data_len = 16; -static const unsigned char input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16}; +static const unsigned char input_data[1][16] = {{1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}}; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_INPUT_DATA_H_ diff --git a/tensorflow/lite/micro/examples/network_tester/network_model.h b/tensorflow/lite/micro/examples/network_tester/network_model.h index 4c275dbfbba..0431d7deee7 100644 --- a/tensorflow/lite/micro/examples/network_tester/network_model.h +++ b/tensorflow/lite/micro/examples/network_tester/network_model.h @@ -1,8 +1,11 @@ /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -64,4 +67,4 @@ const unsigned char network_model[] = { 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11}; const unsigned int network_model_len = 576; -#endif // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_NETWORK_MODEL_H_ +#endif diff --git a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc index 0650222b970..5a307fb5c2a 100644 --- a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc +++ b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc @@ -1,8 +1,11 @@ /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -10,44 +13,54 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/lite/micro/examples/network_tester/expected_output_data.h" -#include "tensorflow/lite/micro/examples/network_tester/input_data.h" -#include "tensorflow/lite/micro/examples/network_tester/network_model.h" #include "tensorflow/lite/micro/kernels/all_ops_resolver.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" -#include "tensorflow/lite/micro/testing/micro_test.h" -#include "tensorflow/lite/micro/testing/test_utils.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" +#include "tensorflow/lite/micro/examples/network_tester/expected_output_data.h" +#include "tensorflow/lite/micro/examples/network_tester/input_data.h" +#include "tensorflow/lite/micro/examples/network_tester/network_model.h" + +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + #ifndef TENSOR_ARENA_SIZE #define TENSOR_ARENA_SIZE (1024) #endif +#ifndef NUM_INFERENCES +#define NUM_INFERENCES 1 +#endif + uint8_t tensor_arena[TENSOR_ARENA_SIZE]; #ifdef NUM_BYTES_TO_PRINT inline void print_output_data(TfLiteTensor* output) { int num_bytes_to_print = - (output->bytes < NUM_BYTES_TO_PRINT) ? output->bytes : NUM_BYTES_TO_PRINT; + ((output->bytes < NUM_BYTES_TO_PRINT) || NUM_BYTES_TO_PRINT == 0) + ? output->bytes + : NUM_BYTES_TO_PRINT; int dims_size = output->dims->size; - printf("dims: {%d,", dims_size); + printf("{\n"); + printf("\"dims\": [%d,", dims_size); for (int i = 0; i < output->dims->size - 1; ++i) { printf("%d,", output->dims->data[i]); } - printf("%d}\n", output->dims->data[dims_size - 1]); + printf("%d],\n", output->dims->data[dims_size - 1]); - printf("data_address: %p\n", output->data.raw); - printf("data:\n{"); + printf("\"data_address\": \"%p\",\n", output->data.raw); + printf("\"data\":\""); for (int i = 0; i < num_bytes_to_print - 1; ++i) { - if (i % 16 == 0) { + if (i % 16 == 0 && i != 0) { printf("\n"); } printf("0x%02x,", output->data.uint8[i]); } - printf("0x%02x\n}\n", output->data.uint8[num_bytes_to_print - 1]); + printf("0x%02x\"\n", output->data.uint8[num_bytes_to_print - 1]); + printf("}"); } #endif @@ -63,7 +76,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { "Model provided is schema version %d not equal " "to supported version %d.\n", model->version(), TFLITE_SCHEMA_VERSION); - return 1; + return kTfLiteError; } tflite::ops::micro::AllOpsResolver resolver; @@ -74,29 +87,48 @@ TF_LITE_MICRO_TEST(TestInvoke) { TfLiteStatus allocate_status = interpreter.AllocateTensors(); if (allocate_status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter, "Tensor allocation failed\n"); + return kTfLiteError; } - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocate_status); - TfLiteTensor* input = interpreter.input(0); - memcpy(input->data.uint8, input_data, input->bytes); - - TfLiteStatus invoke_status = interpreter.Invoke(); - if (invoke_status != kTfLiteOk) { - TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed\n"); - } - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status); - - TfLiteTensor* output = interpreter.output(0); + for (int n = 0; n < NUM_INFERENCES; n++) { + for (int i = 0; i < interpreter.inputs_size(); ++i) { + TfLiteTensor* input = interpreter.input(i); + memcpy(input->data.uint8, input_data[i], input->bytes); + } + TfLiteStatus invoke_status = interpreter.Invoke(); + if (invoke_status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed\n"); + 
return kTfLiteError; + } + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status); #ifdef NUM_BYTES_TO_PRINT - print_output_data(output); + // Print all of the output data, or the first NUM_BYTES_TO_PRINT bytes, + // whichever comes first as well as the output shape. + printf("num_of_outputs: %d\n", interpreter.outputs_size()); + printf("output_begin\n"); + printf("[\n"); + for (int i = 0; i < interpreter.outputs_size(); i++) { + TfLiteTensor* output = interpreter.output(i); + print_output_data(output); + if (i != interpreter.outputs_size() - 1) { + printf(",\n"); + } + } + printf("]\n"); + printf("output_end\n"); #endif #ifndef NO_COMPARE_OUTPUT_DATA - for (int i = 0; i < output->bytes; ++i) { - TF_LITE_MICRO_EXPECT_EQ(output->data.uint8[i], expected_output_data[i]); - } + for (int i = 0; i < interpreter.outputs_size(); i++) { + TfLiteTensor* output = interpreter.output(i); + for (int j = 0; j < output->bytes; ++j) { + TF_LITE_MICRO_EXPECT_EQ(output->data.uint8[j], + expected_output_data[i][j]); + } + } #endif + } TF_LITE_REPORT_ERROR(error_reporter, "Ran successfully\n"); } From 43036088fc3cfda3bd8d9c3a5df114d7d393618a Mon Sep 17 00:00:00 2001 From: Peng Meng Date: Mon, 13 Apr 2020 16:36:59 +0800 Subject: [PATCH 0036/1390] add unit test --- .../core/grappler/optimizers/remapper.cc | 6 +- .../core/grappler/optimizers/remapper_test.cc | 70 +++++++++++++++++++ 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index fdfb8e379b2..239002c9da3 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1476,8 +1476,8 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return true; }; - const auto is_conv2d_candidate = [&]() -> bool { - if (!IsConv2D(*node_def)) return false; + const auto is_relu_candidate = [&]() -> bool { + if (!IsRelu(*node_def)) return false; if (GetDataTypeFromAttr(*node_def, "T") != DT_FLOAT) return false; return true; @@ -1513,7 +1513,7 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return false; }; - return is_conv2d_candidate() || is_batch_norm_candidate() || + return is_relu_candidate() || is_batch_norm_candidate() || is_batch_norm_fusion_candidate(); } diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 35e09b28205..831aa51ee72 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -449,6 +449,76 @@ TEST_F(RemapperTest, FuseMatMulWithBias) { test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } +TEST_F(RemapperTest, FuseConv2DWithBiasAndActivationOnGPU) { + using ::tensorflow::ops::Placeholder; + + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + auto input_shape = Placeholder::Shape({8, 32, 32, 3}); + auto filter_shape = Placeholder::Shape({3, 3, 3, 128}); + auto bias_shape = Placeholder::Shape({128}); + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape); + auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape); + auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape); + + std::vector strides = {1, 1, 1, 1}; + auto conv = + ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME"); + auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias); + + ops::Identity fetch = [&]() -> ops::Identity { + auto activate = 
s.WithOpName("activation"); + auto fetch = s.WithOpName("fetch"); + return ops::Identity(fetch, ops::Relu(activate, bias_add)); + }(); + + auto input_t = GenerateRandomTensor({8, 32, 32, 3}); + auto filter_t = GenerateRandomTensor({3, 3, 3, 128}); + auto bias_t = GenerateRandomTensor({128}); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on GPU. + for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:GPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape + //Remapper optimizer(RewriterConfig::ON); + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "activation") { + EXPECT_EQ(node.op(), "_FusedConv2D"); + ASSERT_GE(node.input_size(), 3); + EXPECT_EQ(node.input(0), "input"); + EXPECT_EQ(node.input(1), "filter"); + + EXPECT_EQ(node.attr().at("num_args").i(), 1); + EXPECT_EQ(node.input(2), "bias"); + + const auto fused_ops = node.attr().at("fused_ops").list().s(); + ASSERT_EQ(fused_ops.size(), 2); + EXPECT_EQ(fused_ops[0], "BiasAdd"); + EXPECT_EQ(fused_ops[1], "Relu"); + found++; + } + } + EXPECT_EQ(found, 1); + + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); + ASSERT_EQ(tensors_expected.size(), 1); + auto tensors = EvaluateNodes(output, item.fetch, item.feed); + ASSERT_EQ(tensors.size(), 1); + test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); +} + TEST_F(RemapperTest, FuseConv2DWithBiasAndActivation) { using ::tensorflow::ops::Placeholder; From 8b791cbf71aa68a91c69080b4313f115cb60fbf5 Mon Sep 17 00:00:00 2001 From: Peng Meng Date: Mon, 13 Apr 2020 17:36:16 +0800 Subject: [PATCH 0037/1390] optimize shape infer condition --- .../core/grappler/optimizers/remapper.cc | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 239002c9da3..427a3f14aca 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1461,6 +1461,7 @@ Status AddBatchNormNodes(RemapperContext* ctx, const FusedBatchNorm& matched) { // shapes: // (1) Splitting FusedBatchNorm into primitives. // (2) Fusing side input and/or activation into FusedBatchNorm. +// (3) Fusing Conv2D biasadd and relu on GPU bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { // Candidate for a FusedBatchNorm splitting. 
const auto* node_view = ctx.graph_view.GetNode(node_index); @@ -1476,10 +1477,30 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return true; }; - const auto is_relu_candidate = [&]() -> bool { + const auto is_relu_biasadd_conv2d_candidate = [&]() -> bool { if (!IsRelu(*node_def)) return false; if (GetDataTypeFromAttr(*node_def, "T") != DT_FLOAT) return false; + if (node_view->NumRegularFanins() < 1) return false; + const auto& relu_fanin_0 = node_view->GetRegularFanin(0); + const auto* relu_fanin_0_node_view = relu_fanin_0.node_view(); + const auto* relu_fanin_0_node_def = relu_fanin_0_node_view->node(); + + if (!IsBiasAdd(*relu_fanin_0_node_def)) return false; + if (GetDataTypeFromAttr(*relu_fanin_0_node_def, "T") != DT_FLOAT) + return false; + + if (relu_fanin_0_node_view->NumRegularFanins() < 1) return false; + + const auto& biasadd_fanin_0 = + relu_fanin_0_node_view->GetRegularFanin(0); + const auto* biasadd_fanin_0_node_def = + biasadd_fanin_0.node_view()->node(); + + if (!IsConv2D(*biasadd_fanin_0_node_def)) return false; + if (GetDataTypeFromAttr(*biasadd_fanin_0_node_def, "T") != DT_FLOAT) + return false; + return true; }; @@ -1513,7 +1534,7 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return false; }; - return is_relu_candidate() || is_batch_norm_candidate() || + return is_relu_biasadd_conv2d_candidate() || is_batch_norm_candidate() || is_batch_norm_fusion_candidate(); } From 9502277aa4a5e7219bc967a5016219fd970fff2f Mon Sep 17 00:00:00 2001 From: Biagio Montaruli Date: Tue, 21 Apr 2020 15:59:34 +0200 Subject: [PATCH 0038/1390] Update README.md of micro_speech example * Fix link to 'Animation on Arduino' * Substituite 'sample' with 'example' in the documentation --- .../lite/micro/examples/micro_speech/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/README.md b/tensorflow/lite/micro/examples/micro_speech/README.md index 7ccaa806366..2a1898d25fb 100644 --- a/tensorflow/lite/micro/examples/micro_speech/README.md +++ b/tensorflow/lite/micro/examples/micro_speech/README.md @@ -7,7 +7,7 @@ The application listens to its surroundings with a microphone and indicates when it has detected a word by lighting an LED or displaying data on a screen, depending on the capabilities of the device. -![Animation on Arduino](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/hello_world/images/animation_on_arduino.gif) +![Animation on Arduino](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/images/animation_on_arduino.gif) The code has a small footprint (for example, around 22 kilobytes on a Cortex M3) and only uses about 10 kilobytes of RAM for working memory, so it's able to @@ -27,10 +27,10 @@ kilobytes of Flash. ## Deploy to Arduino -The following instructions will help you build and deploy this sample +The following instructions will help you build and deploy this example to [Arduino](https://www.arduino.cc/) devices. -The sample has been tested with the following devices: +The example has been tested with the following devices: - [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) @@ -84,11 +84,11 @@ If you don't see any output, repeat the process again. 
## Deploy to ESP32 -The following instructions will help you build and deploy this sample to +The following instructions will help you build and deploy this example to [ESP32](https://www.espressif.com/en/products/hardware/esp32/overview) devices using the [ESP IDF](https://github.com/espressif/esp-idf). -The sample has been tested on ESP-IDF version 4.0 with the following devices: - +The example has been tested on ESP-IDF version 4.0 with the following devices: - [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html) - [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md) @@ -139,7 +139,7 @@ monitor` ## Deploy to SparkFun Edge -The following instructions will help you build and deploy this sample on the +The following instructions will help you build and deploy this example on the [SparkFun Edge development board](https://sparkfun.com/products/15170). The program will toggle the blue LED on and off with each inference. It will @@ -288,7 +288,7 @@ followed by the `K` key, then hit the `Y` key. ## Deploy to STM32F746 -The following instructions will help you build and deploy the sample to the +The following instructions will help you build and deploy the example to the [STM32F7 discovery kit](https://os.mbed.com/platforms/ST-Discovery-F746NG/) using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). @@ -392,7 +392,7 @@ followed by the `K` key, then hit the `Y` key. ## Deploy to NXP FRDM K66F -The following instructions will help you build and deploy the sample to the +The following instructions will help you build and deploy the example to the [NXP FRDM K66F](https://www.nxp.com/design/development-boards/freedom-development-boards/mcu-boards/freedom-development-platform-for-kinetis-k66-k65-and-k26-mcus:FRDM-K66F) using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). From 8bcb69a2fc5e82c7b2b8028cfb1c3cc9a89e59f6 Mon Sep 17 00:00:00 2001 From: Biagio Montaruli Date: Tue, 21 Apr 2020 16:18:16 +0200 Subject: [PATCH 0039/1390] Update training documentation related to micro_speech example * Update link in the 'Overview' section * Update table in the 'Trained Models' section * Update spacing --- .../examples/micro_speech/train/README.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/train/README.md b/tensorflow/lite/micro/examples/micro_speech/train/README.md index 5793985a6e0..8228a71970f 100644 --- a/tensorflow/lite/micro/examples/micro_speech/train/README.md +++ b/tensorflow/lite/micro/examples/micro_speech/train/README.md @@ -34,14 +34,13 @@ go ## Overview -1. Training Jupyter Notebook: [`train_micro_speech_model.ipynb`] -(https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb) -. The training scripts used in this notebook is defined the +1. Training Jupyter Notebook: [`train_micro_speech_model.ipynb`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb). +The training scripts used in this notebook is defined the [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) tutorial. 2. Dataset Type: **Speech** -3. Dataset: Speech Commands, Version 2. ([Download Link](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz) -, [Paper](https://arxiv.org/abs/1804.03209)) +3. 
Dataset: Speech Commands, Version 2. ([Download Link](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz), +[Paper](https://arxiv.org/abs/1804.03209)) 4. Deep Learning Framework: **TensorFlow 1.5** 5. Language: **Python 3.7** 6. Model Size: **<20 kB** @@ -60,10 +59,8 @@ includes the following 3 model files: | Name | Format | Target Framework | Target Device | | :------------- |:-------------|:-------------|-----| | `model.pb` | Frozen GraphDef | TensorFlow | Large-Scale/Cloud/Servers | -| `model.tflite` *(<20 kB)* | Fully Quantized* TFLite Model | -TensorFlow Lite | Mobile Devices| -| `model.cc` | C Source File | TensorFlow Lite for Microcontrollers | -Microcontrollers | +| `model.tflite` *(<20 kB)* | Fully Quantized* TFLite Model | TensorFlow Lite | Mobile Devices| +| `model.cc` | C Source File | TensorFlow Lite for Microcontrollers | Microcontrollers | **Fully quantized implies that the model is **strictly int8** quantized including the input(s)and output(s).* @@ -154,8 +151,8 @@ simpler model for accurate results. ## Dataset -The Speech Commands Dataset. ([Download Link](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz) -,[Paper](https://arxiv.org/abs/1804.03209)) consists of over 105,000 WAVE audio +The Speech Commands Dataset. ([Download Link](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz), +[Paper](https://arxiv.org/abs/1804.03209)) consists of over 105,000 WAVE audio files of people saying thirty different words. This data was collected by Google and released under a CC BY license. You can help improve it by contributing five minutes of your own voice. The archive is over 2GB, so this From e9b965a29cda282bc5dd8b2bcbc7991bb623734e Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Fri, 24 Apr 2020 15:19:39 +0200 Subject: [PATCH 0040/1390] Fix review comments, 24/4 --- .../person_detection/person_detection_test.cc | 2 + .../memory_planner/greedy_memory_planner.h | 6 ++- tensorflow/lite/micro/micro_allocator.cc | 52 +++++-------------- .../lite/micro/micro_optional_debug_tools.cc | 31 +++++++++++ .../lite/micro/micro_optional_debug_tools.h | 3 ++ 5 files changed, 54 insertions(+), 40 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc index 51a61881ead..f57e6ea88f1 100644 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" +#include "tensorflow/lite/micro/micro_optional_debug_tools.h" #include "tensorflow/lite/micro/testing/micro_test.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" @@ -46,6 +47,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { "to supported version %d.\n", model->version(), TFLITE_SCHEMA_VERSION); } + PrintModelData(model, error_reporter); // Pull in only the operation implementations we need. // This relies on a complete list of all the ops needed by this graph. 
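
For context, a minimal sketch of the 32-bit payload carried by the `OfflineMemoryAllocation` metadata entry, following the header-plus-offsets layout documented for `GetOfflinePlannedOffsets` and exercised by the offline-planner tests above. The array name and offset values are illustrative only, not part of any file in these patches:

```c++
#include <cstdint>

// Hypothetical payload for a model with four tensors: a three-word header
// followed by one arena byte offset per tensor. kOnlinePlannedBuffer (-1)
// means the tensor is left to the online greedy planner.
constexpr int32_t kExampleOfflinePlan[] = {
    1,   // version (CheckOfflinePlannedOffsets and the tests expect 1)
    0,   // subgraph index (only subgraph 0 is supported)
    4,   // number of offsets that follow; must equal the model's tensor count
    0,   // tensor 0: arena byte offset 0
    48,  // tensor 1: arena byte offset 48
    -1,  // tensor 2: no offline offset, planned online at runtime
    0,   // tensor 3: arena byte offset 0 (offline offsets may deliberately alias)
};
```

In the flatbuffer, an array like this is stored in a model buffer that a metadata entry named "OfflineMemoryAllocation" points at via its buffer index.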
diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h index c849e57645c..19a36f342fd 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h @@ -144,8 +144,10 @@ class GreedyMemoryPlanner : public MemoryPlanner { int* buffer_sizes_sorted_; int* buffer_ids_sorted_; ListEntry* buffers_sorted_by_offset_; - int next_free_entry_; - int first_entry_index_; + int next_free_entry_; // Index of the next free entry of + // buffers_sorted_by_offset_ + int first_entry_index_; // Index of the first entry (smallest offset) of + // buffers_sorted_by_offset_ // Stores the outcome of the plan, the location of each buffer in the arena. int* buffer_offsets_; diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index daa7cbcb0c9..c57294f5745 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -45,6 +45,8 @@ struct AllocationInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; +constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation"; + class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) @@ -81,33 +83,6 @@ TfLiteStatus AllocateVariables( return kTfLiteOk; } -// Helper function to print model flatbuffer data. This function is not called -// by default. Hence it's not linked in to the final binary code. -void PrintModelData(const Model* model, ErrorReporter* error_reporter) { - auto* subgraphs = model->subgraphs(); - const SubGraph* subgraph = (*subgraphs)[0]; - const flatbuffers::Vector>* tensors = - subgraph->tensors(); - const flatbuffers::Vector>* buffers = - model->buffers(); - TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: ====="); - for (int i = 0; i < tensors->size(); ++i) { - const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i); - auto* quantization = flatbuffer_tensor.quantization(); - size_t type_size, tensor_size; - auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]; - auto* array = buffer->data(); - int array_size = 0; - if (array) { - array_size = array->size(); - } - BytesRequiredForTensor(flatbuffer_tensor, &tensor_size, &type_size, - error_reporter); - TF_LITE_REPORT_ERROR( - error_reporter, "Tensor index: %d arena tensor %d size %d", i, - !array_size && !flatbuffer_tensor.is_variable(), tensor_size); - } -} // Helper function to check flatbuffer metadata correctness. This function is // not called by default. Hence it's not linked in to the final binary code. 
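The new `kOfflineMemAllocMetadata` constant above replaces the repeated `"OfflineMemoryAllocation"` string literals that the following hunks touch. For illustration only, the lookup those hunks perform, pulled out into a standalone helper (the helper itself is not part of the patch):

```c++
#include <cstring>

#include "tensorflow/lite/schema/schema_generated.h"

constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation";

// Illustrative only: returns the metadata entry that carries the offline
// memory plan, or nullptr if the model has none.
const tflite::Metadata* FindOfflinePlanMetadata(const tflite::Model* model) {
  if (model->metadata() == nullptr) return nullptr;
  for (size_t i = 0; i < model->metadata()->size(); ++i) {
    const tflite::Metadata* metadata = model->metadata()->Get(i);
    if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
                strlen(kOfflineMemAllocMetadata)) == 0) {
      return metadata;
    }
  }
  return nullptr;
}
```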
@@ -116,8 +91,8 @@ TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, if (model->metadata()) { for (int i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); - if (strncmp(metadata->name()->c_str(), "OfflineMemoryAllocation", - strlen("OfflineMemoryAllocation")) == 0) { + if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata, + strlen(kOfflineMemAllocMetadata)) == 0) { auto* subgraphs = model->subgraphs(); const SubGraph* subgraph = (*subgraphs)[0]; const flatbuffers::Vector>* tensors = @@ -311,8 +286,9 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, // | name:string | “OfflineMemoryAllocation” | // | buffer:unit | Index of buffer containing memory allocation data | // -// The buffer contents for the memory allocation is a list of 32-bit integers of -// the following format: +// The buffer contents for the memory allocation is a list of 32-bit integers. +// The number of tensors, n, must be equal to the number of tensors defined in +// the model. The following encoding applies: // // | Offset | Value | // | 0 | Offline allocation format version – set to 0 | @@ -326,8 +302,8 @@ TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( if (model->metadata()) { for (int i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); - if (strncmp(metadata->name()->c_str(), "OfflineMemoryAllocation", - strlen("OfflineMemoryAllocation")) == 0) { + if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata, + strlen(kOfflineMemAllocMetadata)) == 0) { const flatbuffers::Vector>* buffers = model->buffers(); auto* buffer = (*buffers)[metadata->buffer()]; @@ -365,7 +341,8 @@ TfLiteStatus AllocationInfoBuilder::AddScratchBuffers( return kTfLiteOk; } -TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MemoryPlanner* planner, +TfLiteStatus CreatePlan(ErrorReporter* error_reporter, + GreedyMemoryPlanner* planner, const AllocationInfo* allocation_info, size_t allocation_info_size) { // Add the tensors to our allocation plan. @@ -380,10 +357,9 @@ TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MemoryPlanner* planner, current->first_created, current->last_used)); } else { TF_LITE_ENSURE_STATUS( - (static_cast(planner)) - ->AddBuffer(error_reporter, aligned_bytes_required, - current->first_created, current->last_used, - current->offline_offset)); + planner->AddBuffer(error_reporter, aligned_bytes_required, + current->first_created, current->last_used, + current->offline_offset)); } } } diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 70f16c78d79..10373d3c034 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -22,6 +22,8 @@ limitations under the License. #include #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/micro/memory_helpers.h" + namespace tflite { namespace { @@ -100,6 +102,35 @@ const char* AllocTypeName(TfLiteAllocationType type) { } } // namespace +// Helper function to print model flatbuffer data. This function is not called +// by default. Hence it's not linked in to the final binary code. 
+void PrintModelData(const Model* model, ErrorReporter* error_reporter) { + auto* subgraphs = model->subgraphs(); + const SubGraph* subgraph = (*subgraphs)[0]; + const flatbuffers::Vector>* tensors = + subgraph->tensors(); + const flatbuffers::Vector>* buffers = + model->buffers(); + TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: ====="); + for (int i = 0; i < tensors->size(); ++i) { + const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i); + auto* quantization = flatbuffer_tensor.quantization(); + size_t type_size, tensor_size; + auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]; + auto* array = buffer->data(); + int array_size = 0; + if (array) { + array_size = array->size(); + } + BytesRequiredForTensor(flatbuffer_tensor, &tensor_size, &type_size, + error_reporter); + TF_LITE_REPORT_ERROR( + error_reporter, + "Tensor index: %d arena tensor %d size %d ", + i, !array_size && !flatbuffer_tensor.is_variable(), tensor_size); + } +} + // Prints a dump of what tensors and what nodes are in the interpreter. void PrintInterpreterState(MicroInterpreter* interpreter) { printf("Interpreter has %zu tensors and %zu nodes\n", diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.h b/tensorflow/lite/micro/micro_optional_debug_tools.h index ae96b62ab3c..cc9630e6f12 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.h +++ b/tensorflow/lite/micro/micro_optional_debug_tools.h @@ -20,6 +20,9 @@ limitations under the License. #include "tensorflow/lite/micro/micro_interpreter.h" namespace tflite { +// Helper function to print model flatbuffer data. This function is not called +// by default. Hence it's not linked in to the final binary code. +void PrintModelData(const Model* model, ErrorReporter* error_reporter); // Prints a dump of what tensors and what nodes are in the interpreter. void PrintInterpreterState(MicroInterpreter* interpreter); } // namespace tflite From 7d3237ca0951e102dfcc04f5bd98e0bc1fa1e22c Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Mon, 27 Apr 2020 13:16:49 +0200 Subject: [PATCH 0041/1390] Address reviewer comments. 
--- tensorflow/lite/micro/micro_allocator_test.cc | 65 ++++++++++--------- tensorflow/lite/micro/test_helpers.cc | 56 ++++++++-------- tensorflow/lite/micro/test_helpers.h | 3 +- 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 45bc3b06b24..b5db8fdd626 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -247,22 +247,22 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { // The structure is identical to the one in // TestAllocationForModelsWithBranches - std::vector node_list = { - { - {0}, // input - {1} // output - }, - { - {0}, // input - {2} // output - }, - { - {1, 2}, // input1, input2 - {3} // output - }}; + int num_conns = 3; + tflite::testing::NodeConnection node_list[3] = {{ + {0}, // input + {1} // output + }, + { + {0}, // input + {2} // output + }, + { + {1, 2}, // input1, input2 + {3} // output + }}; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list); + nbr_tensors, metadata_buffer, node_list, num_conns); TfLiteContext context; constexpr size_t arena_size = 4096; @@ -296,21 +296,22 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { int t2 = 2; int t3 = 3; - std::vector node_list = {{ - {t0}, // input - {t1} // output - }, - { - {t1}, // input - {t2} // output - }, - { - {t2}, // input - {t3} // output - }}; + int num_conns = 3; + tflite::testing::NodeConnection node_list[3] = {{ + {t0}, // input + {t1} // output + }, + { + {t1}, // input + {t2} // output + }, + { + {t2}, // input + {t3} // output + }}; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list); + nbr_tensors, metadata_buffer, node_list, num_conns); TfLiteContext context; constexpr size_t arena_size = 4096; @@ -342,7 +343,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { int t2 = 2; int t3 = 3; - std::vector node_list = { + int num_conns = 2; + tflite::testing::NodeConnection node_list[2] = { { {t0, t1}, // input, scratch {t2} // output @@ -354,7 +356,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list); + nbr_tensors, metadata_buffer, node_list, num_conns); TfLiteContext context; constexpr size_t arena_size = 4096; @@ -389,7 +391,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { int t3 = 3; int t4 = 4; - std::vector node_list = { + int num_conns = 2; + tflite::testing::NodeConnection node_list[2] = { { {t0, t1}, // input, scratch {t2}, // output @@ -401,7 +404,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list); + nbr_tensors, metadata_buffer, node_list, num_conns); TfLiteContext context; constexpr size_t arena_size = 4096; diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index b39d3b2916f..f52ebdc4d45 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -95,8 +95,7 @@ class ModelBuilder { std::initializer_list outputs); void AddMetadata(const char* description_string, - const int32_t* metadata_buffer_data, - size_t num_elements); + const int32_t* metadata_buffer_data, size_t num_elements); // Constructs the flatbuffer model using `builder_` and return a 
pointer to // it. The returned model has the same lifetime as `builder_`. @@ -157,16 +156,15 @@ ModelBuilder::Node ModelBuilder::AddNode( } void ModelBuilder::AddMetadata(const char* description_string, - const int32_t* metadata_buffer_data, - size_t num_elements) { + const int32_t* metadata_buffer_data, + size_t num_elements) { metadata_[ModelBuilder::nbr_of_metadata_buffers_] = - CreateMetadata(*builder_, - builder_->CreateString(description_string), - 1 + ModelBuilder::nbr_of_metadata_buffers_); + CreateMetadata(*builder_, builder_->CreateString(description_string), + 1 + ModelBuilder::nbr_of_metadata_buffers_); - metadata_buffers_[nbr_of_metadata_buffers_] = tflite::CreateBuffer(*builder_, - builder_->CreateVector((uint8_t*)metadata_buffer_data, - sizeof(uint32_t) * num_elements)); + metadata_buffers_[nbr_of_metadata_buffers_] = tflite::CreateBuffer( + *builder_, builder_->CreateVector((uint8_t*)metadata_buffer_data, + sizeof(uint32_t) * num_elements)); ModelBuilder::nbr_of_metadata_buffers_++; } @@ -175,14 +173,14 @@ const Model* ModelBuilder::BuildModel( std::initializer_list inputs, std::initializer_list outputs) { // Model schema requires an empty buffer at idx 0. - size_t kBufferSize = 1 + ModelBuilder::nbr_of_metadata_buffers_; - flatbuffers::Offset buffers[kBufferSize]; + size_t buffer_size = 1 + ModelBuilder::nbr_of_metadata_buffers_; + flatbuffers::Offset buffers[kMaxMetadataBuffers]; buffers[0] = tflite::CreateBuffer(*builder_); // Place the metadata buffers first in the buffer since the indices for them // have already been set in AddMetadata() for (int i = 1; i < ModelBuilder::nbr_of_metadata_buffers_ + 1; ++i) { - buffers[i] = metadata_buffers_[i - 1]; + buffers[i] = metadata_buffers_[i - 1]; } // TFLM only supports single subgraph. 
@@ -202,16 +200,16 @@ const Model* ModelBuilder::BuildModel( builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), builder_->CreateString("teset_model"), - builder_->CreateVector(buffers, kBufferSize), - 0, - builder_->CreateVector(metadata_, ModelBuilder::nbr_of_metadata_buffers_)); + builder_->CreateVector(buffers, buffer_size), 0, + builder_->CreateVector(metadata_, + ModelBuilder::nbr_of_metadata_buffers_)); } else { model_offset = tflite::CreateModel( *builder_, 0, builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), builder_->CreateString("teset_model"), - builder_->CreateVector(buffers, kBufferSize)); + builder_->CreateVector(buffers, buffer_size)); } tflite::FinishModelBuffer(*builder_, model_offset); @@ -293,8 +291,9 @@ const Model* BuildSimpleModelWithBranch() { } const Model* BuildModelWithOfflinePlanning(int number_of_tensors, - const int32_t* metadata_buffer, - std::vector node_conn) { + const int32_t* metadata_buffer, + NodeConnection* node_conn, + int num_conns) { using flatbuffers::Offset; flatbuffers::FlatBufferBuilder* fb_builder = BuilderInstance(); @@ -310,15 +309,16 @@ const Model* BuildModelWithOfflinePlanning(int number_of_tensors, tensors[i] = model_builder.AddTensor(TensorType_FLOAT32, {2, 2, 3}); } - for (int i = 0; i < node_conn.size(); i++) { + for (int i = 0; i < num_conns; ++i) { model_builder.AddNode(op_id, node_conn[i].input, node_conn[i].output); } - model_builder.AddMetadata("OfflineMemoryAllocation", - metadata_buffer, number_of_tensors + tflite::testing::kOfflinePlannerHeaderSize); + model_builder.AddMetadata( + "OfflineMemoryAllocation", metadata_buffer, + number_of_tensors + tflite::testing::kOfflinePlannerHeaderSize); return model_builder.BuildModel(node_conn[0].input, - node_conn[node_conn.size() - 1].output); + node_conn[num_conns - 1].output); } const Model* BuildSimpleMockModel() { @@ -408,8 +408,7 @@ const Model* BuildComplexMockModel() { constexpr size_t buffers_size = 7; const Offset buffers[buffers_size] = { // Op 1 buffers: - CreateBuffer(*builder), - CreateBuffer(*builder), + CreateBuffer(*builder), CreateBuffer(*builder), CreateBuffer(*builder, builder->CreateVector(buffer_data_1, buffer_data_size)), // Op 2 buffers: @@ -576,9 +575,10 @@ const Model* GetSimpleModelWithBranch() { const Model* GetModelWithOfflinePlanning(int num_tensors, const int32_t* metadata_buffer, - std::vector node_conn) { - const Model* model = - BuildModelWithOfflinePlanning(num_tensors, metadata_buffer, node_conn); + NodeConnection* node_conn, + int num_conns) { + const Model* model = BuildModelWithOfflinePlanning( + num_tensors, metadata_buffer, node_conn, num_conns); return model; } diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index e31f5061de8..647ffb92cff 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -49,7 +49,8 @@ const Model* GetSimpleModelWithBranch(); // Returns a simple flatbuffer model with offline planned tensors const Model* GetModelWithOfflinePlanning(int num_tensors, const int32_t* metadata_buffer, - std::vector node_conn); + NodeConnection* node_conn, + int num_conns); // Returns a flatbuffer model with `simple_stateful_op` const Model* GetSimpleStatefulModel(); From d46acc6016666abfc27d28e35936ed70979c822e Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen" Date: Wed, 29 Apr 2020 15:05:06 +0800 Subject: [PATCH 0042/1390] 
debug_message --- tensorflow/lite/micro/we_i/debug_log.cc | 33 +++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 tensorflow/lite/micro/we_i/debug_log.cc diff --git a/tensorflow/lite/micro/we_i/debug_log.cc b/tensorflow/lite/micro/we_i/debug_log.cc new file mode 100644 index 00000000000..a115d476aff --- /dev/null +++ b/tensorflow/lite/micro/we_i/debug_log.cc @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implementation for the DebugLog() function that prints to the UART on the +// SparkFun Edge microcontroller. The same should work for other targets using +// the Ambiq Apollo 3. + +#include "tensorflow/lite/micro/debug_log.h" +#include "xprintf.h" +#include "console_io.h" +#include + +extern "C" void DebugLog(const char* s) { + static bool is_initialized = false; + if (!is_initialized) { + xprintf_setup(); + is_initialized = true; + } + + xprintf("%s", s); +} From c8a2f59e2e2775808b82c877433ead9545fd7a84 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Wed, 22 Jan 2020 12:53:00 -0800 Subject: [PATCH 0043/1390] Fixing and enabling //tensorflow/core/util:gpu_kernel_helper_test_gpu --- tensorflow/core/util/gpu_device_functions.h | 4 + .../core/util/gpu_kernel_helper_test.cu.cc | 127 +++++++++++------- 2 files changed, 84 insertions(+), 47 deletions(-) diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h index 61d1e3c9453..083d42b7de4 100644 --- a/tensorflow/core/util/gpu_device_functions.h +++ b/tensorflow/core/util/gpu_device_functions.h @@ -53,6 +53,8 @@ using gpuEvent_t = cudaEvent_t; #define gpuEventCreate cudaEventCreate #define gpuEventCreateWithFlags cudaEventCreateWithFlags #define gpuEventDisableTiming cudaEventDisableTiming +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuFree cudaFree #elif TENSORFLOW_USE_ROCM using gpuFloatComplex = hipFloatComplex; using gpuDoubleComplex = hipDoubleComplex; @@ -68,6 +70,8 @@ using cudaError_t = int; #define gpuEventCreate hipEventCreate #define gpuEventCreateWithFlags hipEventCreateWithFlags #define gpuEventDisableTiming hipEventDisableTiming +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuFree hipFree static std::string cudaGetErrorString(int err) { return std::to_string(err); } #endif diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc index c089511e964..3135d25d1b8 100644 --- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU +#include #include #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,14 +26,14 @@ limitations under the License. #define CUDA_EXPECT_SUCCESS \ { \ - cudaDeviceSynchronize(); \ + gpuDeviceSynchronize(); \ cudaError_t err = cudaGetLastError(); \ EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \ } #define CUDA_ASSERT_SUCCESS \ { \ - cudaDeviceSynchronize(); \ + gpuDeviceSynchronize(); \ cudaError_t err = cudaGetLastError(); \ ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \ } @@ -94,8 +95,7 @@ __global__ void Count3D(Gpu3DLaunchConfig config, int bufsize, } } -__global__ void CudaShuffleGetSrcLaneTest( - unsigned* __restrict__ failure_count) { +__global__ void GpuShuffleGetSrcLaneTest(unsigned* __restrict__ failure_count) { unsigned lane_id = GpuLaneId(); for (int width = warpSize; width > 1; width /= 2) { auto check_result = [&](const char* op_name, int param, unsigned actual, @@ -103,31 +103,38 @@ __global__ void CudaShuffleGetSrcLaneTest( if (actual != expected) { printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n", op_name, param, width, lane_id, actual, expected); - CudaAtomicAdd(failure_count, 1); + GpuAtomicAdd(failure_count, 1); } }; + for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) { - unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width); +#if TENSORFLOW_USE_ROCM + if (src_lane < 0 || src_lane >= width) continue; +#endif + unsigned actual_lane = detail::GpuShuffleGetSrcLane(src_lane, width); unsigned expect_lane = - CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width); + GpuShuffleSync(kCudaWarpAll, lane_id, src_lane, width); check_result("Shuffle", src_lane, actual_lane, expect_lane); } + for (unsigned delta = 0; delta <= warpSize; ++delta) { - unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width); + unsigned actual_lane = detail::GpuShuffleUpGetSrcLane(delta, width); unsigned expect_lane = - CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width); + GpuShuffleUpSync(kCudaWarpAll, lane_id, delta, width); check_result("ShuffleUp", delta, actual_lane, expect_lane); } + for (unsigned delta = 0; delta <= warpSize; ++delta) { - unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width); + unsigned actual_lane = detail::GpuShuffleDownGetSrcLane(delta, width); unsigned expect_lane = - CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width); + GpuShuffleDownSync(kCudaWarpAll, lane_id, delta, width); check_result("ShuffleDown", delta, actual_lane, expect_lane); } + for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) { - unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width); + unsigned actual_lane = detail::GpuShuffleXorGetSrcLane(lane_lane, width); unsigned expect_lane = - CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width); + GpuShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width); check_result("ShuffleXor", lane_lane, actual_lane, expect_lane); } } @@ -137,19 +144,32 @@ __global__ void CudaShuffleGetSrcLaneTest( class GpuLaunchConfigTest : public ::testing::Test { protected: - const int bufsize = 1024; + static const int bufsize = 1024; int* outbuf = nullptr; + int* outbuf_host = nullptr; + int hostbuf[bufsize]; Eigen::GpuStreamDevice stream; Eigen::GpuDevice d = Eigen::GpuDevice(&stream); + void copyToHost() { +#if TENSORFLOW_USE_ROCM + hipMemcpy(hostbuf, outbuf, sizeof(int) * 
bufsize, hipMemcpyDeviceToHost); +#endif + } virtual void SetUp() { +#if GOOGLE_CUDA cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize); + outbuf_host = outbuf; +#else + cudaError_t err = hipMalloc(&outbuf, sizeof(int) * bufsize); + outbuf_host = hostbuf; +#endif ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); } virtual void TearDown() { - cudaDeviceSynchronize(); - cudaFree(outbuf); + gpuDeviceSynchronize(); + gpuFree(outbuf); outbuf = nullptr; } }; @@ -158,28 +178,32 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) { GpuLaunchConfig cfg; // test valid inputs -#define TEST_LAUNCH_PARAMETER(work_element_count) \ - cfg = GetGpuLaunchConfig(bufsize, d); \ - TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - outbuf)); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetGpuLaunchConfig(work_element_count, d); \ - TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ - 0, d.stream(), cfg, bufsize, outbuf)); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \ - \ - cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - outbuf)); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \ - TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ - 0, d.stream(), cfg, bufsize, outbuf)); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) +#define TEST_LAUNCH_PARAMETER(work_element_count) \ + cfg = GetGpuLaunchConfig(bufsize, d); \ + TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ + cfg.thread_per_block, 0, d.stream(), cfg, \ + outbuf)); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetGpuLaunchConfig(work_element_count, d); \ + TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ + CUDA_EXPECT_SUCCESS \ + copyToHost(); \ + EXPECT_EQ(work_element_count, \ + std::accumulate(outbuf_host, outbuf_host + bufsize, 0)); \ + \ + cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ + cfg.thread_per_block, 0, d.stream(), cfg, \ + outbuf)); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \ + TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ + CUDA_EXPECT_SUCCESS \ + copyToHost(); \ + EXPECT_EQ(work_element_count, \ + std::accumulate(outbuf_host, outbuf_host + bufsize, 0)); TEST_LAUNCH_PARAMETER(128); TEST_LAUNCH_PARAMETER(129); @@ -221,7 +245,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) { TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \ 0, d.stream(), cfg, bufsize, outbuf)); \ CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + copyToHost(); \ + EXPECT_EQ(dimx* dimy, \ + std::accumulate(outbuf_host, outbuf_host + bufsize, 0)); \ \ cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \ @@ -232,7 +258,8 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) { TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \ 0, d.stream(), cfg, bufsize, outbuf)); \ CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx* dimy, 
std::accumulate(outbuf, outbuf + bufsize, 0)) + copyToHost(); \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf_host, outbuf_host + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128); TEST_LAUNCH_PARAMETER(129, 64); @@ -263,7 +290,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) { TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \ 0, d.stream(), cfg, bufsize, outbuf)); \ CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) + copyToHost(); \ + EXPECT_EQ(dimx* dimy* dimz, \ + std::accumulate(outbuf_host, outbuf_host + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128, 128); TEST_LAUNCH_PARAMETER(129, 64, 1024); @@ -282,15 +311,19 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) { TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { unsigned* failure_count; +#if GOOGLE_CUDA ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess); +#else + ASSERT_EQ(hipHostMalloc(&failure_count, sizeof(unsigned), 0), cudaSuccess); +#endif *failure_count = 0; - TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr, - failure_count)); - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0, + nullptr, failure_count)); + ASSERT_EQ(gpuDeviceSynchronize(), cudaSuccess); ASSERT_EQ(*failure_count, 0); - cudaFree(failure_count); + gpuFree(failure_count); } } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From e5d5522d827e1c60b3ac830000b4489206480f95 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 4 May 2020 14:12:51 +0100 Subject: [PATCH 0044/1390] Both inputs should be in int16 for MUL operator. Some networks have one of inputs as a constant. --- tensorflow/lite/tools/optimize/operator_property.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 38c34706fbe..2ffe9fa3671 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -807,6 +807,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, case BuiltinOperator_MUL: property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_PACK: From 0ee6b3a69da0f17d14d40e34b7008651012638da Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 May 2020 22:29:23 +0000 Subject: [PATCH 0045/1390] Return ValueError in case of empty list input for tf.map_fn This PR tries to address the issue raised in 39229 where empty lists input was not checked and throw out a non-obvious error: ```python >>> import numpy as np >>> import tensorflow as tf >>> fn = lambda x: x >>> tf.map_fn(fn, []) Traceback (most recent call last): File "", line 1, in File "/Library/Python/3.7/site-packages/tensorflow/python/util/deprecation.py", line 574, in new_func return func(*args, **kwargs) File "/Library/Python/3.7/site-packages/tensorflow/python/ops/map_fn.py", line 425, in map_fn_v2 name=name) File "/Library/Python/3.7/site-packages/tensorflow/python/ops/map_fn.py", line 213, in map_fn static_shape = elems_flat[0].shape IndexError: list index out of range >>> ``` In case of empty list the behavior is undefined as we even don't know the output dtype. This PR update to perform a check and thrown out `ValueError("elems must not be empty")` to help clarify. 
This PR fixes 39229. Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/map_fn_test.py | 6 ++++++ tensorflow/python/ops/map_fn.py | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1e10d689886..9825939933e 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -217,6 +217,12 @@ class MapFnTest(test.TestCase): self.assertAllEqual([0, 3, 2], map_return.get_shape().dims) self.assertAllEqual([0, 3, 2], self.evaluate(map_return).shape) + @test_util.run_in_graph_and_eager_modes + def testMapEmptyList(self): + x = [] + with self.assertRaisesRegexp( + ValueError, r"elems must be a Tensor or"): + _ = map_fn.map_fn(lambda e: e, x) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..6f59bcf5599 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -375,6 +375,13 @@ def map_fn(fn, # Flatten the input tensors, and get the TypeSpec for each one. elems_flat = nest.flatten(elems) + + # Check in case this is an empty list + if len(elems_flat) == 0: + raise ValueError( + "elems must be a Tensor or (possibly nested) sequence of Tensors. " + "Got {}, which does not contain any Tensors.".format(elems)) + elems_flat_signature = [type_spec.type_spec_from_value(e) for e in elems_flat] elems_unflatten = lambda x: nest.pack_sequence_as(elems, x) From 28f2af10ecdde4ab8e24247a728032ea1891d730 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Tue, 12 May 2020 17:41:28 +0200 Subject: [PATCH 0046/1390] Realign AllocationInfo struct. After adding offline_offset, sizeof(AllocationInfo) = 40, which caused hello_world_test to crash. After realigning it's back to its original size (32). --- tensorflow/lite/micro/micro_allocator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index f1c1d65f1cc..5ffda9209d9 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -34,11 +34,11 @@ namespace { // Used to hold information used during allocation calculations. 
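The commit message above explains the size jump purely in terms of member order and padding; the arithmetic can be checked in isolation. A minimal sketch for a typical LP64 target — the struct names are illustrative, and the member lists mirror the hunk that follows:

```c++
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Old order: 8 (size_t) + 4 + 4 + 1 + 7 bytes of padding before the 8-byte
// pointer + 8 + 4 + 4 bytes of tail padding = 40 bytes.
struct AllocationInfoBefore {
  size_t bytes;
  int first_created;
  int last_used;
  bool needs_allocating;
  void** output_ptr;
  int32_t offline_offset;
};

// New order: both 8-byte members first and the bool last, so the only padding
// left is 3 trailing bytes: 8 + 8 + 4 + 4 + 4 + 1 + 3 = 32 bytes.
struct AllocationInfoAfter {
  size_t bytes;
  void** output_ptr;
  int first_created;
  int last_used;
  int32_t offline_offset;
  bool needs_allocating;
};

int main() {
  std::printf("before: %zu bytes, after: %zu bytes\n",
              sizeof(AllocationInfoBefore), sizeof(AllocationInfoAfter));
  return 0;
}
```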
struct AllocationInfo { size_t bytes; + void** output_ptr; int first_created; int last_used; - bool needs_allocating; - void** output_ptr; int32_t offline_offset; + bool needs_allocating; }; // We align tensor buffers to 16-byte boundaries, since this is a common From 489926629dea271e28417a3c427bf698b7d21d64 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Tue, 12 May 2020 15:26:43 -0700 Subject: [PATCH 0047/1390] [Intel MKL] Adding MklTanh op --- .../core/common_runtime/mkl_layout_pass.cc | 14 +- .../common_runtime/mkl_layout_pass_test.cc | 60 ++++++ tensorflow/core/kernels/BUILD | 21 ++ tensorflow/core/kernels/mkl_relu_op.cc | 65 ++++-- tensorflow/core/kernels/mkl_relu_op_test.cc | 193 ++++++++++++++++++ 5 files changed, 325 insertions(+), 28 deletions(-) create mode 100644 tensorflow/core/kernels/mkl_relu_op_test.cc diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 2941845a604..f4de923cb2d 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -675,18 +675,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back( {csinfo_.requantize, mkl_op_registry::GetMklOpName(csinfo_.requantize), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); - // Disable these two MKL operators for now due to some test failures caused - // by these two ops - /* - rinfo_.push_back({csinfo_.tanh, - mkl_op_registry::GetMklOpName(csinfo_.tanh), + rinfo_.push_back({csinfo_.tanh, mkl_op_registry::GetMklOpName(csinfo_.tanh), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); - rinfo_.push_back({csinfo_.tanh_grad, - mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), - CopyAttrsAll, AlwaysRewrite, - kRewriteForLayoutPropagation}); - */ + rinfo_.push_back( + {csinfo_.tanh_grad, mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.reshape, mkl_op_registry::GetMklOpName(csinfo_.reshape), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); diff --git a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc index c6d5331852e..daa7f42620c 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc @@ -2949,6 +2949,66 @@ TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluLeakyReluGrad_Positive) { "DMT/_1->C:2"); } +// clang-format off +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); \ + InitGraph( \ + "node { name: 'A' op: '" #INPUT "'}" \ + "node { name: 'B' op: 'Tanh'" \ + " attr { key: 'T' value { type: " #T " } }" \ + " input: ['A'] }" \ + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: " #T " } }" \ + " input: ['A', 'B'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT ");B(_MklTanh);C(Zeta);DMT/_0(Const)|A->B;A->C;" \ + "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1"); \ +} +REGISTER_TEST_ALL_TYPES(NodeRewrite_Tanh_Positive); +#undef REGISTER_TEST + +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); \ + InitGraph( \ + "node { name: 'A' op: '" #INPUT "'}" \ + "node { name: 'B' op: '" #INPUT "'}" \ + "node { name: 'C' op: 'TanhGrad'" \ + " attr { key: 'T' value { type: " #T " } }" \ + " 
input: ['A', 'B'] }" \ + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: " #T " } }" \ + " input: ['A', 'C'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT ");B(" #INPUT ");C(_MklTanhGrad);D(Zeta);DMT/_0(Const);" \ + "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;" \ + "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3"); \ +} +REGISTER_TEST_ALL_TYPES(NodeRewrite_TanhGrad_Positive); +#undef REGISTER_TEST + +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); \ + InitGraph( \ + "node { name: 'A' op: '" #INPUT "'}" \ + "node { name: 'B' op: 'Tanh'" \ + " attr { key: 'T' value { type: " #T " } }" \ + " input: ['A'] }" \ + "node { name: 'C' op: 'TanhGrad'" \ + " attr { key: 'T' value { type: " #T " } }" \ + " input: ['B', 'A'] }" \ + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: " #T " } }" \ + " input: ['A', 'C'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT ");B(_MklTanh);C(_MklTanhGrad);D(Zeta);DMT/_0(Const);" \ + "DMT/_1(Const)|A->B;A->C:1;A->D;A:control->DMT/_0:control;" \ + "B->C;B:1->C:2;B:control->DMT/_1:control;C->D:1;DMT/_0->B:1;" \ + "DMT/_1->C:3"); \ +} +REGISTER_TEST_ALL_TYPES(NodeRewrite_TanhTanhGrad_Positive); +#undef REGISTER_TEST +// clang-format on + TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) { InitGraph( "node { name: 'A' op: 'Input'}" diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index e47c681bb61..3f6c2fbfb04 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8084,6 +8084,27 @@ tf_cc_test_mkl( ], ) +tf_cc_test_mkl( + name = "mkl_relu_op_test", + size = "small", + srcs = ["mkl_relu_op_test.cc"], + linkstatic = 1, # Fixes dyld error on MacOS. + deps = [ + ":ops_testutil", + ":ops_util", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_mkl_kernel_library( name = "mkl_tfconv_op", prefix = "mkl_tfconv", diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index ffbc1e28355..7f885bfdc01 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -27,6 +26,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::algorithm; using mkldnn::eltwise_forward; @@ -266,15 +266,19 @@ class MklEltwiseBwdParams { algorithm alg_kind; float alpha; float beta; + // Whether the input that grad op gets from forward op is SRC + // of forward op or DST of forward op. 
+ int forward_input_type; MklEltwiseBwdParams(const memory::dims& src_dims, const memory::desc& common_md, algorithm alg_kind, - float alpha, float beta) + float alpha, float beta, int forward_input_type) : src_dims(src_dims), common_md(common_md), alg_kind(alg_kind), alpha(alpha), - beta(beta) {} + beta(beta), + forward_input_type(forward_input_type) {} }; template @@ -430,7 +434,7 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { // Create eltwise primitive and add it to net. context_.eltwise_bwd.reset(new mkldnn::eltwise_backward(*context_.bwd_pd)); context_.bwd_primitives_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, + {{bwdParams.forward_input_type, *context_.src_mem}, {MKLDNN_ARG_DIFF_DST, *context_.diff_dst_mem}, { MKLDNN_ARG_DIFF_SRC, *context_.diff_src_mem }}); @@ -631,14 +635,30 @@ class MklReluGradOpBase : public OpKernel { virtual void Compute_Scalar(OpKernelContext* context) = 0; + // All activation functions that are part of NN ops, such as Relu, Elu, + // LeakyRelu, Relu6, etc have dy at index 0 and y at index 1. + // + // if forward op is defined as: y = f(x), + // {Relu,Elu,Relu6,LeakyRelu}Grad is: z = f_grad(dy,x) + // TanhGrad is: z = tanh_grad(y,dy) + // + // Src below refers to a tensor that gradient op receives from forward + // operator. From Relu-family ops, it is 'x'; while for TanhGrad, it is 'y'. + virtual int GetDiffDstIndex() const { return 0; } + virtual int GetSrcIndex() const { return 1; } + virtual int GetDiffSrcIndex() const { return 0; } + // What is the type of input tensor that grad op receives from forward op -- + // is it 'x' (SRC) or 'y' (DST). For Relu-family, it is 'x', so fwd op SRC. + virtual int GetTypeOfInputTensorFromFwdOp() const { return MKLDNN_ARG_SRC; } + void Compute(OpKernelContext* context) { try { MklDnnData src(&cpu_engine); MklDnnData diff_dst(&cpu_engine); - const size_t diff_dst_index = 0; // index of diff_dst input tensor - const size_t src_index = 1; // index of src input tensor - const size_t diff_src_index = 0; // index of diff_src output tensor + size_t diff_dst_index = GetDiffDstIndex(); + size_t src_index = GetSrcIndex(); + const size_t diff_src_index = GetDiffSrcIndex(); const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); @@ -722,7 +742,7 @@ class MklReluGradOpBase : public OpKernel { } MklEltwiseBwdParams bwdParams(src_dims, common_md, alg_kind, alpha_, - beta_); + beta_, GetTypeOfInputTensorFromFwdOp()); MklEltwiseBwdPrimitive* eltwise_bwd = MklEltwiseBwdPrimitiveFactory::Get(bwdParams); @@ -976,18 +996,28 @@ class MklTanhOp : public MklReluOpBase { template class MklTanhGradOp - : public MklReluGradOpBase { + : public MklReluGradOpBase { public: ~MklTanhGradOp() {} explicit MklTanhGradOp(OpKernelConstruction* context) - : MklReluGradOpBase(context, 0.0f, - 0.0f) {} + : MklReluGradOpBase( + context, 0.0f, 0.0f) {} + + virtual int GetDiffDstIndex() const { return 1; } + virtual int GetSrcIndex() const { return 0; } + virtual int GetDiffSrcIndex() const { return 0; } + + // TanhGrad gets 'y' from Tanh, where 'y' is output of Tanh(x). 
+ virtual int GetTypeOfInputTensorFromFwdOp() const { return MKLDNN_ARG_DST; } virtual void Compute_Scalar(OpKernelContext* context) { - const size_t diff_dst_index = 0; // index of diff_dst input tensor - const size_t src_index = 1; // index of src input tensor - const size_t diff_src_index = 0; // index of diff_src output tensor + // NOTE: Order of y and dy for Tanh is reverse of that for Relu/Elu/other + // element-wise ops. Tanh is math op in Tensorflow; others are NN ops. + const size_t diff_dst_index = GetDiffDstIndex(); + const size_t src_index = GetSrcIndex(); + const size_t diff_src_index = GetDiffSrcIndex(); const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -1003,10 +1033,9 @@ class MklTanhGradOp void* user_i = static_cast(const_cast(src_tensor.flat().data())); // gradient of tanh(x) = 1 - tanh(x)^2 - T feature = (static_cast(user_i))[0]; - T e1 = std::exp(feature); - T e2 = std::exp(-feature); - T tanh = (e1 - e2) / (e1 + e2); + // Input to TanhGrad is output of Tanh. So we do not need to compute + // Tanh again. + T tanh = (static_cast(user_i))[0]; void* user_g = static_cast(const_cast(diff_dst_tensor.flat().data())); (static_cast(out_o))[0] = diff --git a/tensorflow/core/kernels/mkl_relu_op_test.cc b/tensorflow/core/kernels/mkl_relu_op_test.cc new file mode 100644 index 00000000000..7a3dffef0de --- /dev/null +++ b/tensorflow/core/kernels/mkl_relu_op_test.cc @@ -0,0 +1,193 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#undef INTEL_MKL + +#ifdef INTEL_MKL + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +#include "mkldnn.hpp" +#include "tensorflow/core/util/mkl_util.h" + +// Compare performance of default Tensorflow convolution kernels (Eigen) with +// MKL kernels on CPU. 
+ +// Before running these benchmarks configure OpenMP environment variables: +// export KMP_BLOCKTIME=0 +// export OMP_NUM_THREADS=${num_threads} + +namespace tensorflow { +static Tensor NonMklTensor() { + MklDnnShape non_mkl_shape; + non_mkl_shape.SetMklTensor(false); + + auto size = static_cast(non_mkl_shape.GetSerializeBufferSize()); + Tensor tensor(DT_UINT8, {size}); + + non_mkl_shape.SerializeMklDnnShape(tensor.flat().data(), + size * sizeof(uint8)); + return tensor; +} + +static Tensor GetRandomTensor(const TensorShape& shape) { + Tensor tensor(DT_FLOAT, TensorShape(shape)); + tensor.flat() = tensor.flat().setRandom(); + return tensor; +} + +#define CREATE_DEFAULT_FWD_OP(NODE_NAME, OP_NAME) \ + static Graph* NODE_NAME(const TensorShape& shape) { \ + auto* graph = new Graph(OpRegistry::Global()); \ + Tensor input_t = GetRandomTensor(shape); \ + Node* input = test::graph::Constant(graph, input_t, "input"); \ + Node* op; \ + TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ + .Input(input) \ + .Attr("T", DT_FLOAT) \ + .Finalize(graph, &op)); \ + return graph; \ + } +CREATE_DEFAULT_FWD_OP(Default_Tanh, Tanh) +CREATE_DEFAULT_FWD_OP(Default_Elu, Elu) +CREATE_DEFAULT_FWD_OP(Default_Relu, Relu) +CREATE_DEFAULT_FWD_OP(Default_Relu6, Relu6) +CREATE_DEFAULT_FWD_OP(Default_LeakyRelu, LeakyRelu) + +#define CREATE_DEFAULT_BWD_OP(NODE_NAME, OP_NAME) \ + static Graph* NODE_NAME(const TensorShape& shape) { \ + auto* graph = new Graph(OpRegistry::Global()); \ + Tensor input_t = GetRandomTensor(shape); \ + Node* input = test::graph::Constant(graph, input_t, "input"); \ + Tensor grad_t = GetRandomTensor(shape); \ + Node* grad = test::graph::Constant(graph, grad_t, "grad"); \ + Node* op; \ + TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ + .Input(grad) \ + .Input(input) \ + .Attr("T", DT_FLOAT) \ + .Finalize(graph, &op)); \ + return graph; \ + } +CREATE_DEFAULT_BWD_OP(Default_TanhGrad, TanhGrad) +CREATE_DEFAULT_BWD_OP(Default_EluGrad, EluGrad) +CREATE_DEFAULT_BWD_OP(Default_ReluGrad, ReluGrad) +CREATE_DEFAULT_BWD_OP(Default_Relu6Grad, Relu6Grad) +CREATE_DEFAULT_BWD_OP(Default_LeakyReluGrad, LeakyReluGrad) + +#define CREATE_MKL_FWD_OP(NODE_NAME, OP_NAME) \ + static Graph* NODE_NAME(const TensorShape& shape) { \ + auto* graph = new Graph(OpRegistry::Global()); \ + \ + Tensor input_t = GetRandomTensor(shape); \ + Node* input = test::graph::Constant(graph, input_t, "input"); \ + \ + Node* not_mkl_shape = \ + test::graph::Constant(graph, NonMklTensor(), "not_mkl"); \ + \ + Node* op; \ + TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ + .Input(input) \ + .Input(not_mkl_shape) \ + .Attr("T", DT_FLOAT) \ + .Attr("_kernel", "MklLayoutDependentOp") \ + .Finalize(graph, &op)); \ + \ + return graph; \ + } + +CREATE_MKL_FWD_OP(Mkl_Tanh, _MklTanh) +CREATE_MKL_FWD_OP(Mkl_Elu, _MklElu) +CREATE_MKL_FWD_OP(Mkl_Relu, _MklRelu) +CREATE_MKL_FWD_OP(Mkl_Relu6, _MklRelu6) +CREATE_MKL_FWD_OP(Mkl_LeakyRelu, _MklLeakyRelu) + +#define CREATE_MKL_BWD_OP(NODE_NAME, OP_NAME) \ + static Graph* NODE_NAME(const TensorShape& shape) { \ + auto* graph = new Graph(OpRegistry::Global()); \ + \ + Tensor input_t = GetRandomTensor(shape); \ + Node* input = test::graph::Constant(graph, input_t, "input"); \ + Tensor grad_t = GetRandomTensor(shape); \ + Node* grad = test::graph::Constant(graph, grad_t, "grad"); \ + \ + Node* not_mkl_shape = \ + test::graph::Constant(graph, NonMklTensor(), "not_mkl"); \ + \ + Node* op; \ + TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ + .Input(grad) \ 
+ .Input(input) \ + .Input(not_mkl_shape) \ + .Input(not_mkl_shape) \ + .Attr("T", DT_FLOAT) \ + .Attr("_kernel", "MklLayoutDependentOp") \ + .Finalize(graph, &op)); \ + \ + return graph; \ + } + +CREATE_MKL_BWD_OP(Mkl_TanhGrad, _MklTanhGrad) +CREATE_MKL_BWD_OP(Mkl_EluGrad, _MklEluGrad) +CREATE_MKL_BWD_OP(Mkl_ReluGrad, _MklReluGrad) +CREATE_MKL_BWD_OP(Mkl_Relu6Grad, _MklRelu6Grad) +CREATE_MKL_BWD_OP(Mkl_LeakyReluGrad, _MklLeakyReluGrad) + +#define BM_Activation(op, kind, A, B, C, D, type) \ + static void BM_##op##_##kind##_##type##_##A##_##B##_##C##_##D(int iters) { \ + int64 num_computed_elements = (A) * (B) * (C) * (D); \ + int64 flops_per_iter = num_computed_elements; \ + testing::ItemsProcessed(static_cast(iters) * flops_per_iter); \ + \ + test::Benchmark(#type, kind##_##op({A, B, C, D})).Run(iters); \ + } \ + BENCHMARK(BM_##op##_##kind##_##type##_##A##_##B##_##C##_##D) + +#define BM(op, A, B, C, D, type) \ + BM_Activation(op, Default, A, B, C, D, type); \ + BM_Activation(op, Mkl, A, B, C, D, type); + +#define TEST_ALL_SIZES(OP) \ + BM(OP, 2, 4, 8, 16, cpu); \ + BM(OP, 3, 5, 9, 17, cpu); \ + BM(OP, 32, 64, 128, 256, cpu); \ + BM(OP, 33, 65, 129, 257, cpu); + +TEST_ALL_SIZES(Tanh) +TEST_ALL_SIZES(TanhGrad) +TEST_ALL_SIZES(Relu) +TEST_ALL_SIZES(ReluGrad) +TEST_ALL_SIZES(Elu) +TEST_ALL_SIZES(EluGrad) +TEST_ALL_SIZES(Relu6) +TEST_ALL_SIZES(Relu6Grad) +TEST_ALL_SIZES(LeakyRelu) +TEST_ALL_SIZES(LeakyReluGrad) + +} // namespace tensorflow + +#endif // INTEL_MKL From f87cf9c19b393657fb4971dd9513f44d73589b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kubov=C4=8D=C3=ADk?= Date: Mon, 18 May 2020 00:49:25 +0200 Subject: [PATCH 0048/1390] add lock file --- tensorflow/python/keras/saving/hdf5_format.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index f3adb2d0695..93f06c018d1 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -53,7 +53,24 @@ sequential_lib = LazyLoader( # pylint:enable=g-inconsistent-quotes -def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): +# create lock file +def create_lockfile(filepath): + lockfile_path = f"{filepath}.lock" + + f = open(lockfile_path, 'w') + f.write(f"{os.getpid()}") + f.close() + + return lockfile_path + +def check_lockfile(filepath): + lockfile_path = f"{filepath}.lock" + if os.path.exists(lockfile_path): + # use PID? + return True + return False + +def save_model_to_hdf5(model, filepath, overwrite=True, lockFile=True, include_optimizer=True): """Saves a model to a HDF5 file. The saved model contains: @@ -99,6 +116,10 @@ def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): if not proceed: return + # create lock file + if (lockFile == True): + lockfile_path = create_lockfile(filepath) + f = h5py.File(filepath, mode='w') opened_new_file = True else: @@ -129,6 +150,10 @@ def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): if opened_new_file: f.close() + # remove lock file + if (lockFile == True): + os.remove(lockfile_path) + def load_model_from_hdf5(filepath, custom_objects=None, compile=True): # pylint: disable=redefined-builtin """Loads a model saved via `save_model_to_hdf5`. 
@@ -163,6 +188,10 @@ def load_model_from_hdf5(filepath, custom_objects=None, compile=True): # pylint opened_new_file = not isinstance(filepath, h5py.File) if opened_new_file: + # check if lock file exist + if check_lockfile(filepath) == True: + raise ValueError('Cannot read from file at this time.') + f = h5py.File(filepath, mode='r') else: f = filepath From 93cc43bef97f4371379c2ea6e87b260a2a2cf7af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kubov=C4=8D=C3=ADk?= Date: Mon, 18 May 2020 00:54:34 +0200 Subject: [PATCH 0049/1390] add lockFile argument to save_model() --- tensorflow/python/keras/saving/save.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 43c09a62ea9..100fb05943a 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -48,6 +48,7 @@ _KERAS_SAVED_MODEL_STILL_EXPERIMENTAL = True def save_model(model, filepath, overwrite=True, + lockFile=True, include_optimizer=True, save_format=None, signatures=None, @@ -95,6 +96,7 @@ def save_model(model, overwrite: Whether we should overwrite any existing model at the target location, or instead ask the user with a manual prompt. include_optimizer: If True, save optimizer's state together. + lockFile: If True, protect model file while saving model. save_format: Either 'tf' or 'h5', indicating whether to save the model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, and 'h5' in TF 1.X. @@ -128,7 +130,7 @@ def save_model(model, 'to the Tensorflow SavedModel format (by setting save_format="tf") ' 'or using `save_weights`.') hdf5_format.save_model_to_hdf5( - model, filepath, overwrite, include_optimizer) + model, filepath, overwrite, lockFile, include_optimizer) else: saved_model_save.save(model, filepath, overwrite, include_optimizer, signatures, options) From 80fc2a1dd49afe2e22d7ea6621cba6aefb38818c Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 19 May 2020 16:09:25 -0700 Subject: [PATCH 0050/1390] Added aarch64 dockerfiles --- tensorflow/tools/dockerfiles/spec.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 436ef41c15a..f68f627e1d0 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -50,6 +50,7 @@ releases: - "{ubuntu-devel-ppc64le}{jupyter}" - "{ubuntu-horovod}{jupyter}" - "{ubuntu-devel-horovod}{jupyter}" + - "{ubuntu-devel-aarch64}{jupyter}" slice_sets: @@ -109,6 +110,22 @@ slice_sets: args: - CHECKOUT_TF_SRC=1 + ubuntu-devel-aarch64: + - add_to_name: "devel-aarch64" + dockerfile_exclusive_name: "devel-cpu-aarch64" + dockerfile_subdirectory: "aarch64" + partials: + - ubuntu/version + - ubuntu/devel-cpu + - ubuntu/python + - ubuntu/bazelbuild + - shell + tests: + - build-cpu.sh + args: + - UBUNTU_VERSION=18.04 + - CHECKOUT_TF_SRC=1 + ubuntu-horovod: - add_to_name: "-horovod" dockerfile_exclusive_name: "horovod" From 9df2c846ee87174dc35aabd301f9f9eb1df89c7f Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 19 May 2020 16:15:50 -0700 Subject: [PATCH 0051/1390] Update bazelbuild.partial.Dockerfile --- .../ubuntu/bazelbuild.partial.Dockerfile | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile 
b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile index 2b4761abc39..d8f88672a84 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile @@ -7,17 +7,20 @@ RUN apt-get update && apt-get install -y \ virtualenv \ swig +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-h5py \ + python3-keras-preprocessing \ + python3-matplotlib \ + python3-mock \ + python3-numpy \ + python3-scipy \ + python3-sklearn \ + python3-pandas \ + python3-future \ + python3-portpicker + RUN python3 -m pip --no-cache-dir install \ - Pillow \ - h5py \ - keras_preprocessing \ - matplotlib \ - mock \ - numpy \ - scipy \ - sklearn \ - pandas \ - portpicker \ enum34 # Build and install bazel From 3d2e5f8c1dd13f9d066621867a4316ca190fcd06 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 19 May 2020 16:39:44 -0700 Subject: [PATCH 0052/1390] Update bazelbuild.partial.Dockerfile --- .../dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile index d8f88672a84..6c050496324 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile @@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \ python3-scipy \ python3-sklearn \ python3-pandas \ - python3-future \ python3-portpicker RUN python3 -m pip --no-cache-dir install \ From fffea34c27c8b177c5ca5e2de00b3d8628c669f7 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Thu, 21 May 2020 10:55:42 -0700 Subject: [PATCH 0053/1390] Update bazelbuild.partial.Dockerfile --- .../dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile index 6c050496324..b833657aa69 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile @@ -22,7 +22,7 @@ RUN apt-get update && apt-get install -y \ RUN python3 -m pip --no-cache-dir install \ enum34 - # Build and install bazel +# Build and install bazel ENV BAZEL_VERSION 3.0.0 WORKDIR / RUN mkdir /bazel && \ From ab809024a4a5b0887c360b3e5542c149f4a5f14d Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Tue, 19 May 2020 15:53:07 -0700 Subject: [PATCH 0054/1390] Enable wider vector for reluGrad --- tensorflow/core/kernels/relu_op_gpu.cu.cc | 87 +++++++++++++++++++++-- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 27fd5f64249..ca1a4235f3a 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,6 +35,7 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; +static constexpr int VectorSize = 8; namespace functor { // This kernel computes ReluGrad by processing one half2, two fp16, at a time. 
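The choice of `VectorSize = 8` above ties directly to the 16-byte alignment check introduced further down in this patch: eight `Eigen::half` values occupy exactly one `float4`. A standalone sketch of that arithmetic and of the alignment test the new fast path relies on — the helper name is illustrative, not taken from the patch:

```c++
#include <cstdint>

// Eight 16-bit half values span 16 bytes, i.e. exactly one float4 load/store.
static_assert(8 * sizeof(std::uint16_t) == 4 * sizeof(float),
              "one float4 per 8 half values");

// Illustrative mirror of the alignment check guarding the vectorized kernel:
// all three buffers must start on a 16-byte boundary for float4 accesses.
inline bool CanUseVectorizedPath(const void* gradient, const void* feature,
                                 const void* backprop) {
  auto aligned16 = [](const void* p) {
    return reinterpret_cast<std::uintptr_t>(p) % 16 == 0;
  };
  return aligned16(gradient) && aligned16(feature) && aligned16(backprop);
}
```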
@@ -93,6 +94,64 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient,
   }
 }
 
+__global__ void ReluGradHalfKernelVector(
+    const Eigen::half* __restrict__ gradient,
+    const Eigen::half* __restrict__ feature,
+    Eigen::half* __restrict__ backprop, int32 count) {
+  int32 half8_count = count / VectorSize;
+  int32 index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (index < half8_count) {
+    float4 gradient_h8 = reinterpret_cast<const float4*>(gradient)[index];
+    float4 feature_h8 = reinterpret_cast<const float4*>(feature)[index];
+    float4* p_backprop_h8 = reinterpret_cast<float4*>(backprop) + index;
+
+    half2 *gradient_h2 = reinterpret_cast<half2*>(&gradient_h8);
+    half2 *feature_h2 = reinterpret_cast<half2*>(&feature_h8);
+    float4 backprop_h8;
+    half2* p_backprop_h2 = reinterpret_cast<half2*>(&backprop_h8);
+
+    // Fast path, when half2 primitives are available.
+#if __CUDA_ARCH__ >= 530
+    const half2 kZeroH2 = __float2half2_rn(0.f);
+#endif
+    for (int i = 0; i < VectorSize / 2; i++) {
+#if __CUDA_ARCH__ >= 530
+      // mask = (feature > 0)
+      half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2);
+      // backprop = mask * gradient
+      half2 backprop_h2 = __hmul2(mask_h2, gradient_h2[i]);
+#else
+      // Fall back: convert half2 to float2 for processing.
+      float2 feature_f2 = __half22float2(feature_h2[i]);
+      float2 gradient_f2 = __half22float2(gradient_h2[i]);
+      float2 backprop_f2 = make_float2((feature_f2.x > 0) ? gradient_f2.x : 0,
+                                       (feature_f2.y > 0) ? gradient_f2.y : 0);
+      // Convert back to half2.
+      half2 backprop_h2 = __float22half2_rn(backprop_f2);
+#endif
+      p_backprop_h2[i] = backprop_h2;
+    }
+    // Write back the result.
+    *p_backprop_h8 = backprop_h8;
+  }
+
+  int remaining_count = (count % VectorSize);
+
+  if (index < remaining_count) {
+    // Use first threads to process the remaining elements.
+    Eigen::half grad_h = gradient[half8_count * VectorSize + index];
+    Eigen::half feature_h = feature[half8_count * VectorSize + index];
+
+    float grad_f = static_cast<float>(grad_h);
+    float feature_f = static_cast<float>(feature_h);
+    float backprop_f = (feature_f > 0) ? grad_f : 0;
+
+    Eigen::half backprop_h(backprop_f);
+    backprop[half8_count * VectorSize + index] = backprop_h;
+  }
+}
+
 template struct ReluGrad {
   // Computes ReluGrad backprop.
@@ -108,15 +167,29 @@ struct ReluGrad {
   // NOTE: When the activation is exactly zero, we do not propagate the
   // associated gradient value. This allows the output of the Relu to be used,
   // as well as its input.
+ auto gradient_ptr = reinterpret_cast(gradient.data()); + auto feature_ptr = reinterpret_cast(feature.data()); + auto backprop_ptr = reinterpret_cast(backprop.data()); + bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 && + backprop_ptr % 16 == 0; int32 count = gradient.size(); - if (count == 0) return; - int32 half2_count = Eigen::divup(count, 2); constexpr int32 kThreadInBlock = 512; - GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( - half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); - TF_CHECK_OK(GpuLaunchKernel( - ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, - d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + if (count == 0) return; + if (aligned) { + int32 half8_count = Eigen::divup(count, VectorSize); + int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernelVector, kBlock, kThreadInBlock, + 0, d.stream(), gradient.data(), feature.data(), backprop.data(), + count)); + } else { + int32 half2_count = Eigen::divup(count, 2); + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, + d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + } } }; From 695acfc91defb5e55110abd64bcc6b7f6ac65e41 Mon Sep 17 00:00:00 2001 From: Steenu Johnson Date: Sat, 23 May 2020 08:09:25 +0530 Subject: [PATCH 0055/1390] Exclude col property in CSVDataset. Signed-off-by: Steenu Johnson --- .../data/experimental/csv_dataset_op.cc | 73 ++++++++++++++----- .../core/ops/experimental_dataset_ops.cc | 10 ++- .../kernel_tests/csv_dataset_test.py | 10 +++ .../python/data/experimental/ops/readers.py | 13 +++- 4 files changed, 85 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc index 8d1bd7acfd9..62d27294f04 100644 --- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc @@ -62,6 +62,11 @@ class CSVDatasetOp : public DatasetOpKernel { OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, errors::InvalidArgument("`select_cols` must be a vector.")); + const Tensor* exclude_cols_tensor; + OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); + OP_REQUIRES(ctx, exclude_cols_tensor->dims() == 1, + errors::InvalidArgument("`exclude_cols` must be a vector")); + int64 buffer_size = 0; OP_REQUIRES_OK( ctx, ParseScalarArgument(ctx, "buffer_size", &buffer_size)); @@ -126,11 +131,29 @@ class CSVDatasetOp : public DatasetOpKernel { ctx, select_cols.empty() || select_cols.front() >= 0, errors::InvalidArgument("select_cols should be non-negative indices")); - *output = new Dataset(ctx, std::move(filenames), header, - std::move(compression_type), zlib_compression_options, - output_types_, output_shapes_, - std::move(record_defaults), std::move(select_cols), - use_quote_delim, delim[0], std::move(na_value)); + std::vector exclude_cols; + exclude_cols.reserve(exclude_cols_tensor->NumElements()); + for (int i = 0; i < exclude_cols_tensor->NumElements(); ++i) { + exclude_cols.push_back(exclude_cols_tensor->flat()(i)); + } + OP_REQUIRES(ctx, select_cols.empty() || exclude_cols.empty(), + errors::InvalidArgument( + "Either select_cols or exlcude_cols should be empty")); + for (int i = 1; i < exclude_cols.size(); i++) { + 
OP_REQUIRES(ctx, exclude_cols[i - 1] < exclude_cols[i], + errors::InvalidArgument( + "exclude_cols should be strictly increasing indices")); + } + OP_REQUIRES( + ctx, exclude_cols.empty() || exclude_cols.front() >= 0, + errors::InvalidArgument("exclude_cols should be non-negative indices")); + + *output = + new Dataset(ctx, std::move(filenames), header, + std::move(compression_type), zlib_compression_options, + output_types_, output_shapes_, std::move(record_defaults), + std::move(select_cols), std::move(exclude_cols), + use_quote_delim, delim[0], std::move(na_value)); } private: @@ -141,7 +164,8 @@ class CSVDatasetOp : public DatasetOpKernel { const DataTypeVector& output_types, const std::vector& output_shapes, std::vector record_defaults, std::vector select_cols, - bool use_quote_delim, char delim, string na_value) + std::vector exclude_cols, bool use_quote_delim, char delim, + string na_value) : DatasetBase(DatasetContext(ctx)), filenames_(std::move(filenames)), header_(header), @@ -149,6 +173,7 @@ class CSVDatasetOp : public DatasetOpKernel { output_shapes_(output_shapes), record_defaults_(std::move(record_defaults)), select_cols_(std::move(select_cols)), + exclude_cols_(std::move(exclude_cols)), use_quote_delim_(use_quote_delim), delim_(delim), na_value_(std::move(na_value)), @@ -184,6 +209,7 @@ class CSVDatasetOp : public DatasetOpKernel { Node* use_quote_delim = nullptr; Node* na_value = nullptr; Node* select_cols = nullptr; + Node* exclude_cols = nullptr; std::vector record_defaults; record_defaults.reserve(record_defaults_.size()); @@ -204,16 +230,18 @@ class CSVDatasetOp : public DatasetOpKernel { TF_RETURN_IF_ERROR(b->AddScalar(use_quote_delim_, &use_quote_delim)); TF_RETURN_IF_ERROR(b->AddScalar(na_value_, &na_value)); TF_RETURN_IF_ERROR(b->AddVector(select_cols_, &select_cols)); + TF_RETURN_IF_ERROR(b->AddVector(exclude_cols_, &exclude_cols)); TF_RETURN_IF_ERROR(b->AddDataset( this, {std::make_pair(0, filenames), std::make_pair(1, compression_type), std::make_pair(2, buffer_size), std::make_pair(3, header), std::make_pair(4, delim), std::make_pair(5, use_quote_delim), - std::make_pair(6, na_value), - std::make_pair(7, select_cols)}, // Single tensor inputs - {std::make_pair(8, record_defaults)}, // Tensor list inputs - {}, output)); + std::make_pair(6, na_value), std::make_pair(7, select_cols), + std::make_pair(8, exclude_cols)}, // Single tensor inputs + {std::make_pair(9, record_defaults)}, // Tensor list inputs + {}, + output)); return Status::OK(); } @@ -227,12 +255,14 @@ class CSVDatasetOp : public DatasetOpKernel { std::vector* out_tensors, bool* end_of_sequence) override { mutex_lock l(mu_); - bool select_all = dataset()->select_cols_.empty(); + bool select_all = + dataset()->select_cols_.empty() && dataset()->exclude_cols_.empty(); do { // We are currently processing a file, so try to read the next record if (input_stream_) { - Status s = ReadRecord(ctx, out_tensors, select_all, - dataset()->select_cols_); + Status s = + ReadRecord(ctx, out_tensors, select_all, + dataset()->select_cols_, dataset()->exclude_cols_); if (s.ok()) { // Validate output if (out_tensors->size() != dataset()->out_type_.size()) { @@ -336,7 +366,8 @@ class CSVDatasetOp : public DatasetOpKernel { // Note: ctx and out_tensors are only used in this function // when fields are included in the record. 
Status ReadRecord(IteratorContext* ctx, std::vector* out_tensors, - bool select_all, const std::vector& selected) + bool select_all, const std::vector& selected, + const std::vector& excluded) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { if (pos_ >= buffer_.size()) { // At the end of the file, this will return errors::OutOfRange @@ -350,13 +381,17 @@ class CSVDatasetOp : public DatasetOpKernel { bool end_of_record = false; // Keep track of when we find \n, \r or EOF size_t num_parsed = 0; size_t num_selected_parsed = 0; + size_t num_excluded_parsed = 0; Status result; while (!end_of_record) { // Read till we reach \n, \r or EOF - bool include = - select_all || (num_selected_parsed < selected.size() && - selected[num_selected_parsed] == num_parsed); + bool exclude = num_excluded_parsed < excluded.size() && + excluded[num_excluded_parsed] == num_parsed; + bool include = select_all || + (num_selected_parsed < selected.size() && + selected[num_selected_parsed] == num_parsed) || + (!excluded.empty() && !exclude); // Don't fail fast, so that the next call to GetNext may still return // a valid record @@ -365,6 +400,7 @@ class CSVDatasetOp : public DatasetOpKernel { num_parsed++; if (include) num_selected_parsed++; + if (exclude) num_excluded_parsed++; } return result; @@ -815,7 +851,7 @@ class CSVDatasetOp : public DatasetOpKernel { // the first newline because it might contain quoted fields with // newlines in the header as well std::vector empty; - Status s = ReadRecord(nullptr, nullptr, false, empty); + Status s = ReadRecord(nullptr, nullptr, false, empty, empty); if (!s.ok()) { return errors::InvalidArgument("Can't read header of file"); } @@ -849,6 +885,7 @@ class CSVDatasetOp : public DatasetOpKernel { const std::vector output_shapes_; const std::vector record_defaults_; const std::vector select_cols_; + const std::vector exclude_cols_; const bool use_quote_delim_; const char delim_; const tstring na_value_; diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index aa4bd64270a..9910472a2c6 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -154,6 +154,7 @@ REGISTER_OP("CSVDataset") .Input("use_quote_delim: bool") .Input("na_value: string") .Input("select_cols: int64") + .Input("exclude_cols: int64") .Input("record_defaults: output_types") .Output("handle: variant") .Attr("output_types: list({float,double,int32,int64,string}) >= 1") @@ -174,8 +175,10 @@ REGISTER_OP("CSVDataset") TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // `select_cols` must be a vector TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &unused)); + //`exclude_cols` must be a vecotr + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 1, &unused)); // `record_defaults` must be lists of scalars - for (size_t i = 8; i < c->num_inputs(); ++i) { + for (size_t i = 9; i < c->num_inputs(); ++i) { shape_inference::ShapeHandle v; TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(i), 1, &v)); if (c->Rank(c->input(i)) == 1 && c->Value(c->Dim(v, 0)) > 1) { @@ -196,6 +199,7 @@ REGISTER_OP("ExperimentalCSVDataset") .Input("use_quote_delim: bool") .Input("na_value: string") .Input("select_cols: int64") + .Input("exclude_cols: int64") .Input("record_defaults: output_types") .Output("handle: variant") .Attr("output_types: list({float,double,int32,int64,string}) >= 1") @@ -216,8 +220,10 @@ REGISTER_OP("ExperimentalCSVDataset") TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // `select_cols` must be a vector 
TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &unused)); + // `exclude_cols` must be a vector + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 1, &unused)); // `record_defaults` must be lists of scalars - for (size_t i = 8; i < c->num_inputs(); ++i) { + for (size_t i = 9; i < c->num_inputs(); ++i) { shape_inference::ShapeHandle v; TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(i), 1, &v)); if (c->Rank(c->input(i)) == 1 && c->Value(c->Dim(v, 0)) > 1) { diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 13948305aea..42a96812bb4 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -414,6 +414,16 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, select_cols=[0]) + @combinations.generate(test_base.default_test_combinations()) + def testCsvDataset_withExcludeCol(self): + record_defaults = [['']] + inputs = [['1,2,3', '5,6,7']] + self._test_dataset( + inputs, + expected_output=[['1'], ['5']], + record_defaults=record_defaults, + exclude_cols=[1, 2]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleNewLines(self): # In this case, we expect it to behave differently from diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index b8f4c34f40e..fcf92f5aaf9 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -612,7 +612,8 @@ class CsvDatasetV2(dataset_ops.DatasetSource): field_delim=",", use_quote_delim=True, na_value="", - select_cols=None): + select_cols=None, + exclude_cols=None): """Creates a `CsvDataset` by reading and decoding CSV files. The elements of this dataset correspond to records from the file(s). @@ -679,6 +680,9 @@ class CsvDatasetV2(dataset_ops.DatasetSource): select_cols: (Optional.) A sorted list of column indices to select from the input data. If specified, only this subset of columns will be parsed. Defaults to parsing all columns. + exclude_cols:(Optional.) A sorted list of column indices to exclude from + the input data. If specified, only the complement of this set of column + will be parsed. Defaults to parsing all columns. 
""" self._filenames = ops.convert_to_tensor( filenames, dtype=dtypes.string, name="filenames") @@ -710,6 +714,12 @@ class CsvDatasetV2(dataset_ops.DatasetSource): argument_default=[], argument_dtype=dtypes.int64, ) + self._exclude_cols = convert.optional_param_to_tensor( + "exclude_cols", + exclude_cols, + argument_default=[], + argument_dtype=dtypes.int64, + ) self._element_spec = tuple( tensor_spec.TensorSpec([], d.dtype) for d in self._record_defaults) variant_tensor = gen_experimental_dataset_ops.csv_dataset( @@ -722,6 +732,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource): use_quote_delim=self._use_quote_delim, na_value=self._na_value, select_cols=self._select_cols, + exclude_cols=self._exclude_cols, compression_type=self._compression_type) super(CsvDatasetV2, self).__init__(variant_tensor) From fb190dcf851b96c3175417f253865d94f9eb4785 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 17:56:14 -0700 Subject: [PATCH 0056/1390] Update spec.yml --- tensorflow/tools/dockerfiles/spec.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index f68f627e1d0..9a7325c7441 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -50,7 +50,7 @@ releases: - "{ubuntu-devel-ppc64le}{jupyter}" - "{ubuntu-horovod}{jupyter}" - "{ubuntu-devel-horovod}{jupyter}" - - "{ubuntu-devel-aarch64}{jupyter}" + - "{ubuntu-devel-arm64v8}{jupyter}" slice_sets: @@ -111,9 +111,9 @@ slice_sets: - CHECKOUT_TF_SRC=1 ubuntu-devel-aarch64: - - add_to_name: "devel-aarch64" - dockerfile_exclusive_name: "devel-cpu-aarch64" - dockerfile_subdirectory: "aarch64" + - add_to_name: "devel-arm64v8" + dockerfile_exclusive_name: "devel-cpu-arm64v8" + dockerfile_subdirectory: "arm64v8" partials: - ubuntu/version - ubuntu/devel-cpu From 3dc3f491c4c53a87e6a40b4f9d00415d0c8634e8 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 17:57:47 -0700 Subject: [PATCH 0057/1390] Update spec.yml --- tensorflow/tools/dockerfiles/spec.yml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 9a7325c7441..6928cca8bce 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -50,6 +50,7 @@ releases: - "{ubuntu-devel-ppc64le}{jupyter}" - "{ubuntu-horovod}{jupyter}" - "{ubuntu-devel-horovod}{jupyter}" + - "{ubuntu-devel-arm32v7}{jupyter}" - "{ubuntu-devel-arm64v8}{jupyter}" slice_sets: @@ -110,7 +111,23 @@ slice_sets: args: - CHECKOUT_TF_SRC=1 - ubuntu-devel-aarch64: + ubuntu-devel-arm32v7: + - add_to_name: "devel-arm32v7" + dockerfile_exclusive_name: "devel-cpu-arm32v7" + dockerfile_subdirectory: "arm32v7" + partials: + - ubuntu/version + - ubuntu/devel-cpu + - ubuntu/python + - ubuntu/bazelbuild + - shell + tests: + - build-cpu.sh + args: + - UBUNTU_VERSION=18.04 + - CHECKOUT_TF_SRC=1 + + ubuntu-devel-arm64v8: - add_to_name: "devel-arm64v8" dockerfile_exclusive_name: "devel-cpu-arm64v8" dockerfile_subdirectory: "arm64v8" From 57032278576a43f654c45230ae055f5e0ea06cd6 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:00:52 -0700 Subject: [PATCH 0058/1390] Create bazelbuild-arm32v7.partial.Dockerfile --- .../bazelbuild-arm32v7.partial.Dockerfile | 35 +++++++++++++++++++ 1 file changed, 35 
insertions(+) create mode 100644 tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile new file mode 100644 index 00000000000..b833657aa69 --- /dev/null +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile @@ -0,0 +1,35 @@ +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + openjdk-8-jdk \ + python3-dev \ + virtualenv \ + swig + +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-h5py \ + python3-keras-preprocessing \ + python3-matplotlib \ + python3-mock \ + python3-numpy \ + python3-scipy \ + python3-sklearn \ + python3-pandas \ + python3-portpicker + +RUN python3 -m pip --no-cache-dir install \ + enum34 + +# Build and install bazel +ENV BAZEL_VERSION 3.0.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \ + unzip bazel-$BAZEL_VERSION-dist.zip && \ + bash ./compile.sh && \ + cp output/bazel /usr/local/bin/ && \ + rm -rf /bazel && \ + cd - From a1e2ea51681b06086d2ce041fc976951e3faa29f Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:01:39 -0700 Subject: [PATCH 0059/1390] Create bazelbuild-arm64v8.partial.Dockerfile --- .../bazelbuild-arm64v8.partial.Dockerfile | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile new file mode 100644 index 00000000000..b833657aa69 --- /dev/null +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile @@ -0,0 +1,35 @@ +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + openjdk-8-jdk \ + python3-dev \ + virtualenv \ + swig + +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-h5py \ + python3-keras-preprocessing \ + python3-matplotlib \ + python3-mock \ + python3-numpy \ + python3-scipy \ + python3-sklearn \ + python3-pandas \ + python3-portpicker + +RUN python3 -m pip --no-cache-dir install \ + enum34 + +# Build and install bazel +ENV BAZEL_VERSION 3.0.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \ + unzip bazel-$BAZEL_VERSION-dist.zip && \ + bash ./compile.sh && \ + cp output/bazel /usr/local/bin/ && \ + rm -rf /bazel && \ + cd - From 40094b18f499d59bfcd21fde380ae6abb24a959e Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:02:49 -0700 Subject: [PATCH 0060/1390] Update spec.yml --- tensorflow/tools/dockerfiles/spec.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 6928cca8bce..2556e740e1d 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -119,7 +119,7 @@ slice_sets: - ubuntu/version - ubuntu/devel-cpu - ubuntu/python - - ubuntu/bazelbuild + - ubuntu/bazelbuild-arm32v7 - shell tests: - build-cpu.sh 
@@ -135,7 +135,7 @@ slice_sets: - ubuntu/version - ubuntu/devel-cpu - ubuntu/python - - ubuntu/bazelbuild + - ubuntu/bazelbuild-arm64v8 - shell tests: - build-cpu.sh From e71c33a671f38c77f48cd30e89972ef47aa56adb Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:08:37 -0700 Subject: [PATCH 0061/1390] Update bazelbuild.partial.Dockerfile --- .../ubuntu/bazelbuild.partial.Dockerfile | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile index b833657aa69..628e56890a8 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile @@ -7,19 +7,17 @@ RUN apt-get update && apt-get install -y \ virtualenv \ swig -RUN apt-get update && apt-get install -y \ - python3-pil \ - python3-h5py \ - python3-keras-preprocessing \ - python3-matplotlib \ - python3-mock \ - python3-numpy \ - python3-scipy \ - python3-sklearn \ - python3-pandas \ - python3-portpicker - RUN python3 -m pip --no-cache-dir install \ + Pillow \ + h5py \ + keras_preprocessing \ + matplotlib \ + mock \ + numpy \ + scipy \ + sklearn \ + pandas \ + portpicker \ enum34 # Build and install bazel From d3775b48823caeff6f9393bfe3e3d6ddca670d84 Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:14:00 -0700 Subject: [PATCH 0062/1390] Delete bazelbuild-arm32v7.partial.Dockerfile --- .../bazelbuild-arm32v7.partial.Dockerfile | 35 ------------------- 1 file changed, 35 deletions(-) delete mode 100644 tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile deleted file mode 100644 index b833657aa69..00000000000 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm32v7.partial.Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ - git \ - openjdk-8-jdk \ - python3-dev \ - virtualenv \ - swig - -RUN apt-get update && apt-get install -y \ - python3-pil \ - python3-h5py \ - python3-keras-preprocessing \ - python3-matplotlib \ - python3-mock \ - python3-numpy \ - python3-scipy \ - python3-sklearn \ - python3-pandas \ - python3-portpicker - -RUN python3 -m pip --no-cache-dir install \ - enum34 - -# Build and install bazel -ENV BAZEL_VERSION 3.0.0 -WORKDIR / -RUN mkdir /bazel && \ - cd /bazel && \ - curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \ - unzip bazel-$BAZEL_VERSION-dist.zip && \ - bash ./compile.sh && \ - cp output/bazel /usr/local/bin/ && \ - rm -rf /bazel && \ - cd - From 8882e4b89a707d68ab6f71704d388a43ae7de76f Mon Sep 17 00:00:00 2001 From: settle <31239886+settle@users.noreply.github.com> Date: Tue, 26 May 2020 18:14:41 -0700 Subject: [PATCH 0063/1390] Update spec.yml --- tensorflow/tools/dockerfiles/spec.yml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 2556e740e1d..ea05d77d001 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -50,7 +50,6 @@ 
releases: - "{ubuntu-devel-ppc64le}{jupyter}" - "{ubuntu-horovod}{jupyter}" - "{ubuntu-devel-horovod}{jupyter}" - - "{ubuntu-devel-arm32v7}{jupyter}" - "{ubuntu-devel-arm64v8}{jupyter}" slice_sets: @@ -111,22 +110,6 @@ slice_sets: args: - CHECKOUT_TF_SRC=1 - ubuntu-devel-arm32v7: - - add_to_name: "devel-arm32v7" - dockerfile_exclusive_name: "devel-cpu-arm32v7" - dockerfile_subdirectory: "arm32v7" - partials: - - ubuntu/version - - ubuntu/devel-cpu - - ubuntu/python - - ubuntu/bazelbuild-arm32v7 - - shell - tests: - - build-cpu.sh - args: - - UBUNTU_VERSION=18.04 - - CHECKOUT_TF_SRC=1 - ubuntu-devel-arm64v8: - add_to_name: "devel-arm64v8" dockerfile_exclusive_name: "devel-cpu-arm64v8" From b4062c71c659bde06e5e15c53424a6853aa1ee3c Mon Sep 17 00:00:00 2001 From: rangjiaheng Date: Wed, 27 May 2020 13:27:18 +0800 Subject: [PATCH 0064/1390] add TF_LOCKS_EXCLUDED to MutableDenseHashTable::MemoryUsed --- tensorflow/core/kernels/lookup_table_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc index 9807247ed4f..f269aa65b4e 100644 --- a/tensorflow/core/kernels/lookup_table_op.cc +++ b/tensorflow/core/kernels/lookup_table_op.cc @@ -557,7 +557,7 @@ class MutableDenseHashTable final : public LookupInterface { TensorShape value_shape() const override { return value_shape_; } - int64 MemoryUsed() const override { + int64 MemoryUsed() const override TF_LOCKS_EXCLUDED(mu_) { tf_shared_lock l(mu_); return sizeof(MutableDenseHashTable) + key_buckets_.AllocatedBytes() + value_buckets_.AllocatedBytes() + empty_key_.AllocatedBytes(); From 18aa35c75fcf64dfd99e2a3e7cdcd62bafbc030f Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Wed, 27 May 2020 19:18:06 +0200 Subject: [PATCH 0065/1390] Fixed review comments 27/5 --- .../micro/memory_planner/greedy_memory_planner.cc | 2 +- tensorflow/lite/micro/micro_allocator.cc | 13 +++++++------ tensorflow/lite/micro/micro_optional_debug_tools.cc | 3 +-- tensorflow/lite/micro/test_helpers.cc | 4 +--- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc index 47bb7cfb8c0..8f21a167f67 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc @@ -292,7 +292,7 @@ size_t GreedyMemoryPlanner::GetMaximumMemorySize() { if (buffer_count_ == 0) { return 0; } - ListEntry* entry = &buffers_sorted_by_offset_[0]; + ListEntry* entry = &buffers_sorted_by_offset_[first_entry_index_]; size_t max_size = 0; while (entry) { BufferRequirements* requirements = diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 9c4de6e3035..870e466b4e4 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -91,7 +91,7 @@ TfLiteStatus AllocateVariables( TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, ErrorReporter* error_reporter) { if (model->metadata()) { - for (int i = 0; i < model->metadata()->size(); ++i) { + for (size_t i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata, strlen(kOfflineMemAllocMetadata)) == 0) { @@ -115,11 +115,11 @@ TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, "Offline planner metadata found, version %d, " "subgraph %d, nbr offline offsets 
%d", version, subgraph_idx, nbr_offline_offsets); - for (int i = 0; i < nbr_offline_offsets; ++i) { + for (int j = 0; j < nbr_offline_offsets; ++j) { TF_LITE_REPORT_ERROR( error_reporter, - "Offline planner tensor index %d, offline offset: %d", i, - offline_planner_offsets[i]); + "Offline planner tensor index %d, offline offset: %d", j, + offline_planner_offsets[j]); } if (version != 1) { @@ -302,7 +302,7 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( const Model* model, int32_t** offline_planner_offsets) { if (model->metadata()) { - for (int i = 0; i < model->metadata()->size(); ++i) { + for (size_t i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata, strlen(kOfflineMemAllocMetadata)) == 0) { @@ -311,7 +311,7 @@ TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( auto* buffer = (*buffers)[metadata->buffer()]; auto* array = buffer->data(); const uint32_t* metadata_buffer = (uint32_t*)array->data(); - const int32_t nbr_tensors = metadata_buffer[2]; + const size_t nbr_tensors = (size_t)metadata_buffer[2]; *offline_planner_offsets = (int32_t*)&metadata_buffer[3]; if (tensor_count_ != nbr_tensors) { @@ -339,6 +339,7 @@ TfLiteStatus AllocationInfoBuilder::AddScratchBuffers( current->first_created = handle->node_idx; current->last_used = handle->node_idx; current->needs_allocating = true; + current->offline_offset = kOnlinePlannedBuffer; } return kTfLiteOk; } diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 418347a5b25..22b170094d5 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -122,9 +122,8 @@ void PrintModelData(const Model* model, ErrorReporter* error_reporter) { const flatbuffers::Vector>* buffers = model->buffers(); TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: ====="); - for (int i = 0; i < tensors->size(); ++i) { + for (size_t i = 0; i < tensors->size(); ++i) { const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i); - auto* quantization = flatbuffer_tensor.quantization(); size_t type_size, tensor_size; auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]; auto* array = buffer->data(); diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index b60d3065020..96e000b1b6d 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -308,10 +308,8 @@ const Model* BuildModelWithOfflinePlanning(int number_of_tensors, model_builder.RegisterOp(BuiltinOperator_CUSTOM, "mock_custom", /* version= */ 0); - int tensors[number_of_tensors]; - for (int i = 0; i < number_of_tensors; ++i) { - tensors[i] = model_builder.AddTensor(TensorType_FLOAT32, {2, 2, 3}); + model_builder.AddTensor(TensorType_FLOAT32, {2, 2, 3}); } for (int i = 0; i < num_conns; ++i) { From 0c7160babc2213269798ea64a292801123edd0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 21 Apr 2020 15:25:11 +0200 Subject: [PATCH 0066/1390] TFLu: Add MVE flag to cmsis-nn glue for clarity __ARM_FEATURE_MVE is now autodetected so ARM_MATH_MVEI is no longer needed. 
--- tensorflow/lite/micro/kernels/cmsis-nn/conv.cc | 4 ++-- tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc | 6 +++--- tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc | 7 ++++--- tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc | 4 ++-- tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc | 5 ----- 5 files changed, 11 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc index 6e8272b221a..c505670d747 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc @@ -115,7 +115,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) OpData data; int32_t buf_size = 0; @@ -240,7 +240,7 @@ TfLiteStatus EvalQuantizedPerChannel( quant_params.multiplier = data->per_channel_output_multiplier; quant_params.shift = data->per_channel_output_shift; -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) RuntimeShape filter_shape = GetTensorShape(filter); RuntimeShape input_shape = GetTensorShape(input); RuntimeShape output_shape = GetTensorShape(output); diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc b/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc index 7ba03d3890d..f18c3170174 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc @@ -104,7 +104,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) auto* params = reinterpret_cast(node->builtin_data); @@ -186,7 +186,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, op_params.quantized_activation_min = std::numeric_limits::min(); op_params.quantized_activation_max = std::numeric_limits::max(); -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) RuntimeShape filter_shape = GetTensorShape(filter); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); @@ -284,7 +284,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
op_params.output_shift = -data->output_shift; -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) // optimizations utilize loop unrolling which requires the following power // of two kernel dimensions RuntimeShape filter_shape = GetTensorShape(filter); diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc index 78787ea2547..0ab32ecc3c3 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc @@ -84,11 +84,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); -#if defined(__ARM_FEATURE_DSP) + +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) RuntimeShape filter_shape = GetTensorShape(filter); const int filter_dim_count = filter_shape.DimensionsCount(); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); - const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(accum_depth); int* buffer_idx = reinterpret_cast(node->user_data); @@ -101,6 +101,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { *buffer_idx = -1; } #endif + return kTfLiteOk; } @@ -116,7 +117,7 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, const int filter_dim_count = filter_shape.DimensionsCount(); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) int16_t* buf = nullptr; auto* buffer_idx = reinterpret_cast(node->user_data); diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc index a12f628e721..8447ad041cc 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc @@ -106,7 +106,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, TFLITE_DCHECK_LE(activation_min, activation_max); -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) RuntimeShape input_shape = GetTensorShape(input); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -283,7 +283,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { -#if defined(__ARM_FEATURE_DSP) +#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* output = GetOutput(context, node, kOutputTensor); diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc index 657d4fa87cf..cfd87089a84 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc @@ -8,11 +8,6 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) THIRD_PARTY_DOWNLOADS += \ $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,)) - ifneq (,$(filter $(TARGET_ARCH), cortex-m55)) - CCFLAGS += -DARM_MATH_MVEI - CXXFLAGS += -DARM_MATH_MVEI - endif - CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ # Include CMSIS-NN files From 1faa442c02c6bb479b958a46eb2f858c1985966a Mon Sep 17 00:00:00 2001 From: storypku Date: Fri, 29 May 2020 23:00:06 +0800 Subject: [PATCH 0067/1390] Make 
third_party/gpus/... build pass. fix issue #39759 --- third_party/gpus/cuda/BUILD | 10 +--------- third_party/gpus/rocm/BUILD | 7 +------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/third_party/gpus/cuda/BUILD b/third_party/gpus/cuda/BUILD index 413d28a2723..9c988668ebc 100644 --- a/third_party/gpus/cuda/BUILD +++ b/third_party/gpus/cuda/BUILD @@ -1,9 +1 @@ -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) +# Dummy BUILD for Bazel diff --git a/third_party/gpus/rocm/BUILD b/third_party/gpus/rocm/BUILD index bc2dd419259..dd64bfb0f36 100644 --- a/third_party/gpus/rocm/BUILD +++ b/third_party/gpus/rocm/BUILD @@ -1,6 +1 @@ -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], -) +# placeholder for bazel From cbf60b5223997eceb6c4221ef4868fd6c792622c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 12:38:44 +0100 Subject: [PATCH 0068/1390] Addressed reviewer's comments. Change-Id: I3b7842c42b8c905ed44e0cd556134210cb45479c --- tensorflow/lite/python/lite.py | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 92c0d5a95d9..010952820b9 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -258,7 +258,7 @@ class QuantizationMode(object): if self.training_time_int8_allow_float(): return { - "inference_type": inference_ty if inference_ty else constants.INT8, + "inference_type": inference_ty if inference_ty else self.activations_type(), "inference_input_type": inference_input_ty if inference_input_ty else constants.FLOAT, "post_training_quantize": False, # disable dynamic range quantization @@ -297,12 +297,28 @@ class QuantizationMode(object): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, + "activations_type": constants.INT8, "allow_float": False } elif self.post_training_int8_allow_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, + "activations_type": constants.INT8, + "allow_float": True + } + elif self.post_training_int16x8_no_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "activations_type": constants.INT16, + "allow_float": False + } + elif self.post_training_int16x8_allow_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "activations_type": constants.INT16, "allow_float": True } else: @@ -573,25 +589,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): output_tensors=output_tensors, **converter_kwargs) - activations_type = quant_mode.activations_type() - - if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - False) - elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - True) - elif quant_mode.post_training_int16x8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - False) - elif quant_mode.post_training_int16x8_allow_float(): - result = 
self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - True) - calibrate_and_quantize, flags = quant_mode.quantizer_flags() if calibrate_and_quantize: result = self._calibrate_quantize_model(result, **flags) From 67ea57b15bb223e72a60265c24082ae5a31d0f0e Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 15:57:18 +0100 Subject: [PATCH 0069/1390] Small fix for inference. Change-Id: Ifd8670ccb9604ecced3d013f529ddbe16fcd75cf --- tensorflow/lite/python/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/util.py b/tensorflow/lite/python/util.py index 32a2d596629..b56b9f49b7a 100644 --- a/tensorflow/lite/python/util.py +++ b/tensorflow/lite/python/util.py @@ -49,6 +49,7 @@ _MAP_TF_TO_TFLITE_TYPES = { dtypes.string: _types_pb2.STRING, dtypes.uint8: _types_pb2.QUANTIZED_UINT8, dtypes.int8: _types_pb2.INT8, + dtypes.int16: _types_pb2.QUANTIZED_INT16, dtypes.complex64: _types_pb2.COMPLEX64, dtypes.bool: _types_pb2.BOOL, } From 29fdee8e85e750d04f6e9d378e85443ba5c7a239 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 17:50:28 +0100 Subject: [PATCH 0070/1390] Fix for error_reporter. Change-Id: I58745cc97872af74b1ad5b0af3ad778b39f01555 --- .../lite/tools/optimize/quantization_utils.cc | 3 ++- tensorflow/lite/tools/optimize/quantize_model.cc | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index abbcb642287..cdf2743585e 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -113,7 +113,8 @@ TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, tensor->quantization->max[0], quantized_range, quantization_params); } else { - error_reporter->Report( + TF_LITE_REPORT_ERROR( + error_reporter, "Unsupported activation type for quantize-activation: %s", activations_type); return kTfLiteError; diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 6dd8ddd2d8c..0cf69eee3b4 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -370,9 +370,9 @@ TfLiteStatus ApplyConstraints(ModelT* model, std::unique_ptr additional_tensor; const string requant_tensor_name = input_tensor->name + "_requantized"; utils::MakeTensorWithQuantParam( - requant_tensor_name, input_tensor->shape, - input_tensor->shape_signature, activations_type, - output_scale, output_zp, &additional_tensor); + requant_tensor_name, input_tensor->shape, + input_tensor->shape_signature, activations_type, output_scale, + output_zp, &additional_tensor); const int32_t additional_tensor_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(additional_tensor)); @@ -869,13 +869,15 @@ TfLiteStatus QuantizeWeightsInputOutput( if (activations_type == TensorType_INT16 && !property.quantizable && !allow_float) { - error_reporter->Report( - "Quantization to 16x8-bit not yet supported for op: %s", + TF_LITE_REPORT_ERROR( + error_reporter, + "Quantization to 16x8-bit not yet supported for op: %", EnumNameBuiltinOperator(op_code)); return kTfLiteError; } else if (!property.quantizable && !allow_float) { - error_reporter->Report("Quantization not yet supported for op: %s", - EnumNameBuiltinOperator(op_code)); + TF_LITE_REPORT_ERROR(error_reporter, + "Quantization not yet supported for op: 
%", + EnumNameBuiltinOperator(op_code)); return kTfLiteError; } From 761d850ac6456aed93ab250ff49af3f0a6a62960 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 17:56:39 +0100 Subject: [PATCH 0071/1390] Renamed option with the prefix EXPERIMENTAL_. Change-Id: Idb84736507d5c07ebdf182b8a15d55906d0d7fc0 --- tensorflow/lite/python/convert.py | 2 +- tensorflow/lite/python/lite.py | 2 +- tensorflow/lite/python/lite_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index c30987a5898..939de61c608 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -98,7 +98,7 @@ class OpsSet(enum.Enum): # and int16 activations. # Specifying this will throw an error for operations that do not yet have # quantized implementations. - TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): return self.value diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 010952820b9..781007241b4 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -356,7 +356,7 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 ])) def _is_allow_float(self): diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 4075a887943..1d052b88c10 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -885,7 +885,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): # Quantize model to Int8: with disable mlir ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir - ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From ab6b2ffde37bf4443c7dadc39312f0429f417db6 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 11:40:22 -0500 Subject: [PATCH 0072/1390] Removed TENSOR_OP disable env vars. 
* TF_DISABLE_CUBLAS_TENSOR_OP_MATH * TF_DISABLE_CUDNN_TENSOR_OP_MATH * TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH --- tensorflow/stream_executor/cuda/cuda_blas.cc | 21 ++------ tensorflow/stream_executor/cuda/cuda_dnn.cc | 55 +++++--------------- 2 files changed, 16 insertions(+), 60 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index c9f0fc462c9..65c07e72154 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -101,18 +101,6 @@ static std::string ToString(cublasStatus_t status) { } } -// Decide whether to enable TENSOR_OP_MATH -static bool TensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - // cuBLAS has interfaces that permit pointers to be passed from either the host // memory space or the device memory space; however, you must instruct it as to // which address space those pointers are in with cublasSetPointerMode. @@ -1640,7 +1628,7 @@ bool CUDABlas::DoBlasGemm( &cc_minor); // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7 && TensorOpMathEnabled()) { + if (cc_major >= 7) { use_tensor_ops = true; } #endif @@ -1921,8 +1909,7 @@ static bool TensorOpsAvailable(int cc_major) { // strictly correct. We can't simply enable it, though, as that would change // clients' behavior significantly: Using tensor ops on fp32 inputs cause them // to be rounded to fp16. - if (cc_major >= 7 && TensorOpMathEnabled() && - std::is_same::value) { + if (cc_major >= 7 && std::is_same::value) { return true; } #endif @@ -2270,7 +2257,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F; + bool use_tensor_ops = data_type == CUDA_R_16F; cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2425,7 +2412,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor)) { // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7 && TensorOpMathEnabled()) { + if (cc_major >= 7) { use_tensor_ops = true; } #if CUDA_VERSION >= 9010 diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 6122877f91f..780f1475c2c 100755 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -603,31 +603,6 @@ class CudnnFilterDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor); }; -// A helper function to decide whether to enable the TENSOR_OP_MATH math type -bool TensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled = false; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - -// A helper function to decide whether to enable the TENSOR_OP_MATH math type -// for RNNs. 
-bool RnnTensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled = false; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - // A helper function to decide whether to use // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT @@ -751,9 +726,7 @@ class CudnnConvolutionDescriptor { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); - if (TensorOpMathEnabled()) { - CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); - } + CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); #endif } @@ -1157,21 +1130,19 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - if (RnnTensorOpMathEnabled()) { - cudnnMathType_t math_type; - if (algorithm_config.algorithm().has_value()) { - math_type = algorithm_config.algorithm()->tensor_ops_enabled() - ? CUDNN_TENSOR_OP_MATH - : CUDNN_DEFAULT_MATH; - } else { + cudnnMathType_t math_type; + if (algorithm_config.algorithm().has_value()) { + math_type = algorithm_config.algorithm()->tensor_ops_enabled() + ? CUDNN_TENSOR_OP_MATH + : CUDNN_DEFAULT_MATH; + } else { #if CUDNN_VERSION >= 7201 - math_type = CUDNN_TENSOR_OP_MATH; + math_type = CUDNN_TENSOR_OP_MATH; #else - math_type = CUDNN_DEFAULT_MATH; + math_type = CUDNN_DEFAULT_MATH; #endif // CUDNN_VERSION >= 7201 - } - CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); } + CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), @@ -2605,7 +2576,7 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( } static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled(); + return cc_major >= 7 && CUDNN_VERSION >= 7000; } port::StatusOr GetCudnnConvolutionForwardAlgorithm( @@ -3399,9 +3370,7 @@ bool CudnnSupport::GetRnnAlgorithms( for (auto i : algo_types) { out_algorithms->push_back({i, /*use_tensor_ops=*/false}); #if CUDNN_VERSION >= 7100 - if (RnnTensorOpMathEnabled()) { - out_algorithms->push_back({i, /*use_tensor_ops=*/true}); - } + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); #endif } return true; From 3cad62e356b8c72d03af13ba29f7ace29a6f0772 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 11:46:41 -0500 Subject: [PATCH 0073/1390] Add global setting control TF32 execution --- tensorflow/core/platform/BUILD | 7 +++++++ tensorflow/core/platform/tf32_utils.cc | 27 ++++++++++++++++++++++++++ tensorflow/core/platform/tf32_utils.h | 27 ++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 tensorflow/core/platform/tf32_utils.cc create mode 100644 tensorflow/core/platform/tf32_utils.h diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index c7ff378d2ac..f27d2f09208 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -937,6 +937,13 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tf32_utils", + srcs = ["tf32_utils.cc"], + hdrs = ["tf32_utils.h"], + copts = tf_copts(), +) + 
tf_cc_tests( name = "low_level_library_tests", size = "small", diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc new file mode 100644 index 00000000000..715b5996dc3 --- /dev/null +++ b/tensorflow/core/platform/tf32_utils.cc @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/tf32_utils.h" + +namespace tensorflow { + +// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing. +static bool tf32_enabled = false; + +void allow_tf32_execution(bool allow) { tf32_enabled = allow; } + +bool tf32_execution_allowed() { return tf32_enabled; } + +} // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h new file mode 100644 index 00000000000..a0ce58f9bbd --- /dev/null +++ b/tensorflow/core/platform/tf32_utils.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ + +namespace tensorflow { + +void allow_tf32_execution(bool allow); + +bool tf32_execution_allowed(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ From 8bfee17f5880eccdb759fb47ab11b782f201cf0f Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 13:33:02 -0500 Subject: [PATCH 0074/1390] Python tf.config tf32 interface --- tensorflow/python/BUILD | 11 +++++++++++ tensorflow/python/framework/config.py | 26 ++++++++++++++++++++++++++ tensorflow/python/util/tf32.cc | 22 ++++++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 tensorflow/python/util/tf32.cc diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a49e4b74def..997ec6c924f 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -746,6 +746,16 @@ tf_python_pybind_extension( ], ) +tf_python_pybind_extension( + name = "_pywrap_tf32_execution", + srcs = ["util/tf32.cc"], + module_name = "_pywrap_tf32_execution", + deps = [ + "//tensorflow/core/platform:tf32_utils", + "@pybind11", + ], +) + tf_python_pybind_extension( name = "_pywrap_util_port", srcs = ["util/port_wrapper.cc"], @@ -5573,6 +5583,7 @@ py_library( "//tensorflow:composite_tensor_whitelist", ], deps = [ + ":_pywrap_tf32_execution", ":tf_decorator", ":tf_export", ":tf_stack", diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 5361d7290e8..042af4d1023 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -18,10 +18,36 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python import _pywrap_tf32_execution from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export +def tensor_float32_execution_allowed(): + """Get if TensorFloat-32 operations are enabled on supported hardware. + + Returns: + True if TensorFloat-32 execution is enabled and False otherwise. + """ + return _pywrap_tf32_execution.is_allowed() + +def allow_tensor_float_32_execution(allow): + """Allow use of TensorFloat-32 with float32 ops on supported hardware. + + TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. + TensorFloat-32 kernels take float32 inputs and produce float32 outputs. + Internally, the inputs are cast to a custom representation with 10-bit + mantissa (similar to float16) and 8-bit exponent (similar to float32) and are + executed using TensorCores with float32 accumulation. For more information, + see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/. + + TensorFloat-32 execution is disabled by default, but this may change in a + future version. + + Args: + allow: whether to allow TensorFloat-32 execution + """ + _pywrap_tf32_execution.allow(allow) @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc new file mode 100644 index 00000000000..7dece6ccdae --- /dev/null +++ b/tensorflow/python/util/tf32.cc @@ -0,0 +1,22 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "tensorflow/core/platform/tf32_utils.h" + +PYBIND11_MODULE(_pywrap_tf32_execution, m) { + m.def("allow", &tensorflow::allow_tf32_execution); + m.def("is_allowed", &tensorflow::tf32_execution_allowed); +} From dedb51aec2a766bdeb8b4c2ab1700bfcf7687966 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 19 May 2020 14:58:30 -0500 Subject: [PATCH 0075/1390] Convolution TF32 Plumbing --- tensorflow/stream_executor/cuda/BUILD | 1 + tensorflow/stream_executor/cuda/cuda_dnn.cc | 200 +++++++++++++------- 2 files changed, 135 insertions(+), 66 deletions(-) mode change 100755 => 100644 tensorflow/stream_executor/cuda/cuda_dnn.cc diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 1457a36beaf..2749281335e 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -353,6 +353,7 @@ cc_library( "@local_config_cuda//cuda:cudnn_header", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor:dnn", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:plugin_registry", diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc old mode 100755 new mode 100644 index 780f1475c2c..53296f4eea5 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -20,8 +20,8 @@ limitations under the License. #include #include "absl/strings/str_cat.h" -#include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -42,6 +42,7 @@ limitations under the License. #include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "third_party/eigen3/Eigen/Core" // clang-format off #include "third_party/gpus/cudnn/cudnn.h" #include "absl/strings/string_view.h" @@ -707,10 +708,6 @@ class CudnnConvolutionDescriptor { : CUDNN_CROSS_CORRELATION, data_type)); - // NOTE(benbarsdell): This only applies if tensor op math is enabled - // and algo selection is set to Default. - this->set_use_tensor_op_math(true); - #if CUDNN_MAJOR >= 7 VLOG(2) << "Requesting grouped convolution: " << convolution_descriptor.group_count(); @@ -722,10 +719,14 @@ class CudnnConvolutionDescriptor { #endif } - void set_use_tensor_op_math(bool use_tensor_op_math) const { + void set_use_tensor_op_math(bool use_tensor_op_math) { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = +#if CUDNN_VERSION >= 8000 + (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH); +#else (use_tensor_op_math ? 
CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); +#endif CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); #endif } @@ -738,6 +739,38 @@ class CudnnConvolutionDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor); }; +// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math +// set +static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { + cudnnMathType_t math_type; + CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); +#if CUDNN_VERSION >= 8000 + return math_type != CUDNN_FMA_MATH; +#else + return math_type == CUDNN_TENSOR_OP_MATH; +#endif +} + +static bool TensorOpMathAvailable(int cc_major) { + return cc_major >= 7 && CUDNN_VERSION >= 7000; +} + +static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) { + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + if (!TensorOpMathAvailable(cc_major)) { + return false; + } + if (input_type == dnn::DataType::kFloat) { + if (CUDNN_VERSION < 8000) { + return false; + } else if (!tensorflow::tf32_execution_allowed()) { + return false; + } + } + return true; +} + // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle // within a scope. class CudnnPoolingDescriptor { @@ -2450,10 +2483,11 @@ port::StatusOr> AllocateCudnnConvolutionForwardWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2493,10 +2527,11 @@ AllocateCudnnConvolutionBackwardDataWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2538,10 +2573,11 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. 
size_t size_in_bytes; @@ -2575,18 +2611,39 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( return scratch_allocator->AllocateBytes(size_in_bytes); } -static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000; +port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, + absl::optional desc) { + bool use_tensor_ops; + if (desc.has_value()) { + use_tensor_ops = desc->tensor_ops_enabled(); + if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) { + return port::Status(port::error::INVALID_ARGUMENT, + "Algo requests disallowed tensor op evaluation."); + } + } else { + use_tensor_ops = IsTensorMathAllowed(stream, type); + } + return use_tensor_ops; } port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. @@ -2599,10 +2656,7 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( GetCudnnConvolutionForwardAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace( @@ -2626,6 +2680,9 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( "Returned status: ", scratch_or.status().ToString())); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2636,10 +2693,19 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
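Taken together, `TensorOpMathAvailable`, `IsTensorMathAllowed` and `UseTensorOps` above reduce to a small decision rule: tensor ops need an SM 7.0+ GPU and cuDNN 7+, and float32 inputs additionally need cuDNN 8+ plus the global TF32 switch. A condensed, standalone restatement of that rule for reference — the function name is illustrative, not part of the patch:

```c++
// Standalone restatement of the gating logic introduced above.
bool SketchTensorMathAllowed(int cc_major, int cudnn_version, bool is_float32,
                             bool tf32_allowed) {
  // Pre-Volta GPUs and pre-7.0 cuDNN never get tensor ops.
  if (cc_major < 7 || cudnn_version < 7000) return false;
  // float32 additionally requires cuDNN 8 and the process-wide TF32 opt-in;
  // half precision is unaffected by the TF32 switch.
  if (is_float32 && (cudnn_version < 8000 || !tf32_allowed)) return false;
  return true;
}
```

When an algorithm descriptor explicitly requests tensor ops that this rule disallows, `UseTensorOps` surfaces it as an INVALID_ARGUMENT error rather than silently downgrading the math mode.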
@@ -2652,10 +2718,7 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( GetCudnnConvolutionBackwardDataAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace( @@ -2678,6 +2741,9 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( "while a secondary algorithm is not provided."); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2688,10 +2754,19 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
@@ -2704,10 +2779,7 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( GetCudnnConvolutionBackwardFilterAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace( @@ -2730,6 +2802,9 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( "while a secondary algorithm is not provided."); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2894,35 +2969,32 @@ port::Status CudnnSupport::DoPrepareForConvolution( CudnnTensorDescriptor output_nd( output_descriptor, ToCudnnDataType(element_type, output_descriptor.layout())); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); auto cudnn = cudnn_->GetHandle(parent_, stream); switch (kind) { case dnn::ConvolutionKind::FORWARD: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionForwardAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_DATA: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionBackwardDataAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionBackwardDataAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_FILTER: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionBackwardFilterAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionBackwardFilterAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } default: @@ -2951,8 +3023,9 @@ port::Status CudnnSupport::DoConvolve( auto accumulator_type = GetConvAccumulatorType(element_type); CudnnConvolutionDescriptor conv(convolution_descriptor, ToCudnnDataType(accumulator_type)); - // Set use_tensor_math param to correct value - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + SE_ASSIGN_OR_RETURN(bool use_tensor_ops, + UseTensorOps(stream, element_type, algorithm_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); auto cudnn = cudnn_->GetHandle(parent_, stream); // Alpha is the scaling factor for input. 
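A consequence of threading `element_type` and the raw `ConvolutionDescriptor` through the three algorithm getters is that algorithm selection no longer mutates a shared cuDNN descriptor as a side effect; each path builds its own descriptor, and the workspace-allocation helpers only verify that the descriptor's math mode matches the chosen algorithm. A toy sketch of that invariant check, with hypothetical names and a plain exception standing in for the `port::Status` the real code returns:

```c++
#include <stdexcept>

// Toy version of the check added to the workspace-allocation helpers: the
// caller must have configured tensor-op math consistently with the algorithm
// it selected; the helper no longer fixes it up silently.
void CheckDescriptorMatchesAlgorithm(bool descriptor_tensor_ops,
                                     bool algorithm_tensor_ops) {
  if (descriptor_tensor_ops != algorithm_tensor_ops) {
    throw std::runtime_error(
        "Mismatch between cudnn conv and algorithm descriptors.");
  }
}
```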
@@ -3185,14 +3258,6 @@ port::Status CudnnSupport::DoConvolve( return port::Status::OK(); } -// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math -// set -static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { - cudnnMathType_t math_type; - CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); - return math_type == CUDNN_TENSOR_OP_MATH; -} - template port::Status CudnnSupport::DoFusedConvolveImpl( @@ -3226,8 +3291,6 @@ port::Status CudnnSupport::DoFusedConvolveImpl( filter_descriptor, GetCudnnDataType(conv_input_descriptor.layout())); CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType()); - CudnnConvolutionDescriptor conv(convolution_descriptor, - ToCudnnDataType(accumulator_type)); auto cudnn = cudnn_->GetHandle(parent_, stream); @@ -3237,9 +3300,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl( SE_ASSIGN_OR_RETURN( dnn::AlgorithmDesc algo_desc, GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, conv_input_nd, filter, conv, + stream, cudnn, algorithm_config, conv_input_nd, filter, + dnn::ToDataType::value, convolution_descriptor, output_nd, scratch_allocator, &scratch)); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); + conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled()); + std::unique_ptr timer; if (is_profiling) { timer.reset(new GpuTimer(parent_)); // NOLINT From 0f58bb63090222cef0eebe74630b2d4d9d886a2f Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 19 May 2020 15:54:10 -0500 Subject: [PATCH 0076/1390] Plumb TF32 for RNN --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 30 ++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 53296f4eea5..fa06d410323 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1163,17 +1163,26 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - cudnnMathType_t math_type; + bool allow_tensor_ops = + data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed(); + bool use_tensor_ops; if (algorithm_config.algorithm().has_value()) { - math_type = algorithm_config.algorithm()->tensor_ops_enabled() - ? CUDNN_TENSOR_OP_MATH - : CUDNN_DEFAULT_MATH; + use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled(); } else { -#if CUDNN_VERSION >= 7201 - math_type = CUDNN_TENSOR_OP_MATH; -#else - math_type = CUDNN_DEFAULT_MATH; -#endif // CUDNN_VERSION >= 7201 + use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops; + } + + if (use_tensor_ops && !allow_tensor_ops) { + return port::Status(port::error::INVALID_ARGUMENT, + "Algo requests disallowed tensor op evaluation."); + } + + cudnnMathType_t math_type; + if (use_tensor_ops) { + math_type = + CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH; + } else { + math_type = CUDNN_VERSION >= 8000 ? 
CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH; } CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 @@ -2626,6 +2635,9 @@ port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, return use_tensor_ops; } +cudnnDataType_t GetRnnComputeType(dnn::DataType data_type); +dnn::DataType GetConvAccumulatorType(dnn::DataType data_type); + port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, From b67608e66c54224fa52200095fba09df0f2b3c71 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Wed, 20 May 2020 10:06:35 -0500 Subject: [PATCH 0077/1390] Plumb TF32 for cublas gemm --- tensorflow/stream_executor/cuda/BUILD | 1 + tensorflow/stream_executor/cuda/cuda_blas.cc | 84 +++++++++----------- tensorflow/stream_executor/cuda/cuda_blas.h | 8 +- 3 files changed, 43 insertions(+), 50 deletions(-) diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 2749281335e..519033a62d8 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -251,6 +251,7 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:host_or_device_scalar", diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 65c07e72154..e2cbb0b75df 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -48,7 +48,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" -#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" @@ -66,6 +66,7 @@ limitations under the License. #include "tensorflow/stream_executor/plugin_registry.h" #include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream_executor.h" +#include "third_party/eigen3/Eigen/Core" namespace stream_executor { namespace gpu { @@ -225,6 +226,18 @@ bool CUDABlas::Init() { return false; } +#if CUDA_VERSION >= 9000 +#if CUBLAS_VER_MAJOR >= 11 + ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH); +#else + ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH); +#endif + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret); + return false; + } +#endif + return true; } @@ -387,7 +400,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { template bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - bool use_tensor_op_math, Args... args) { + Args... 
args) { absl::MutexLock lock(&mu_); CHECK(blas_ != nullptr); @@ -401,10 +414,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, : CUBLAS_POINTER_MODE_DEVICE)) { return false; } -#if CUDA_VERSION >= 9000 +#if CUBLAS_VER_MAJOR >= 11 ScopedCublasMathMode math_mode{blas_}; - if (use_tensor_op_math) { - if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) { + if (!tensorflow::tf32_execution_allowed()) { + if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) { return false; } } @@ -1621,21 +1634,9 @@ bool CUDABlas::DoBlasGemm( } } - bool use_tensor_ops = false; -#if CUDA_VERSION >= 9000 - int cc_major, cc_minor; - stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major, - &cc_minor); - - // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7) { - use_tensor_ops = true; - } -#endif - return DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure= */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta, GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc); @@ -2257,7 +2258,8 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = data_type == CUDA_R_16F; + bool use_tensor_ops = + data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed(); cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2271,7 +2273,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( bool ok; ok = DoBlasInternalImpl( AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */, - true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda, b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc, batch_count, compute_type, algo); @@ -2406,33 +2408,25 @@ bool CUDABlas::DoBlasGemmStridedBatched( int lda, int64 stride_a, const DeviceMemory &b, int ldb, int64 stride_b, float beta, DeviceMemory *c, int ldc, int64 stride_c, int batch_count) { - bool use_tensor_ops = false; -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9010 int cc_major, cc_minor; if (stream->parent()->GetDeviceDescription().cuda_compute_capability( - &cc_major, &cc_minor)) { - // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7) { - use_tensor_ops = true; + &cc_major, &cc_minor) && + cc_major >= 5) { + cublasGemmAlgo_t algo = + (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); + bool ok = DoBlasInternalImpl( + AS_LAMBDA(cublasGemmStridedBatchedEx), stream, + true /* = pointer_mode_host */, true /* = err_on_failure */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb, + stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c, + batch_count, CUDA_R_32F, algo); + if (ok) { + return true; } -#if CUDA_VERSION >= 9010 - if (cc_major >= 5) { - cublasGemmAlgo_t algo = - (use_tensor_ops ? 
CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); - bool ok = DoBlasInternalImpl( - AS_LAMBDA(cublasGemmStridedBatchedEx), stream, - true /* = pointer_mode_host */, true /* = err_on_failure */, - use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), - m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a, - GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c), - CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo); - if (ok) { - return true; - } - LOG(ERROR) << "failed BLAS call, see log for details"; - return false; - } -#endif + LOG(ERROR) << "failed BLAS call, see log for details"; + return false; } #endif // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop. @@ -2445,7 +2439,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c); bool ok = DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure= */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix, SE_CUDA_DATA_HALF, ldc); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 817bdb72777..556456c83db 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport { template bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - bool use_tensor_op_math, Args... args); + Args... args); // Convenience functions that call DoBlasInternalImpl with different values // for err_on_failure. @@ -91,8 +91,7 @@ class CUDABlas : public blas::BlasSupport { bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, Args... args) { return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/true, /*use_tensor_ops=*/false, - args...); + /*err_on_failure=*/true, args...); } template bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream, @@ -100,8 +99,7 @@ class CUDABlas : public blas::BlasSupport { // Tensor ops are hard-coded off in this path, but can still be enabled with // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl(). return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/false, - /*use_tensor_ops=*/false, args...); + /*err_on_failure=*/false, args...); } // A helper function to implement DoBlasGemmBatched interfaces for generic From b1e74b227c681588a62768042816702a9518f642 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Thu, 4 Jun 2020 09:22:37 +0200 Subject: [PATCH 0078/1390] Fix compile errors. --- tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc | 2 +- tensorflow/lite/micro/micro_allocator.cc | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc index 8f21a167f67..39991ab758b 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc @@ -191,7 +191,7 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() { // Work through the rest of the buffers to find a good gap to place each one. 
for (int i = 1; i < buffer_count_; ++i) { // The id is the order the buffer was originally added by the client. - const int buffer_id = buffer_ids_sorted_[i]; + buffer_id = buffer_ids_sorted_[i]; // Look at what size and time range the buffer needs to be active. BufferRequirements* wanted_requirements = &requirements_[buffer_id]; const int wanted_size = wanted_requirements->size; diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index c204f4460b4..8fac421750d 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -99,6 +99,9 @@ TfLiteStatus AllocateVariables( // not called by default. Hence it's not linked in to the final binary code. TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, ErrorReporter* error_reporter) { + // Suppress compile warning for unused function + (void)CheckOfflinePlannedOffsets; + if (model->metadata()) { for (size_t i = 0; i < model->metadata()->size(); ++i) { auto metadata = model->metadata()->Get(i); From 521b7595b7adba5627a2687befc3fb41bc5c2bec Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Thu, 4 Jun 2020 17:19:31 +0800 Subject: [PATCH 0079/1390] TFLM: add HIMAX WE1 EVB to support TFLM example(hello word and person detection INT8) --- .../lite/micro/examples/hello_world/README.md | 132 +++- .../micro/examples/hello_world/README.md~ | 595 ++++++++++++++++++ .../hello_world/himax_we1_evb/constants.cc | 19 + .../himax_we1_evb/output_handler.cc | 35 ++ .../himax_we1_evb/output_handler.cc~ | 53 ++ .../person_detection_experimental/README.md | 129 +++- .../himax_we1_evb/detection_responder.cc | 34 + .../himax_we1_evb/image_provider.cc | 44 ++ .../himax_we1_evb/image_provider.cc~ | 44 ++ .../himax_we1_evb/main_functions.cc | 127 ++++ .../{we_i => himax_we1_evb}/debug_log.cc | 9 +- .../make/targets/himax_we1_evb_makefile.inc | 91 +++ .../make/targets/himax_we1_evb_makefile.inc~ | 93 +++ .../tools/make/third_party_downloads.inc | 4 + .../tools/make/third_party_downloads.inc~ | 86 +++ 15 files changed, 1476 insertions(+), 19 deletions(-) create mode 100644 tensorflow/lite/micro/examples/hello_world/README.md~ create mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc create mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc create mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc rename tensorflow/lite/micro/{we_i => himax_we1_evb}/debug_log.cc (90%) create mode 100644 tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc create mode 100644 tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ create mode 100644 tensorflow/lite/micro/tools/make/third_party_downloads.inc~ diff --git a/tensorflow/lite/micro/examples/hello_world/README.md b/tensorflow/lite/micro/examples/hello_world/README.md index 3b633890306..9c0a5e2306a 100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md +++ b/tensorflow/lite/micro/examples/hello_world/README.md @@ -14,13 +14,34 @@ of the 
device. ## Table of contents -- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) -- [Deploy to Arduino](#deploy-to-arduino) -- [Deploy to ESP32](#deploy-to-esp32) -- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) -- [Deploy to STM32F746](#deploy-to-STM32F746) -- [Run the tests on a development machine](#run-the-tests-on-a-development-machine) -- [Train your own model](#train-your-own-model) +- [Hello World Example](#hello-world-example) + - [Table of contents](#table-of-contents) + - [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) + - [Initial Setup](#initial-setup) + - [Generate Example Project](#generate-example-project) + - [Build and Run Example](#build-and-run-example) + - [Deploy to Arduino](#deploy-to-arduino) + - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) + - [Load and run the example](#load-and-run-the-example) + - [Deploy to ESP32](#deploy-to-esp32) + - [Install the ESP IDF](#install-the-esp-idf) + - [Generate the examples](#generate-the-examples) + - [Building the example](#building-the-example) + - [Load and run the example](#load-and-run-the-example-1) + - [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) + - [Initial Setup](#initial-setup-1) + - [MetaWare Development Toolkit](#metaware-development-toolkit) + - [Make Tool version](#make-tool-version) + - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) + - [Generate Example Project](#generate-example-project-1) + - [Build and Burn Example](#build-and-burn-example) + - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) + - [Compile the binary](#compile-the-binary) + - [Sign the binary](#sign-the-binary) + - [Flash the binary](#flash-the-binary) + - [Deploy to STM32F746](#deploy-to-stm32f746) + - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) + - [Train your own model](#train-your-own-model) ## Deploy to ARC EM SDP @@ -191,6 +212,103 @@ The previous two commands can be combined: idf.py --port /dev/ttyUSB0 flash monitor ``` +## Deploy to himax WE1 EVB + +The following instructions will help you build and deploy this example to +[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) +board. To undstand more about using this board, please check +[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). + +### Initial Setup + +To use the HIMAX WE1 EVB, please make sure following software are installed: + +#### MetaWare Development Toolkit + +See +[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) +section for instructions on toolchain installation. + +#### Make Tool version + +A `'make'` tool is required for deploying Tensorflow Lite Micro +applications on HIMAX WE1 EVB, See +[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) +section for proper environment. + +#### Serial Terminal Emulation Application + +There are 2 main purposes for HIMAX WE1 EVB Debug UART port + +- print application output +- burn application to flash by using xmodem send application binary + +You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). 
+ + +### Generate Example Project + +The example project for HIMAX WE1 EVB platform can be generated with the following +command: + +Download related third party data + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads +``` + +Generate hello world project + +``` +make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb TAGS=no_arc_mli +``` + +### Build and Burn Example + +Following the Steps to run hello world example at HIMAX WE1 EVB platform. + +1. Go to the generated example project directory. + + ``` + cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/hello_world/make + ``` + +2. Build the example using + + ``` + make app + ``` + +3. After example build finish, copy ELF file and map file to image generate tool directory. + image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` + + ``` + cp hello_world.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +4. Go to flash image generate tool directory. + + ``` + cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +5. run image generate tool, generate flash image file. + + * Before running image generate tool, by typing `sudo chmod +x image_gen` + and `sudo chmod +x sign_tool` to make sure it is executable. + + ``` + image_gen -e hello_world.elf -m himax_we1_evb.map -o out.img + ``` + + +6. Download flash image file to HIMAX WE1 EVB by UART: + + * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) + +After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial +terminal. + ## Deploy to SparkFun Edge The following instructions will help you build and deploy this sample on the diff --git a/tensorflow/lite/micro/examples/hello_world/README.md~ b/tensorflow/lite/micro/examples/hello_world/README.md~ new file mode 100644 index 00000000000..011711493d5 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/README.md~ @@ -0,0 +1,595 @@ +# Hello World Example + +This example is designed to demonstrate the absolute basics of using [TensorFlow +Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers). +It includes the full end-to-end workflow of training a model, converting it for +use with TensorFlow Lite for Microcontrollers for running inference on a +microcontroller. + +The model is trained to replicate a `sine` function and generates a pattern of +data to either blink LEDs or control an animation, depending on the capabilities +of the device. 
+ +![Animation on STM32F746](images/animation_on_STM32F746.gif) + +## Table of contents + +- [Hello World Example](#hello-world-example) + - [Table of contents](#table-of-contents) + - [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) + - [Initial Setup](#initial-setup) + - [Generate Example Project](#generate-example-project) + - [Build and Run Example](#build-and-run-example) + - [Deploy to Arduino](#deploy-to-arduino) + - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) + - [Load and run the example](#load-and-run-the-example) + - [Deploy to ESP32](#deploy-to-esp32) + - [Install the ESP IDF](#install-the-esp-idf) + - [Generate the examples](#generate-the-examples) + - [Building the example](#building-the-example) + - [Load and run the example](#load-and-run-the-example-1) + - [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) + - [Initial Setup](#initial-setup-1) + - [MetaWare Development Toolkit](#metaware-development-toolkit) + - [Make Tool version](#make-tool-version) + - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) + - [Generate Example Project](#generate-example-project-1) + - [Build and Burn Example](#build-and-burn-example) + - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) + - [Compile the binary](#compile-the-binary) + - [Sign the binary](#sign-the-binary) + - [Flash the binary](#flash-the-binary) + - [Deploy to STM32F746](#deploy-to-stm32f746) + - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) + - [Train your own model](#train-your-own-model) + +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) +board. General information and instructions on using the board with TensorFlow +Lite Micro can be found in the common +[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +### Initial Setup + +Follow the instructions on the +[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) +to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following +command: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project +``` + +### Build and Run Example + +For more detailed information on building and running examples see the +appropriate sections of general descriptions of the +[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). +In the directory with generated project you can also find a +*README_ARC_EMSDP.md* file with instructions and options on building and +running. Here we only briefly mention main steps which are typically enough to +get it started. + +1. You need to + [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) + and open an serial connection. + +2. Go to the generated example project director + + ``` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make + ``` + +3. Build the example using + + ``` + make app + ``` + +4. To generate artefacts for self-boot of example from the board use + + ``` + make flash + ``` + +5. 
To run application from the board using microSD card: + + * Copy the content of the created /bin folder into the root of microSD + card. Note that the card must be formatted as FAT32 with default cluster + size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG + button. + +6. If you have the MetaWare Debugger installed in your environment: + + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial +terminal. + +## Deploy to Arduino + +The following instructions will help you build and deploy this sample +to [Arduino](https://www.arduino.cc/) devices. + +![Animation on Arduino MKRZERO](images/animation_on_arduino_mkrzero.gif) + +The sample has been tested with the following devices: + +- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) +- [Arduino MKRZERO](https://store.arduino.cc/usa/arduino-mkrzero) + +The sample will use PWM to fade an LED on and off according to the model's +output. In the code, the `LED_BUILTIN` constant is used to specify the board's +built-in LED as the one being controlled. However, on some boards, this built-in +LED is not attached to a pin with PWM capabilities. In this case, the LED will +blink instead of fading. + +### Install the Arduino_TensorFlowLite library + +This example application is included as part of the official TensorFlow Lite +Arduino library. To install it, open the Arduino library manager in +`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. + +### Load and run the example + +Once the library has been added, go to `File -> Examples`. You should see an +example near the bottom of the list named `TensorFlowLite:hello_world`. Select +it and click `hello_world` to load the example. + +Use the Arduino IDE to build and upload the example. Once it is running, +you should see the built-in LED on your device flashing. + +The Arduino Desktop IDE includes a plotter that we can use to display the sine +wave graphically. To view it, go to `Tools -> Serial Plotter`. You will see one +datapoint being logged for each inference cycle, expressed as a number between 0 +and 255. + +## Deploy to ESP32 + +The following instructions will help you build and deploy this sample +to [ESP32](https://www.espressif.com/en/products/hardware/esp32/overview) +devices using the [ESP IDF](https://github.com/espressif/esp-idf). + +The sample has been tested on ESP-IDF version 4.0 with the following devices: +- [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html) +- [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md) + +### Install the ESP IDF + +Follow the instructions of the +[ESP-IDF get started guide](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html) +to setup the toolchain and the ESP-IDF itself. + +The next steps assume that the +[IDF environment variables are set](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html#step-4-set-up-the-environment-variables) : + + * The `IDF_PATH` environment variable is set + * `idf.py` and Xtensa-esp32 tools (e.g. 
`xtensa-esp32-elf-gcc`) are in `$PATH` + +### Generate the examples +The example project can be generated with the following command: +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=esp generate_hello_world_esp_project +``` + +### Building the example + +Go the the example project directory +``` +cd tensorflow/lite/micro/tools/make/gen/esp_xtensa-esp32/prj/hello_world/esp-idf +``` + +Then build with `idf.py` +``` +idf.py build +``` + +### Load and run the example + +To flash (replace `/dev/ttyUSB0` with the device serial port): +``` +idf.py --port /dev/ttyUSB0 flash +``` + +Monitor the serial output: +``` +idf.py --port /dev/ttyUSB0 monitor +``` + +Use `Ctrl+]` to exit. + +The previous two commands can be combined: +``` +idf.py --port /dev/ttyUSB0 flash monitor +``` + +## Deploy to himax WE1 EVB + +The following instructions will help you build and deploy this example to +[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) +board. To undstand more about using this board, please check +[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). + +### Initial Setup + +To use the HIMAX WE1 EVB, please make sure following software are installed: + +#### MetaWare Development Toolkit + +See +[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) +section for instructions on toolchain installation. + +#### Make Tool version + +A `'make'` tool is required for deploying Tensorflow Lite Micro +applications on HIMAX WE1 EVB, See +[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) +section for proper environment. + +#### Serial Terminal Emulation Application + +There are 2 main purposes for HIMAX WE1 EVB Debug UART port + +- print application output +- burn application to flash by using xmodem send application binary + +You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). + + +### Generate Example Project + +The example project for HIMAX WE1 EVB platform can be generated with the following +command: + +Download related third party data + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads +``` + +Generate hello world project + +``` +make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb +``` + +### Build and Burn Example + +Following the Steps to run hello world example at HIMAX WE1 EVB platform. + +1. Go to the generated example project directory. + + ``` + cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/hello_world/make + ``` + +2. Build the example using + + ``` + make app + ``` + +3. After example build finish, copy ELF file and map file to image generate tool directory. + image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` + + ``` + cp hello_world.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +4. Go to flash image generate tool directory. + + ``` + cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +5. run image generate tool, generate flash image file. + + * Before running image generate tool, by typing `sudo chmod +x image_gen` + and `sudo chmod +x sign_tool` to make sure it is executable. 
+ + ``` + image_gen -e hello_world.elf -m himax_we1_evb.map -o out.img + ``` + + +6. Download flash image file to HIMAX WE1 EVB by UART: + + * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) + +After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial +terminal. + +## Deploy to SparkFun Edge + +The following instructions will help you build and deploy this sample on the +[SparkFun Edge development board](https://sparkfun.com/products/15170). + +![Animation on SparkFun Edge](images/animation_on_sparkfun_edge.gif) + +If you're new to using this board, we recommend walking through the +[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) +codelab to get an understanding of the workflow. + +### Compile the binary + +The following command will download the required dependencies and then compile a +binary for the SparkFun Edge: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge hello_world_bin +``` + +The binary will be created in the following location: + +``` +tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin +``` + +### Sign the binary + +The binary must be signed with cryptographic keys to be deployed to the device. +We'll now run some commands that will sign our binary so it can be flashed to +the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is +downloaded when the `Makefile` is run. + +Enter the following command to set up some dummy cryptographic keys we can use +for development: + +``` +cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info0.py \ +tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info.py +``` + +Next, run the following command to create a signed binary: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_image_blob.py \ +--bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin \ +--load-address 0xC000 \ +--magic-num 0xCB \ +-o main_nonsecure_ota \ +--version 0x0 +``` + +This will create the file `main_nonsecure_ota.bin`. We'll now run another +command to create a final version of the file that can be used to flash our +device with the bootloader script we will use in the next step: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \ +--load-address 0x20000 \ +--bin main_nonsecure_ota.bin \ +-i 6 \ +-o main_nonsecure_wire \ +--options 0x1 +``` + +You should now have a file called `main_nonsecure_wire.bin` in the directory +where you ran the commands. This is the file we'll be flashing to the device. + +### Flash the binary + +Next, attach the board to your computer via a USB-to-serial adapter. + +**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096), +you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them) +before you continue. 
+ +Once connected, assign the USB device name to an environment variable: + +``` +export DEVICENAME=put your device name here +``` + +Set another variable with the baud rate: + +``` +export BAUD_RATE=921600 +``` + +Now, hold the button marked `14` on the device. While still holding the button, +hit the button marked `RST`. Continue holding the button marked `14` while +running the following command: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/uart_wired_update.py \ +-b ${BAUD_RATE} ${DEVICENAME} \ +-r 1 \ +-f main_nonsecure_wire.bin \ +-i 6 +``` + +You should see a long stream of output as the binary is flashed to the device. +Once you see the following lines, flashing is complete: + +``` +Sending Reset Command. +Done. +``` + +If you don't see these lines, flashing may have failed. Try running through the +steps in [Flash the binary](#flash-the-binary) again (you can skip over setting +the environment variables). If you continue to run into problems, follow the +[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) +codelab, which includes more comprehensive instructions for the flashing +process. + +The binary should now be deployed to the device. Hit the button marked `RST` to +reboot the board. You should see the device's four LEDs flashing in sequence. + +Debug information is logged by the board while the program is running. To view +it, establish a serial connection to the board using a baud rate of `115200`. +On OSX and Linux, the following command should work: + +``` +screen ${DEVICENAME} 115200 +``` + +You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, +immediately followed by `Esc`. You can then use the arrow keys to explore the +output, which will contain the results of running inference on various `x` +values: + +``` +x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 +``` + +To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately +followed by the `K` key, then hit the `Y` key. + + +## Deploy to STM32F746 + +The following instructions will help you build and deploy the sample to the +[STM32F7 discovery kit](https://os.mbed.com/platforms/ST-Discovery-F746NG/) +using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). + +![Animation on STM32F746](images/animation_on_STM32F746.gif) + +Before we begin, you'll need the following: + +- STM32F7 discovery kit board +- Mini-USB cable +- ARM Mbed CLI ([installation instructions](https://os.mbed.com/docs/mbed-os/v5.12/tools/installation-and-setup.html)) +- Python 2.7 and pip + +Since Mbed requires a special folder structure for projects, we'll first run a +command to generate a subfolder containing the required source files in this +structure: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_hello_world_mbed_project +``` + +This will result in the creation of a new folder: + +``` +tensorflow/lite/micro/tools/make/gen/mbed_cortex-m4/prj/hello_world/mbed +``` + +This folder contains all of the example's dependencies structured in the correct +way for Mbed to be able to build it. + +Change into the directory and run the following commands, making sure you are +using Python 2.7.15. + +First, tell Mbed that the current directory is the root of an Mbed project: + +``` +mbed config root . 
+``` + +Next, tell Mbed to download the dependencies and prepare to build: + +``` +mbed deploy +``` + +By default, Mbed will build the project using C++98. However, TensorFlow Lite +requires C++11. Run the following Python snippet to modify the Mbed +configuration files so that it uses C++11: + +``` +python -c 'import fileinput, glob; +for filename in glob.glob("mbed-os/tools/profiles/*.json"): + for line in fileinput.input(filename, inplace=True): + print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")' + +``` + +Finally, run the following command to compile: + +``` +mbed compile -m DISCO_F746NG -t GCC_ARM +``` + +This should result in a binary at the following path: + +``` +./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin +``` + +To deploy, plug in your STM board and copy the file to it. On MacOS, you can do +this with the following command: + +``` +cp ./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin /Volumes/DIS_F746NG/ +``` + +Copying the file will initiate the flashing process. Once this is complete, you +should see an animation on the device's screen. + + +``` +screen /dev/tty.usbmodem14403 9600 +``` + +In addition to this animation, debug information is logged by the board while +the program is running. To view it, establish a serial connection to the board +using a baud rate of `9600`. On OSX and Linux, the following command should +work, replacing `/dev/tty.devicename` with the name of your device as it appears +in `/dev`: + +``` +screen /dev/tty.devicename 9600 +``` + +You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, +immediately followed by `Esc`. You can then use the arrow keys to explore the +output, which will contain the results of running inference on various `x` +values: + +``` +x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 +``` + +To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately +followed by the `K` key, then hit the `Y` key. + +### Run the tests on a development machine + +To compile and test this example on a desktop Linux or macOS machine, first +clone the TensorFlow repository from GitHub to a convenient place: + +```bash +git clone --depth 1 https://github.com/tensorflow/tensorflow.git +``` + +Next, `cd` into the source directory from a terminal, and then run the following +command: + +```bash +make -f tensorflow/lite/micro/tools/make/Makefile test_hello_world_test +``` + +This will take a few minutes, and downloads frameworks the code uses. Once the +process has finished, you should see a series of files get compiled, followed by +some logging output from a test, which should conclude with +`~~~ALL TESTS PASSED~~~`. + +If you see this, it means that a small program has been built and run that loads +the trained TensorFlow model, runs some example inputs through it, and got the +expected outputs. + +To understand how TensorFlow Lite does this, you can look at the source in +[hello_world_test.cc](hello_world_test.cc). +It's a fairly small amount of code that creates an interpreter, gets a handle to +a model that's been compiled into the program, and then invokes the interpreter +with the model and sample inputs. + +### Train your own model + +So far you have used an existing trained model to run inference on +microcontrollers. If you wish to train your own model, follow the instructions +given in the [train/](train/) directory. 
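+
+For reference, the interpreter flow described in
+[Run the tests on a development machine](#run-the-tests-on-a-development-machine)
+boils down to roughly the following sketch. This is a simplified illustration,
+not the exact contents of [hello_world_test.cc](hello_world_test.cc); the
+header paths, the `g_model` symbol, and the arena size are assumptions based on
+the example sources:
+
+```
+#include <cstdint>
+
+#include "tensorflow/lite/micro/examples/hello_world/model.h"
+#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+#include "tensorflow/lite/micro/micro_interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+int main() {
+  // Map the flatbuffer model that was compiled into the binary.
+  const tflite::Model* model = tflite::GetModel(g_model);
+
+  // Kernel implementations and error reporting.
+  tflite::ops::micro::AllOpsResolver resolver;
+  tflite::MicroErrorReporter error_reporter;
+
+  // Working memory for the model's input, output and intermediate tensors.
+  constexpr int kArenaSize = 2 * 1024;
+  static uint8_t tensor_arena[kArenaSize];
+
+  tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
+                                       kArenaSize, &error_reporter);
+  interpreter.AllocateTensors();
+
+  // Feed a sample x value and run inference; the output approximates sin(x).
+  interpreter.input(0)->data.f[0] = 0.5f;
+  interpreter.Invoke();
+  float y_value = interpreter.output(0)->data.f[0];
+  (void)y_value;
+  return 0;
+}
+```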
+ diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc new file mode 100644 index 00000000000..1816a2f3207 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc @@ -0,0 +1,19 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/examples/hello_world/constants.h" + +// This is tuned so that a full cycle takes ~4 seconds on a SparkFun Edge. +const int kInferencesPerCycle = 1000; diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc new file mode 100644 index 00000000000..8ca028acc55 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/examples/hello_world/output_handler.h" + + +/* +This function trigger different device's LEDthrough y value. +y value range -1 <= y <= 1. +| Range is from -1~1 | LEDs | +| 0 <= y <= 1 | [ 0 1 ] | +| -1 <= y < 0 | [ 1 0 ] | + +*/ +void HandleOutput(tflite::ErrorReporter* error_reporter, float x_value, + float y_value) { + // The first time this method runs, set up our LEDs correctly + + // Log the current X and Y values + TF_LITE_REPORT_ERROR(error_reporter, "x_value: %f, y_value: %f\n", + static_cast(x_value), + static_cast(y_value)); +} diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ new file mode 100644 index 00000000000..b59242d0b6f --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/examples/hello_world/output_handler.h" + + +/* +This function trigger different device's LEDthrough y value. +y value range -1 <= y <= 1. +| Range is from -1~1 | LEDs | +| 0 <= y <= 1 | [ 0 1 ] | +| -1 <= y < 0 | [ 1 0 ] | + +*/ +void HandleOutput(tflite::ErrorReporter* error_reporter, float x_value, + float y_value) { + // The first time this method runs, set up our LEDs correctly +/* static bool is_initialized = false; + if (!is_initialized) { + // TODO Setup LED's as outputs + + // end of setup + is_initialized = true; + } + + // Set the LEDs to represent negative values + if (y_value < 0) { + //enable LED1 + + //enable LED0 + } else if (y_value > 0) { + //enable LED0 + + //enable LED1 + } + */ + // Log the current X and Y values + TF_LITE_REPORT_ERROR(error_reporter, "x_value: %f, y_value: %f\n", + static_cast(x_value), + static_cast(y_value)); +} diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index bf99b40d776..4d53e551431 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -7,13 +7,31 @@ This uses the experimental int8 quantized version of the person detection model. 
 ## Table of contents
 
-- [Getting started](#getting-started)
-- [Running on ARC EM SDP](#running-on-arc-em-sdp)
-- [Running on Arduino](#running-on-arduino)
-- [Running on SparkFun Edge](#running-on-sparkfun-edge)
-- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
-- [Debugging image capture](#debugging-image-capture)
-- [Training your own model](#training-your-own-model)
+- [Person detection example](#person-detection-example)
+  - [Table of contents](#table-of-contents)
+  - [Running on ARC EM SDP](#running-on-arc-em-sdp)
+    - [Initial setup](#initial-setup)
+    - [Generate Example Project](#generate-example-project)
+    - [Build and Run Example](#build-and-run-example)
+  - [Running on Arduino](#running-on-arduino)
+    - [Hardware](#hardware)
+    - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library)
+    - [Install other libraries](#install-other-libraries)
+    - [Load and run the example](#load-and-run-the-example)
+  - [Running on HIMAX WE1 EVB](#running-on-himax-we1-evb)
+    - [Initial Setup](#initial-setup)
+      - [MetaWare Development Toolkit](#metaware-development-toolkit)
+      - [Make Tool version](#make-tool-version)
+      - [Serial Terminal Emulation Application](#serial-terminal-emulation-application)
+    - [Generate Example Project](#generate-example-project-1)
+    - [Build and Burn Example](#build-and-burn-example)
+  - [Running on SparkFun Edge](#running-on-sparkfun-edge)
+    - [Compile the binary](#compile-the-binary)
+    - [Sign the binary](#sign-the-binary)
+    - [Flash the binary](#flash-the-binary)
+  - [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
+  - [Debugging image capture](#debugging-image-capture)
+  - [Training your own model](#training-your-own-model)
 
 ## Running on ARC EM SDP
@@ -260,6 +278,103 @@ From the log, we can see that it took around 170 ms to capture and read the
 image data from the camera module, 180 ms to decode the JPEG and convert it to
 greyscale, and 18.6 seconds to run inference.
 
+## Running on HIMAX WE1 EVB
+
+The following instructions will help you build and deploy this example to the
+[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief)
+board. To understand more about using this board, please check the
+[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide).
+
+### Initial Setup
+
+To use the HIMAX WE1 EVB, please make sure the following software is installed:
+
+#### MetaWare Development Toolkit
+
+See the
+[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit)
+section for instructions on toolchain installation.
+
+#### Make Tool version
+
+A `'make'` tool is required for deploying TensorFlow Lite Micro
+applications on the HIMAX WE1 EVB. See the
+[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool)
+section to set up a proper environment.
+
+#### Serial Terminal Emulation Application
+
+The HIMAX WE1 EVB Debug UART port serves two main purposes:
+
+- printing application output
+- burning the application binary to flash using XMODEM
+
+You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)).
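+
+For example, with minicom on Linux the debug UART can be opened like this (the
+device path and baud rate below are placeholders; use the values that match
+your setup):
+
+```
+minicom -D /dev/ttyUSB0 -b 115200
+```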
+
+
+### Generate Example Project
+
+The example project for the HIMAX WE1 EVB platform can be generated with the
+following commands.
+
+Download the related third party data:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads
+```
+
+Generate the person detection project:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_int8_make_project TARGET=himax_we1_evb
+```
+
+### Build and Burn Example
+
+Follow these steps to run the person detection example on the HIMAX WE1 EVB platform.
+
+1. Go to the generated example project directory.
+
+   ```
+   cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/person_detection_int8/make
+   ```
+
+2. Build the example using
+
+   ```
+   make app
+   ```
+
+3. After the build finishes, copy the ELF file and the map file to the image generation tool
+   directory, located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'`
+
+   ```
+   cp person_detection_int8.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/
+   ```
+
+4. Go to the flash image generation tool directory.
+
+   ```
+   cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/
+   ```
+
+5. Run the image generation tool to generate the flash image file.
+
+   * Before running the image generation tool, run `sudo chmod +x image_gen`
+     and `sudo chmod +x sign_tool` to make sure both are executable.
+
+   ```
+   image_gen -e person_detection_int8.elf -m himax_we1_evb.map -o out.img
+   ```
+
+
+6. Download the flash image file to the HIMAX WE1 EVB over UART:
+
+   * More detail about downloading the image over UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update)
+
+After these steps, press the reset button on the HIMAX WE1 EVB and you will see the application output in the serial
+terminal.
+
 ## Running on SparkFun Edge
 
 The following instructions will help you build and deploy this sample on the
diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc
new file mode 100644
index 00000000000..a353dc8a9b8
--- /dev/null
+++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.h"
+
+#include "hx_drv_tflm.h"
+
+// This dummy implementation writes person and no person scores to the error
+// console. Real applications will want to take some custom action instead, and
+// should implement their own versions of this function.
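+// On the HIMAX WE1 EVB, this version also drives the green LED through the
+// hx_drv_tflm driver as a simple person / no-person indicator.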
+void RespondToDetection(tflite::ErrorReporter* error_reporter, + int8_t person_score, int8_t no_person_score) { + + if (person_score > no_person_score) { + hx_drv_led_on(HX_DRV_LED_GREEN); + } else { + hx_drv_led_off(HX_DRV_LED_GREEN); + } + + TF_LITE_REPORT_ERROR(error_reporter, "person score:%d no person score %d", + person_score, no_person_score); +} diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc new file mode 100644 index 00000000000..727d93c61d1 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/examples/person_detection_experimental/image_provider.h" + +#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h" + +#include "hx_drv_tflm.h" + +hx_drv_sensor_image_config_t g_pimg_config; + + +TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int image_width, + int image_height, int channels, int8_t* image_data) { + static bool is_initialized = false; + + if (!is_initialized) { + if(hx_drv_sensor_initial(&g_pimg_config)!= HX_DRV_LIB_PASS) + { + return kTfLiteError; + } + is_initialized = true; + } + + hx_drv_sensor_capture(&g_pimg_config); + + hx_drv_image_rescale((uint8_t*)g_pimg_config.raw_address, g_pimg_config.img_width, g_pimg_config.img_height, + image_data, image_width, image_height); + + + return kTfLiteOk; +} diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ new file mode 100644 index 00000000000..d5b4d136642 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ @@ -0,0 +1,44 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/examples/person_detection_experimental/image_provider.h" + +#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h" + +#include "hx_drv_tflm.h" + +hx_drv_sensor_image_config_t g_pimg_config; + + +TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int image_width, + int image_height, int channels, int8_t* image_data) { + static bool is_initialized = false; + + if (!is_initialized) { + if(hx_drv_sensor_initial(&g_pimg_config)!= HX_DRV_LIB_PASS) + { + return kTfLiteError; + } + is_initialized = true; + } + + hx_drv_sensor_capture(&g_pimg_config); + + hx_drv_image_rescale((uint8_t*)g_pimg_config.raw_address, g_pimg_config.img_width, g_pimg_config.img_height, + image_data, image_data, image_height); + + + return kTfLiteOk; +} diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc new file mode 100644 index 00000000000..552b52c9c51 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc @@ -0,0 +1,127 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/examples/person_detection_experimental/main_functions.h" + +#include "tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.h" +#include "tensorflow/lite/micro/examples/person_detection_experimental/image_provider.h" +#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h" +#include "tensorflow/lite/micro/examples/person_detection_experimental/person_detect_model_data.h" +#include "tensorflow/lite/micro/kernels/micro_ops.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" +#include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/version.h" + +// Globals, used for compatibility with Arduino-style sketches. +namespace { +tflite::ErrorReporter* error_reporter = nullptr; +const tflite::Model* model = nullptr; +tflite::MicroInterpreter* interpreter = nullptr; +TfLiteTensor* input = nullptr; + +// In order to use optimized tensorflow lite kernels, a signed int8 quantized +// model is preferred over the legacy unsigned model format. This means that +// throughout this project, input images must be converted from unisgned to +// signed format. The easiest and quickest way to convert from unsigned to +// signed 8-bit integers is to subtract 128 from the unsigned value to get a +// signed value. + +// An area of memory to use for input, output, and intermediate arrays. 
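+// The #pragma Bss(".tensor_arena") wrapper below is a MetaWare compiler
+// directive that places the uninitialized arena in its own output section,
+// so the linker configuration for this target can map it to suitable
+// on-chip memory.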
+constexpr int kTensorArenaSize = 125 * 1024; +#pragma Bss(".tensor_arena") +static uint8_t tensor_arena[kTensorArenaSize]; +#pragma Bss() +} // namespace + +// The name of this function is important for Arduino compatibility. +void setup() { + // Set up logging. Google style is to avoid globals or statics because of + // lifetime uncertainty, but since this has a trivial destructor it's okay. + // NOLINTNEXTLINE(runtime-global-variables) + static tflite::MicroErrorReporter micro_error_reporter; + error_reporter = µ_error_reporter; + + // Map the model into a usable data structure. This doesn't involve any + // copying or parsing, it's a very lightweight operation. + model = tflite::GetModel(g_person_detect_model_data); + if (model->version() != TFLITE_SCHEMA_VERSION) { + TF_LITE_REPORT_ERROR(error_reporter, + "Model provided is schema version %d not equal " + "to supported version %d.", + model->version(), TFLITE_SCHEMA_VERSION); + return; + } + + // Pull in only the operation implementations we need. + // This relies on a complete list of all the ops needed by this graph. + // An easier approach is to just use the AllOpsResolver, but this will + // incur some penalty in code space for op implementations that are not + // needed by this graph. + // + // tflite::ops::micro::AllOpsResolver resolver; + // NOLINTNEXTLINE(runtime-global-variables) + static tflite::MicroOpResolver<12> micro_op_resolver; + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), + 1, 3); + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, + tflite::ops::micro::Register_CONV_2D(), 1, 3); + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, + tflite::ops::micro::Register_AVERAGE_POOL_2D(), + 1, 2); + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, + tflite::ops::micro::Register_RESHAPE()); + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, + tflite::ops::micro::Register_SOFTMAX(), 1, 3); + + // Build an interpreter to run the model with. + // NOLINTNEXTLINE(runtime-global-variables) + static tflite::MicroInterpreter static_interpreter( + model, micro_op_resolver, tensor_arena, kTensorArenaSize, error_reporter); + interpreter = &static_interpreter; + + // Allocate memory from the tensor_arena for the model's tensors. + TfLiteStatus allocate_status = interpreter->AllocateTensors(); + if (allocate_status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter, "AllocateTensors() failed"); + return; + } + + // Get information about the memory area to use for the model's input. + input = interpreter->input(0); +} + +// The name of this function is important for Arduino compatibility. +void loop() { + // Get image from provider. + if (kTfLiteOk != GetImage(error_reporter, kNumCols, kNumRows, kNumChannels, + input->data.int8)) { + TF_LITE_REPORT_ERROR(error_reporter, "Image capture failed."); + } + + // Run the model on this input and make sure it succeeds. + if (kTfLiteOk != interpreter->Invoke()) { + TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed."); + } + + TfLiteTensor* output = interpreter->output(0); + + // Process the inference results. 
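+  // The output tensor holds two int8 scores; a larger value means the model
+  // is more confident in that class.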
+ int8_t person_score = output->data.uint8[kPersonIndex]; + int8_t no_person_score = output->data.uint8[kNotAPersonIndex]; + RespondToDetection(error_reporter, person_score, no_person_score); +} diff --git a/tensorflow/lite/micro/we_i/debug_log.cc b/tensorflow/lite/micro/himax_we1_evb/debug_log.cc similarity index 90% rename from tensorflow/lite/micro/we_i/debug_log.cc rename to tensorflow/lite/micro/himax_we1_evb/debug_log.cc index a115d476aff..32af2625630 100644 --- a/tensorflow/lite/micro/we_i/debug_log.cc +++ b/tensorflow/lite/micro/himax_we1_evb/debug_log.cc @@ -18,16 +18,15 @@ limitations under the License. // the Ambiq Apollo 3. #include "tensorflow/lite/micro/debug_log.h" -#include "xprintf.h" -#include "console_io.h" -#include +#include "hx_drv_tflm.h" + extern "C" void DebugLog(const char* s) { static bool is_initialized = false; if (!is_initialized) { - xprintf_setup(); + hx_drv_uart_initial(); is_initialized = true; } - xprintf("%s", s); + hx_drv_uart_print("%s", s); } diff --git a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc new file mode 100644 index 00000000000..60fc2e7cca1 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc @@ -0,0 +1,91 @@ +# Settings for himax WE_1 evb. +ifeq ($(TARGET), himax_we1_evb) + + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL := ccac + TARGET_ARCH := arc + #ARC_TOOLCHAIN := mwdt + + BUILD_ARC_MLI := false + ARC_MLI_PRE_COMPILED_TARGET := himax_arcem9d_r16 + + include $(MAKEFILE_DIR)/targets/arc/arc_common.inc + + #download SDK & MLI + HIMAX_WE1_SDK_NAME := himax_we1_sdk + $(eval $(call add_third_party_download,$(HIMAX_WE1_SDK_URL),$(HIMAX_WE1_SDK_MD5),$(HIMAX_WE1_SDK_NAME),)) + + #export path of toolchain + #export PATH := $(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/image_gen_linux_v3/:$(PATH) + + TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/arcem9d_wei_r16.tcf + LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/memory.lcf + ARCLIB_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/libembarc.a + LIB_HEADER_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/hx_drv_tflm.h + + + DEFAULT_HEAPSZ := 8192 + DEFAULT_STACKSZ := 8192 + + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE_NAME)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) + + LCF_FILE_NAME = $(notdir $(LCF_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(LCF_FILE_NAME) + + ARCLIB_FILE_NAME = $(notdir $(ARCLIB_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(ARCLIB_FILE)) + MAKE_PROJECT_FILES += $(ARCLIB_FILE_NAME) + + LIB_HEADER_FILE_NAME = $(notdir $(LIB_HEADER_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(LIB_HEADER_FILE)) + MAKE_PROJECT_FILES += $(LIB_HEADER_FILE_NAME) + + + + # Need a pointer to the TCF and lcf file + + PLATFORM_FLAGS = \ + -DNDEBUG \ + -g \ + -DCPU_ARC \ + -Hnosdata \ + -DTF_LITE_STATIC_MEMORY \ + -tcf=$(TCF_FILE_NAME) \ + -Hnocopyr \ + -Hpurge \ + -Hcl \ + -fslp-vectorize-aggressive \ + -ffunction-sections \ + -fdata-sections \ + -tcf_core_config \ + + CXXFLAGS += -fno-rtti -DSCRATCH_MEM_Z_SIZE=0x10000 $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + + INCLUDES+= \ + -I $(MAKEFILE_DIR)/downloads/$(WEI_SDK_NAME) \ + -I $(MAKEFILE_DIR)/downloads/kissfft + + GENERATED_PROJECT_INCLUDES += \ + -I. 
\ + -I./third_party/kissfft + + LDFLAGS += \ + -Hheap=8192 \ + -tcf=$(TCF_FILE_NAME) \ + -Hnocopyr \ + -m \ + -Hldopt=-Coutput=$(TARGET).map \ + $(LCF_FILE_NAME) \ + -Hldopt=-Bgrouplib $(ARCLIB_FILE_NAME) + + CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + +endif diff --git a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ new file mode 100644 index 00000000000..733f258fbbb --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ @@ -0,0 +1,93 @@ +# Settings for himax WE_1 evb. +ifeq ($(TARGET), himax_we1_evb) + + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL := ccac + TARGET_ARCH := arc + #ARC_TOOLCHAIN := mwdt + + BUILD_ARC_MLI := false + ARC_MLI_PRE_COMPILED_TARGET := himax_arcem9d_r16 + +include $(MAKEFILE_DIR)/targets/arc/arc_common.inc + #download SDK & MLI + HIMAX_WE1_SDK_NAME := himax_we1_sdk + #MLI_LIB_DIR = arc_mli_package + #MLI_LIB_DIR = arc_mli_package + #$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) + $(eval $(call add_third_party_download,$(HIMAX_WE1_SDK_URL),$(HIMAX_WE1_SDK_MD5),$(HIMAX_WE1_SDK_NAME),)) + + #export path of toolchain + #export PATH := $(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/image_gen_linux_v3/:$(PATH) + + TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/arcem9d_wei_r16.tcf + LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/memory.lcf + ARCLIB_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/libembarc.a + LIB_HEADER_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/hx_drv_tflm.h + + + DEFAULT_HEAPSZ := 8192 + DEFAULT_STACKSZ := 8192 + + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE_NAME)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) + + LCF_FILE_NAME = $(notdir $(LCF_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(LCF_FILE_NAME) + + ARCLIB_FILE_NAME = $(notdir $(ARCLIB_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(ARCLIB_FILE)) + MAKE_PROJECT_FILES += $(ARCLIB_FILE_NAME) + + LIB_HEADER_FILE_NAME = $(notdir $(LIB_HEADER_FILE)) + ARC_TARGET_FILES_DIRS += $(dir $(LIB_HEADER_FILE)) + MAKE_PROJECT_FILES += $(LIB_HEADER_FILE_NAME) + + + + # Need a pointer to the TCF and lcf file + + PLATFORM_FLAGS = \ + -DNDEBUG \ + -g \ + -DCPU_ARC \ + -Hnosdata \ + -DTF_LITE_STATIC_MEMORY \ + -tcf=$(TCF_FILE_NAME) \ + -Hnocopyr \ + -Hpurge \ + -Hcl \ + -fslp-vectorize-aggressive \ + -ffunction-sections \ + -fdata-sections \ + -tcf_core_config \ + + CXXFLAGS += -fno-rtti -DSCRATCH_MEM_Z_SIZE=0x10000 $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + + INCLUDES+= \ + -I $(MAKEFILE_DIR)/downloads/$(WEI_SDK_NAME) \ + -I $(MAKEFILE_DIR)/downloads/kissfft + + GENERATED_PROJECT_INCLUDES += \ + -I. 
\ + -I./third_party/kissfft + + LDFLAGS += \ + -Hheap=8192 \ + -tcf=$(TCF_FILE_NAME) \ + -Hnocopyr \ + -m \ + -Hldopt=-Coutput=$(TARGET).map \ + $(LCF_FILE_NAME) \ + -Hldopt=-Bgrouplib $(ARCLIB_FILE_NAME) + + CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + +endif diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 806501a004a..75a51e0df10 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -80,3 +80,7 @@ EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" +HIMAX_WE1_SDK_URL ="https://www.himax.com.tw/we-i/himax_we1_sdk_v02.zip" +HIMAX_WE1_SDK_MD5 ="9a4b2f29b16052764e437b64bdcba816" + + diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ new file mode 100644 index 00000000000..3c7ee1b64d2 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ @@ -0,0 +1,86 @@ +# Add URLs and MD5 checksums for third-party libraries here. + +GEMMLOWP_URL := "https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip" +GEMMLOWP_MD5 := "7e8191b24853d75de2af87622ad293ba" + +ifeq ($(HOST_OS),windows) + FLATBUFFERS_URL := "https://github.com/google/flatbuffers/archive/v1.12.0.zip" + FLATBUFFERS_MD5 := "a1afdbf114dec01a861c1b8c917d0fc7" +else + FLATBUFFERS_URL := "https://github.com/google/flatbuffers/archive/v1.12.0.tar.gz" + FLATBUFFERS_MD5 := "c62ffefb3d4548b127cca14ce047f16c" +endif + +ifeq ($(HOST_OS),osx) + GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-mac.tar.bz2" + GCC_EMBEDDED_MD5 := "a66be9828cf3c57d7d21178e07cd8904" +else ifeq ($(HOST_OS),windows) + GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-win32.zip" + GCC_EMBEDDED_MD5 := "bc8ae26d7c429f30d583a605a4bcf9bc" +else + GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2" + GCC_EMBEDDED_MD5 := "299ebd3f1c2c90930d28ab82e5d8d6c0" +endif + +LEON_BCC2_URL := "https://www.gaisler.com/anonftp/bcc2/bin/bcc-2.0.7-gcc-linux64.tar.xz" +LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765" + +TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz" +TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f" + +CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/1150e71e07c79b538efd842aba5b210a31827ae5.zip" +CMSIS_MD5 := "e05f4222ef58825193910b41a0871dcb" + +AM_SDK_URL := "http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.2.0.zip" +AM_SDK_MD5 := "7605fa2d4d97e6bb7a1190c92b66b597" +AM_SDK_DEST := AmbiqSuite-Rel2.2.0 + +SF_BSPS_URL := "https://github.com/sparkfun/SparkFun_Apollo3_AmbiqSuite_BSPs/archive/v0.0.7.zip" +SF_BSPS_MD5 := "34199f7e754735661d1c8a70a40ca7a3" +SF_BSPS_DEST := boards_sfe + +STM32_BARE_LIB_URL := "https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip" +STM32_BARE_LIB_MD5 := "282bff40d4d0b92278fd123a3b6e3123" + +ifeq ($(HOST_OS),osx) + RISCV_TOOLCHAIN_URL := 
"https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-8.1.0-2019.01.0-x86_64-apple-darwin.tar.gz" + RISCV_TOOLCHAIN_MD5 := "2ac2fa00618b9ab7fa0c7d0ec173de94" +else + RISCV_TOOLCHAIN_URL := "https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz" + RISCV_TOOLCHAIN_MD5="2366b7afe36a54dc94fb0ff8a0830934" +endif + +SIFIVE_FE310_LIB_URL := "https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip" +SIFIVE_FE310_LIB_MD5 := "06ee24c4956f8e21670ab3395861fe64" + +KISSFFT_URL="https://github.com/mborgerding/kissfft/archive/v130.zip" +KISSFFT_MD5="438ba1fef5783cc5f5f201395cc477ca" + +RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" +RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" + +CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" +CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" + +IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_4_14.zip" +IMAGE_RECOGNITION_MODEL_MD5 := "2b886156e7ef4d6e53d0f1a4bc800e56" + +PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2019_11_21.zip" +PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" + +PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" +PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" + +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" +EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" + +EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" +EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" + +XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" +XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" + +HIMAX_WE1_SDK_URL ="https://www.himax.com.tw/we-i/himax_we1_sdk_v02.zip" +HIMAX_WE1_SDK_MD5 ="5063c24d298fbcfe118163f3ccc43079" + + From 96eb311826d68179bf85a228e294ae55f39ef2e4 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Thu, 4 Jun 2020 18:08:43 +0800 Subject: [PATCH 0080/1390] TFLM: update example readme --- .../lite/micro/examples/hello_world/README.md | 36 +- .../micro/examples/hello_world/README.md~ | 2 +- .../hello_world/himax_we1_evb/constants.cc | 19 - .../himax_we1_evb/output_handler.cc | 35 -- .../himax_we1_evb/output_handler.cc~ | 53 -- .../person_detection_experimental/README.md | 33 +- .../person_detection_experimental/README.md~ | 568 ++++++++++++++++++ .../tools/make/third_party_downloads.inc~ | 86 --- 8 files changed, 585 insertions(+), 247 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc delete mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc delete mode 100644 tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/README.md~ delete mode 100644 tensorflow/lite/micro/tools/make/third_party_downloads.inc~ diff --git a/tensorflow/lite/micro/examples/hello_world/README.md b/tensorflow/lite/micro/examples/hello_world/README.md index 9c0a5e2306a..d3762ada790 
100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md +++ b/tensorflow/lite/micro/examples/hello_world/README.md @@ -14,34 +14,14 @@ of the device. ## Table of contents -- [Hello World Example](#hello-world-example) - - [Table of contents](#table-of-contents) - - [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - - [Initial Setup](#initial-setup) - - [Generate Example Project](#generate-example-project) - - [Build and Run Example](#build-and-run-example) - - [Deploy to Arduino](#deploy-to-arduino) - - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) - - [Load and run the example](#load-and-run-the-example) - - [Deploy to ESP32](#deploy-to-esp32) - - [Install the ESP IDF](#install-the-esp-idf) - - [Generate the examples](#generate-the-examples) - - [Building the example](#building-the-example) - - [Load and run the example](#load-and-run-the-example-1) - - [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) - - [Initial Setup](#initial-setup-1) - - [MetaWare Development Toolkit](#metaware-development-toolkit) - - [Make Tool version](#make-tool-version) - - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) - - [Generate Example Project](#generate-example-project-1) - - [Build and Burn Example](#build-and-burn-example) - - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) - - [Compile the binary](#compile-the-binary) - - [Sign the binary](#sign-the-binary) - - [Flash the binary](#flash-the-binary) - - [Deploy to STM32F746](#deploy-to-stm32f746) - - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - - [Train your own model](#train-your-own-model) +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) +- [Deploy to Arduino](#deploy-to-arduino) +- [Deploy to ESP32](#deploy-to-esp32) +- [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) +- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) +- [Deploy to STM32F746](#deploy-to-STM32F746) +- [Run the tests on a development machine](#run-the-tests-on-a-development-machine) +- [Train your own model](#train-your-own-model) ## Deploy to ARC EM SDP diff --git a/tensorflow/lite/micro/examples/hello_world/README.md~ b/tensorflow/lite/micro/examples/hello_world/README.md~ index 011711493d5..9c0a5e2306a 100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md~ +++ b/tensorflow/lite/micro/examples/hello_world/README.md~ @@ -260,7 +260,7 @@ make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_par Generate hello world project ``` -make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb +make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb TAGS=no_arc_mli ``` ### Build and Burn Example diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc deleted file mode 100644 index 1816a2f3207..00000000000 --- a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/constants.cc +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/examples/hello_world/constants.h" - -// This is tuned so that a full cycle takes ~4 seconds on a SparkFun Edge. -const int kInferencesPerCycle = 1000; diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc deleted file mode 100644 index 8ca028acc55..00000000000 --- a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/examples/hello_world/output_handler.h" - - -/* -This function trigger different device's LEDthrough y value. -y value range -1 <= y <= 1. -| Range is from -1~1 | LEDs | -| 0 <= y <= 1 | [ 0 1 ] | -| -1 <= y < 0 | [ 1 0 ] | - -*/ -void HandleOutput(tflite::ErrorReporter* error_reporter, float x_value, - float y_value) { - // The first time this method runs, set up our LEDs correctly - - // Log the current X and Y values - TF_LITE_REPORT_ERROR(error_reporter, "x_value: %f, y_value: %f\n", - static_cast(x_value), - static_cast(y_value)); -} diff --git a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ b/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ deleted file mode 100644 index b59242d0b6f..00000000000 --- a/tensorflow/lite/micro/examples/hello_world/himax_we1_evb/output_handler.cc~ +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/examples/hello_world/output_handler.h" - - -/* -This function trigger different device's LEDthrough y value. -y value range -1 <= y <= 1. 
-| Range is from -1~1 | LEDs | -| 0 <= y <= 1 | [ 0 1 ] | -| -1 <= y < 0 | [ 1 0 ] | - -*/ -void HandleOutput(tflite::ErrorReporter* error_reporter, float x_value, - float y_value) { - // The first time this method runs, set up our LEDs correctly -/* static bool is_initialized = false; - if (!is_initialized) { - // TODO Setup LED's as outputs - - // end of setup - is_initialized = true; - } - - // Set the LEDs to represent negative values - if (y_value < 0) { - //enable LED1 - - //enable LED0 - } else if (y_value > 0) { - //enable LED0 - - //enable LED1 - } - */ - // Log the current X and Y values - TF_LITE_REPORT_ERROR(error_reporter, "x_value: %f, y_value: %f\n", - static_cast(x_value), - static_cast(y_value)); -} diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index 4d53e551431..06f5640986f 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -7,31 +7,14 @@ This uses the experimental int8 quantized version of the person detection model. ## Table of contents -- [Person detection example](#person-detection-example) - - [Table of contents](#table-of-contents) - - [Running on ARC EM SDP](#running-on-arc-em-sdp) - - [Initial setup](#initial-setup) - - [Generate Example Project](#generate-example-project) - - [Build and Run Example](#build-and-run-example) - - [Running on Arduino](#running-on-arduino) - - [Hardware](#hardware) - - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) - - [Install other libraries](#install-other-libraries) - - [Load and run the example](#load-and-run-the-example) - - [Running on HIMAX WE1 EVB](#running-on-himax-we1-evb) - - [Initial Setup](#initial-setup) - - [MetaWare Development Toolkit](#metaware-development-toolkit) - - [Make Tool version](#make-tool-version) - - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) - - [Generate Example Project](#generate-example-project-1) - - [Build and Burn Example](#build-and-burn-example) - - [Running on SparkFun Edge](#running-on-sparkfun-edge) - - [Compile the binary](#compile-the-binary) - - [Sign the binary](#sign-the-binary) - - [Flash the binary](#flash-the-binary) - - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - - [Debugging image capture](#debugging-image-capture) - - [Training your own model](#training-your-own-model) +- [Getting started](#getting-started) +- [Running on ARC EM SDP](#running-on-arc-em-sdp) +- [Running on Arduino](#running-on-arduino) +- [Running on HIMAX WE1 EVB](#running-on-himax-we1-evb) +- [Running on SparkFun Edge](#running-on-sparkfun-edge) +- [Run the tests on a development machine](#run-the-tests-on-a-development-machine) +- [Debugging image capture](#debugging-image-capture) +- [Training your own model](#training-your-own-model) ## Running on ARC EM SDP diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ new file mode 100644 index 00000000000..4d53e551431 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ @@ -0,0 +1,568 @@ +# Person detection example + +This example shows how you can use Tensorflow Lite to run a 250 kilobyte neural +network to recognize people in images captured by a camera. 
It is designed to +run on systems with small amounts of memory such as microcontrollers and DSPs. +This uses the experimental int8 quantized version of the person detection model. + +## Table of contents + +- [Person detection example](#person-detection-example) + - [Table of contents](#table-of-contents) + - [Running on ARC EM SDP](#running-on-arc-em-sdp) + - [Initial setup](#initial-setup) + - [Generate Example Project](#generate-example-project) + - [Build and Run Example](#build-and-run-example) + - [Running on Arduino](#running-on-arduino) + - [Hardware](#hardware) + - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) + - [Install other libraries](#install-other-libraries) + - [Load and run the example](#load-and-run-the-example) + - [Running on HIMAX WE1 EVB](#running-on-himax-we1-evb) + - [Initial Setup](#initial-setup) + - [MetaWare Development Toolkit](#metaware-development-toolkit) + - [Make Tool version](#make-tool-version) + - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) + - [Generate Example Project](#generate-example-project-1) + - [Build and Burn Example](#build-and-burn-example) + - [Running on SparkFun Edge](#running-on-sparkfun-edge) + - [Compile the binary](#compile-the-binary) + - [Sign the binary](#sign-the-binary) + - [Flash the binary](#flash-the-binary) + - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) + - [Debugging image capture](#debugging-image-capture) + - [Training your own model](#training-your-own-model) + +## Running on ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) +board. General information and instructions on using the board with TensorFlow +Lite Micro can be found in the common +[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example uses asymmetric int8 quantization and can therefore leverage +optimized int8 kernels from the embARC MLI library + +The ARC EM SDP board contains a rich set of extension interfaces. You can choose +any compatible camera and modify +[image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc) +file accordingly to use input from your specific camera. By default, results of +running this example are printed to the console. If you would like to instead +implement some target-specific actions, you need to modify +[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc) +accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the +[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) +to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following +command: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project +``` + +### Build and Run Example + +For more detailed information on building and running examples see the +appropriate sections of general descriptions of the +[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). 
+In the directory with generated project you can also find a +*README_ARC_EMSDP.md* file with instructions and options on building and +running. Here we only briefly mention main steps which are typically enough to +get it started. + +1. You need to + [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) + and open an serial connection. + +2. Go to the generated example project director + + ``` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make + ``` + +3. Build the example using + + ``` + make app + ``` + +4. To generate artefacts for self-boot of example from the board use + + ``` + make flash + ``` + +5. To run application from the board using microSD card: + + * Copy the content of the created /bin folder into the root of microSD + card. Note that the card must be formatted as FAT32 with default cluster + size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG + button. + +6. If you have the MetaWare Debugger installed in your environment: + + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial +terminal. + +## Running on Arduino + +The following instructions will help you build and deploy this sample +to [Arduino](https://www.arduino.cc/) devices. + +The sample has been tested with the following device: + +- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) + +You will also need the following camera module: + +- [Arducam Mini 2MP Plus](https://www.amazon.com/Arducam-Module-Megapixels-Arduino-Mega2560/dp/B012UXNDOY) + +### Hardware + +Connect the Arducam pins as follows: + +|Arducam pin name|Arduino pin name| +|----------------|----------------| +|CS|D7 (unlabelled, immediately to the right of D6)| +|MOSI|D11| +|MISO|D12| +|SCK|D13| +|GND|GND (either pin marked GND is fine)| +|VCC|3.3 V| +|SDA|A4| +|SCL|A5| + +### Install the Arduino_TensorFlowLite library + +Download the current nightly build of the library: +[person_detection.zip](https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip) + +This example application is included as part of the official TensorFlow Lite +Arduino library. To install it, open the Arduino library manager in +`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. + +### Install other libraries + +In addition to the TensorFlow library, you'll also need to install two +libraries: + +* The Arducam library, so our code can interface with the hardware +* The JPEGDecoder library, so we can decode JPEG-encoded images + +The Arducam Arduino library is available from GitHub at +[https://github.com/ArduCAM/Arduino](https://github.com/ArduCAM/Arduino). +To install it, download or clone the repository. Next, copy its `ArduCAM` +subdirectory into your `Arduino/libraries` directory. To find this directory on +your machine, check the *Sketchbook location* in the Arduino IDE's +*Preferences* window. + +After downloading the library, you'll need to edit one of its files to make sure +it is configured for the Arducam Mini 2MP Plus. To do so, open the following +file: + +``` +Arduino/libraries/ArduCAM/memorysaver.h +``` + +You'll see a bunch of `#define` statements listed. 
Make sure that they are all +commented out, except for `#define OV2640_MINI_2MP_PLUS`, as so: + +``` +//Step 1: select the hardware platform, only one at a time +//#define OV2640_MINI_2MP +//#define OV3640_MINI_3MP +//#define OV5642_MINI_5MP +//#define OV5642_MINI_5MP_BIT_ROTATION_FIXED +#define OV2640_MINI_2MP_PLUS +//#define OV5642_MINI_5MP_PLUS +//#define OV5640_MINI_5MP_PLUS +``` + +Once you save the file, we're done configuring the Arducam library. + +Our next step is to install the JPEGDecoder library. We can do this from within +the Arduino IDE. First, go to the *Manage Libraries...* option in the *Tools* +menu and search for `JPEGDecoder`. You should install version _1.8.0_ of the +library. + +Once the library has installed, we'll need to configure it to disable some +optional components that are not compatible with the Arduino Nano 33 BLE Sense. +Open the following file: + +``` +Arduino/libraries/JPEGDecoder/src/User_Config.h +``` + +Make sure that both `#define LOAD_SD_LIBRARY` and `#define LOAD_SDFAT_LIBRARY` +are commented out, as shown in this excerpt from the file: + +```c++ +// Comment out the next #defines if you are not using an SD Card to store the JPEGs +// Commenting out the line is NOT essential but will save some FLASH space if +// SD Card access is not needed. Note: use of SdFat is currently untested! + +//#define LOAD_SD_LIBRARY // Default SD Card library +//#define LOAD_SDFAT_LIBRARY // Use SdFat library instead, so SD Card SPI can be bit bashed +``` + +Once you've saved the file, you are done installing libraries. + +### Load and run the example + +Go to `File -> Examples`. You should see an +example near the bottom of the list named `TensorFlowLite`. Select +it and click `person_detection` to load the example. Connect your device, then +build and upload the example. + +To test the camera, start by pointing the device's camera at something that is +definitely not a person, or just covering it up. The next time the blue LED +flashes, the device will capture a frame from the camera and begin to run +inference. Since the vision model we are using for person detection is +relatively large, it takes a long time to run inference—around 19 seconds at the +time of writing, though it's possible TensorFlow Lite has gotten faster since +then. + +After 19 seconds or so, the inference result will be translated into another LED +being lit. Since you pointed the camera at something that isn't a person, the +red LED should light up. + +Now, try pointing the device's camera at yourself! The next time the blue LED +flashes, the device will capture another image and begin to run inference. After +19 seconds, the green LED should light up! + +Remember, image data is captured as a snapshot before each inference, whenever +the blue LED flashes. Whatever the camera is pointed at during that moment is +what will be fed into the model. It doesn't matter where the camera is pointed +until the next time an image is captured, when the blue LED will flash again. + +If you're getting seemingly incorrect results, make sure you are in an +environment with good lighting. You should also make sure that the camera is +oriented correctly, with the pins pointing downwards, so that the images it +captures are the right way up—the model was not trained to recognize upside-down +people! In addition, it's good to remember that this is a tiny model, which +trades accuracy for small size. It works very well, but it isn't accurate 100% +of the time. 
+
+We can also see the results of inference via the Arduino Serial Monitor. To do
+this, open the *Serial Monitor* from the *Tools* menu. You'll see a detailed
+log of what is happening while our application runs. It's also interesting to
+check the *Show timestamp* box, so you can see how long each part of the process
+takes:
+
+```
+14:17:50.714 -> Starting capture
+14:17:50.714 -> Image captured
+14:17:50.784 -> Reading 3080 bytes from ArduCAM
+14:17:50.887 -> Finished reading
+14:17:50.887 -> Decoding JPEG and converting to greyscale
+14:17:51.074 -> Image decoded and processed
+14:18:09.710 -> Person score: 246 No person score: 66
+```
+
+From the log, we can see that it took around 170 ms to capture and read the
+image data from the camera module, 180 ms to decode the JPEG and convert it to
+greyscale, and 18.6 seconds to run inference.
+
+## Running on HIMAX WE1 EVB
+
+The following instructions will help you build and deploy this example to the
+[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief)
+board. To understand more about using this board, please check the
+[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide).
+
+### Initial Setup
+
+To use the HIMAX WE1 EVB, please make sure the following software is installed:
+
+#### MetaWare Development Toolkit
+
+See the
+[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit)
+section for instructions on toolchain installation.
+
+#### Make Tool version
+
+A `'make'` tool is required for deploying TensorFlow Lite Micro
+applications on the HIMAX WE1 EVB. See the
+[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool)
+section for setting up a proper environment.
+
+#### Serial Terminal Emulation Application
+
+The HIMAX WE1 EVB Debug UART port serves two main purposes:
+
+- printing application output
+- burning the application to flash by sending the application binary over xmodem
+
+You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)).
+
+
+### Generate Example Project
+
+The example project for the HIMAX WE1 EVB platform can be generated with the
+following commands.
+
+Download the related third-party data:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads
+```
+
+Generate the person detection project:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_int8_make_project TARGET=himax_we1_evb
+```
+
+### Build and Burn Example
+
+Follow these steps to run the person detection example on the HIMAX WE1 EVB
+platform.
+
+1. Go to the generated example project directory.
+
+   ```
+   cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/person_detection_int8/make
+   ```
+
+2. Build the example using
+
+   ```
+   make app
+   ```
+
+3. After the example build finishes, copy the ELF file and map file to the
+   image generation tool directory, located at
+   `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'`
+
+   ```
+   cp person_detection_int8.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/
+   ```
+
+4. Go to the flash image generation tool directory.
+
+   ```
+   cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/
+   ```
+
+5. Run the image generation tool to generate the flash image file. 
+
+   * Before running the image generation tool, type `sudo chmod +x image_gen`
+     and `sudo chmod +x sign_tool` to make sure both tools are executable.
+
+   ```
+   image_gen -e person_detection_int8.elf -m himax_we1_evb.map -o out.img
+   ```
+
+
+6. Download the flash image file to the HIMAX WE1 EVB over UART:
+
+   * More details about downloading the image over UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update)
+
+After these steps, press the reset button on the HIMAX WE1 EVB and you will see
+the application output in the serial terminal.
+
+## Running on SparkFun Edge
+
+The following instructions will help you build and deploy this sample on the
+[SparkFun Edge development board](https://sparkfun.com/products/15170). This
+sample requires the SparkFun Himax camera for the SparkFun Edge board. It is
+not available for purchase yet.
+
+If you're new to using this board, we recommend walking through the
+[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow)
+codelab to get an understanding of the workflow.
+
+### Compile the binary
+
+The following command will download the required dependencies and then compile a
+binary for the SparkFun Edge:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge person_detection_bin
+```
+
+The binary will be created in the following location:
+
+```
+tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/person_detection.bin
+```
+
+### Sign the binary
+
+The binary must be signed with cryptographic keys to be deployed to the device.
+We'll now run some commands that will sign our binary so it can be flashed to
+the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is
+downloaded when the `Makefile` is run.
+
+Enter the following command to set up some dummy cryptographic keys we can use
+for development:
+
+```
+cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/keys_info0.py \
+tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/keys_info.py
+```
+
+Next, run the following command to create a signed binary:
+
+```
+python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/create_cust_image_blob.py \
+--bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/person_detection.bin \
+--load-address 0xC000 \
+--magic-num 0xCB \
+-o main_nonsecure_ota \
+--version 0x0
+```
+
+This will create the file `main_nonsecure_ota.bin`. We'll now run another
+command to create a final version of the file that can be used to flash our
+device with the bootloader script we will use in the next step:
+
+```
+python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \
+--load-address 0x20000 \
+--bin main_nonsecure_ota.bin \
+-i 6 \
+-o main_nonsecure_wire \
+--options 0x1
+```
+
+You should now have a file called `main_nonsecure_wire.bin` in the directory
+where you ran the commands. This is the file we'll be flashing to the device.
+
+### Flash the binary
+
+Next, attach the board to your computer via a USB-to-serial adapter. 
+
+**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096),
+you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them)
+before you continue.
+
+Once connected, assign the USB device name to an environment variable:
+
+```
+export DEVICENAME=put your device name here
+```
+
+Set another variable with the baud rate:
+
+```
+export BAUD_RATE=921600
+```
+
+Now, hold the button marked `14` on the device. While still holding the button,
+hit the button marked `RST`. Continue holding the button marked `14` while
+running the following command:
+
+```
+python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/uart_wired_update.py \
+-b ${BAUD_RATE} ${DEVICENAME} \
+-r 1 \
+-f main_nonsecure_wire.bin \
+-i 6
+```
+
+You should see a long stream of output as the binary is flashed to the device.
+Once you see the following lines, flashing is complete:
+
+```
+Sending Reset Command.
+Done.
+```
+
+If you don't see these lines, flashing may have failed. Try running through the
+steps in [Flash the binary](#flash-the-binary) again (you can skip over setting
+the environment variables). If you continue to run into problems, follow the
+[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow)
+codelab, which includes more comprehensive instructions for the flashing
+process.
+
+The binary should now be deployed to the device. Hit the button marked `RST` to
+reboot the board. You should see the device's four LEDs flashing in sequence.
+
+Debug information is logged by the board while the program is running. To view
+it, establish a serial connection to the board using a baud rate of `115200`.
+On macOS and Linux, the following command should work:
+
+```
+screen ${DEVICENAME} 115200
+```
+
+To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately
+followed by the `K` key, then hit the `Y` key.
+
+## Run the tests on a development machine
+
+To compile and test this example on a desktop Linux or macOS machine, download
+[the TensorFlow source code](https://github.com/tensorflow/tensorflow), `cd`
+into the source directory from a terminal, and then run the following command:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile
+```
+
+This will take a few minutes and downloads frameworks the code uses, such as
+[CMSIS](https://developer.arm.com/embedded/cmsis) and
+[flatbuffers](https://google.github.io/flatbuffers/). Once that process has
+finished, run:
+
+```
+make -f tensorflow/lite/micro/tools/make/Makefile test_person_detection_test
+```
+
+You should see a series of files get compiled, followed by some logging output
+from a test, which should conclude with `~~~ALL TESTS PASSED~~~`. If you see
+this, it means that a small program has been built and run that loads a trained
+TensorFlow model, runs some example images through it, and gets the expected
+outputs. This particular test runs images both with and without a person in
+them, and checks that the network correctly identifies them.
+
+To understand how TensorFlow Lite does this, you can look at the `TestInvoke()`
+function in
+[person_detection_test.cc](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc). 
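+
+In outline, the test does something like the following. This is a condensed,
+illustrative sketch rather than a copy of the checked-in test: the helper name
+`RunPersonDetection`, the arena size, and the exact header paths and resolver
+class reflect one 2020-era layout of the TFLM sources and may differ in your
+checkout.
+
+```c++
+#include <cstdint>
+
+#include "tensorflow/lite/micro/all_ops_resolver.h"
+#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h"
+#include "tensorflow/lite/micro/examples/person_detection_experimental/person_detect_model_data.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+#include "tensorflow/lite/micro/micro_interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+// Working memory for the interpreter; the exact size is model-dependent.
+constexpr int kTensorArenaSize = 125 * 1024;
+static uint8_t tensor_arena[kTensorArenaSize];
+
+// Runs one image through the person detection model and returns the
+// quantized "person" score from the output tensor.
+int8_t RunPersonDetection(const int8_t* image, size_t image_len) {
+  static tflite::MicroErrorReporter micro_error_reporter;
+
+  // Get a handle to the model data that has been compiled into the program.
+  const tflite::Model* model = tflite::GetModel(g_person_detect_model_data);
+
+  // Build an interpreter with the supported operators and the arena above.
+  static tflite::AllOpsResolver resolver;
+  tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
+                                       kTensorArenaSize, &micro_error_reporter);
+  interpreter.AllocateTensors();
+
+  // Copy the sample image into the input tensor and run inference.
+  TfLiteTensor* input = interpreter.input(0);
+  for (size_t i = 0; i < image_len && i < input->bytes; ++i) {
+    input->data.int8[i] = image[i];
+  }
+  interpreter.Invoke();
+
+  // The output holds one score per category.
+  TfLiteTensor* output = interpreter.output(0);
+  return output->data.int8[kPersonIndex];
+}
+```
+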
+It's a fairly small amount of code, creating an interpreter, getting a handle to +a model that's been compiled into the program, and then invoking the interpreter +with the model and sample inputs. + +## Debugging image capture +When the sample is running, check the LEDs to determine whether the inference is +running correctly. If the red light is stuck on, it means there was an error +communicating with the camera. This is likely due to an incorrectly connected +or broken camera. + +During inference, the blue LED will toggle every time inference is complete. The +orange LED indicates that no person was found, and the green LED indicates a +person was found. The red LED should never turn on, since it indicates an error. + +In order to view the captured image, set the DUMP_IMAGE define in main.cc.  This +causes the board to log raw image info to the console. After the board has been +flashed and reset, dump the log to a text file: + + +``` +screen -L -Logfile ${DEVICENAME} 115200 +``` + +Next, run the raw to bitmap converter to view captured images: + +``` +python3 raw_to_bitmap.py -r GRAY -i +``` + +## Training your own model + +You can train your own model with some easy-to-use scripts. See +[training_a_model.md](training_a_model.md) for instructions. diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ deleted file mode 100644 index 3c7ee1b64d2..00000000000 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc~ +++ /dev/null @@ -1,86 +0,0 @@ -# Add URLs and MD5 checksums for third-party libraries here. - -GEMMLOWP_URL := "https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip" -GEMMLOWP_MD5 := "7e8191b24853d75de2af87622ad293ba" - -ifeq ($(HOST_OS),windows) - FLATBUFFERS_URL := "https://github.com/google/flatbuffers/archive/v1.12.0.zip" - FLATBUFFERS_MD5 := "a1afdbf114dec01a861c1b8c917d0fc7" -else - FLATBUFFERS_URL := "https://github.com/google/flatbuffers/archive/v1.12.0.tar.gz" - FLATBUFFERS_MD5 := "c62ffefb3d4548b127cca14ce047f16c" -endif - -ifeq ($(HOST_OS),osx) - GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-mac.tar.bz2" - GCC_EMBEDDED_MD5 := "a66be9828cf3c57d7d21178e07cd8904" -else ifeq ($(HOST_OS),windows) - GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-win32.zip" - GCC_EMBEDDED_MD5 := "bc8ae26d7c429f30d583a605a4bcf9bc" -else - GCC_EMBEDDED_URL := "https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2" - GCC_EMBEDDED_MD5 := "299ebd3f1c2c90930d28ab82e5d8d6c0" -endif - -LEON_BCC2_URL := "https://www.gaisler.com/anonftp/bcc2/bin/bcc-2.0.7-gcc-linux64.tar.xz" -LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765" - -TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz" -TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f" - -CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/1150e71e07c79b538efd842aba5b210a31827ae5.zip" -CMSIS_MD5 := "e05f4222ef58825193910b41a0871dcb" - -AM_SDK_URL := "http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.2.0.zip" -AM_SDK_MD5 := "7605fa2d4d97e6bb7a1190c92b66b597" -AM_SDK_DEST := AmbiqSuite-Rel2.2.0 - -SF_BSPS_URL := "https://github.com/sparkfun/SparkFun_Apollo3_AmbiqSuite_BSPs/archive/v0.0.7.zip" -SF_BSPS_MD5 := "34199f7e754735661d1c8a70a40ca7a3" -SF_BSPS_DEST := boards_sfe - 
-STM32_BARE_LIB_URL := "https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip" -STM32_BARE_LIB_MD5 := "282bff40d4d0b92278fd123a3b6e3123" - -ifeq ($(HOST_OS),osx) - RISCV_TOOLCHAIN_URL := "https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-8.1.0-2019.01.0-x86_64-apple-darwin.tar.gz" - RISCV_TOOLCHAIN_MD5 := "2ac2fa00618b9ab7fa0c7d0ec173de94" -else - RISCV_TOOLCHAIN_URL := "https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz" - RISCV_TOOLCHAIN_MD5="2366b7afe36a54dc94fb0ff8a0830934" -endif - -SIFIVE_FE310_LIB_URL := "https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip" -SIFIVE_FE310_LIB_MD5 := "06ee24c4956f8e21670ab3395861fe64" - -KISSFFT_URL="https://github.com/mborgerding/kissfft/archive/v130.zip" -KISSFFT_MD5="438ba1fef5783cc5f5f201395cc477ca" - -RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" -RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" - -CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" -CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" - -IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_4_14.zip" -IMAGE_RECOGNITION_MODEL_MD5 := "2b886156e7ef4d6e53d0f1a4bc800e56" - -PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2019_11_21.zip" -PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" - -PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" -PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" - -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" -EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" - -EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" -EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" - -XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" -XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" - -HIMAX_WE1_SDK_URL ="https://www.himax.com.tw/we-i/himax_we1_sdk_v02.zip" -HIMAX_WE1_SDK_MD5 ="5063c24d298fbcfe118163f3ccc43079" - - From 233f1d53f829f157788ed687c07698b12cc8e091 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Thu, 4 Jun 2020 18:16:32 +0800 Subject: [PATCH 0081/1390] remove temp readme in example directory --- .../micro/examples/hello_world/README.md~ | 595 ------------------ .../person_detection_experimental/README.md~ | 568 ----------------- 2 files changed, 1163 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/hello_world/README.md~ delete mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/README.md~ diff --git a/tensorflow/lite/micro/examples/hello_world/README.md~ b/tensorflow/lite/micro/examples/hello_world/README.md~ deleted file mode 100644 index 9c0a5e2306a..00000000000 --- a/tensorflow/lite/micro/examples/hello_world/README.md~ +++ /dev/null @@ -1,595 +0,0 @@ -# Hello World Example - -This example is designed to demonstrate the absolute basics of using [TensorFlow -Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers). 
-It includes the full end-to-end workflow of training a model, converting it for -use with TensorFlow Lite for Microcontrollers for running inference on a -microcontroller. - -The model is trained to replicate a `sine` function and generates a pattern of -data to either blink LEDs or control an animation, depending on the capabilities -of the device. - -![Animation on STM32F746](images/animation_on_STM32F746.gif) - -## Table of contents - -- [Hello World Example](#hello-world-example) - - [Table of contents](#table-of-contents) - - [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - - [Initial Setup](#initial-setup) - - [Generate Example Project](#generate-example-project) - - [Build and Run Example](#build-and-run-example) - - [Deploy to Arduino](#deploy-to-arduino) - - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) - - [Load and run the example](#load-and-run-the-example) - - [Deploy to ESP32](#deploy-to-esp32) - - [Install the ESP IDF](#install-the-esp-idf) - - [Generate the examples](#generate-the-examples) - - [Building the example](#building-the-example) - - [Load and run the example](#load-and-run-the-example-1) - - [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) - - [Initial Setup](#initial-setup-1) - - [MetaWare Development Toolkit](#metaware-development-toolkit) - - [Make Tool version](#make-tool-version) - - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) - - [Generate Example Project](#generate-example-project-1) - - [Build and Burn Example](#build-and-burn-example) - - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) - - [Compile the binary](#compile-the-binary) - - [Sign the binary](#sign-the-binary) - - [Flash the binary](#flash-the-binary) - - [Deploy to STM32F746](#deploy-to-stm32f746) - - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - - [Train your own model](#train-your-own-model) - -## Deploy to ARC EM SDP - -The following instructions will help you to build and deploy this example to -[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) -board. General information and instructions on using the board with TensorFlow -Lite Micro can be found in the common -[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). - -### Initial Setup - -Follow the instructions on the -[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) -to get and install all required tools for work with ARC EM SDP. - -### Generate Example Project - -The example project for ARC EM SDP platform can be generated with the following -command: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project -``` - -### Build and Run Example - -For more detailed information on building and running examples see the -appropriate sections of general descriptions of the -[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). -In the directory with generated project you can also find a -*README_ARC_EMSDP.md* file with instructions and options on building and -running. Here we only briefly mention main steps which are typically enough to -get it started. - -1. You need to - [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) - and open an serial connection. - -2. 
Go to the generated example project director - - ``` - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make - ``` - -3. Build the example using - - ``` - make app - ``` - -4. To generate artefacts for self-boot of example from the board use - - ``` - make flash - ``` - -5. To run application from the board using microSD card: - - * Copy the content of the created /bin folder into the root of microSD - card. Note that the card must be formatted as FAT32 with default cluster - size (but less than 32 Kbytes) - * Plug in the microSD card into the J11 connector. - * Push the RST button. If a red LED is lit beside RST button, push the CFG - button. - -6. If you have the MetaWare Debugger installed in your environment: - - * To run application from the console using it type `make run`. - * To stop the execution type `Ctrl+C` in the console several times. - -In both cases (step 5 and 6) you will see the application output in the serial -terminal. - -## Deploy to Arduino - -The following instructions will help you build and deploy this sample -to [Arduino](https://www.arduino.cc/) devices. - -![Animation on Arduino MKRZERO](images/animation_on_arduino_mkrzero.gif) - -The sample has been tested with the following devices: - -- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) -- [Arduino MKRZERO](https://store.arduino.cc/usa/arduino-mkrzero) - -The sample will use PWM to fade an LED on and off according to the model's -output. In the code, the `LED_BUILTIN` constant is used to specify the board's -built-in LED as the one being controlled. However, on some boards, this built-in -LED is not attached to a pin with PWM capabilities. In this case, the LED will -blink instead of fading. - -### Install the Arduino_TensorFlowLite library - -This example application is included as part of the official TensorFlow Lite -Arduino library. To install it, open the Arduino library manager in -`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. - -### Load and run the example - -Once the library has been added, go to `File -> Examples`. You should see an -example near the bottom of the list named `TensorFlowLite:hello_world`. Select -it and click `hello_world` to load the example. - -Use the Arduino IDE to build and upload the example. Once it is running, -you should see the built-in LED on your device flashing. - -The Arduino Desktop IDE includes a plotter that we can use to display the sine -wave graphically. To view it, go to `Tools -> Serial Plotter`. You will see one -datapoint being logged for each inference cycle, expressed as a number between 0 -and 255. - -## Deploy to ESP32 - -The following instructions will help you build and deploy this sample -to [ESP32](https://www.espressif.com/en/products/hardware/esp32/overview) -devices using the [ESP IDF](https://github.com/espressif/esp-idf). - -The sample has been tested on ESP-IDF version 4.0 with the following devices: -- [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html) -- [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md) - -### Install the ESP IDF - -Follow the instructions of the -[ESP-IDF get started guide](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html) -to setup the toolchain and the ESP-IDF itself. 
- -The next steps assume that the -[IDF environment variables are set](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html#step-4-set-up-the-environment-variables) : - - * The `IDF_PATH` environment variable is set - * `idf.py` and Xtensa-esp32 tools (e.g. `xtensa-esp32-elf-gcc`) are in `$PATH` - -### Generate the examples -The example project can be generated with the following command: -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=esp generate_hello_world_esp_project -``` - -### Building the example - -Go the the example project directory -``` -cd tensorflow/lite/micro/tools/make/gen/esp_xtensa-esp32/prj/hello_world/esp-idf -``` - -Then build with `idf.py` -``` -idf.py build -``` - -### Load and run the example - -To flash (replace `/dev/ttyUSB0` with the device serial port): -``` -idf.py --port /dev/ttyUSB0 flash -``` - -Monitor the serial output: -``` -idf.py --port /dev/ttyUSB0 monitor -``` - -Use `Ctrl+]` to exit. - -The previous two commands can be combined: -``` -idf.py --port /dev/ttyUSB0 flash monitor -``` - -## Deploy to himax WE1 EVB - -The following instructions will help you build and deploy this example to -[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) -board. To undstand more about using this board, please check -[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). - -### Initial Setup - -To use the HIMAX WE1 EVB, please make sure following software are installed: - -#### MetaWare Development Toolkit - -See -[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) -section for instructions on toolchain installation. - -#### Make Tool version - -A `'make'` tool is required for deploying Tensorflow Lite Micro -applications on HIMAX WE1 EVB, See -[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) -section for proper environment. - -#### Serial Terminal Emulation Application - -There are 2 main purposes for HIMAX WE1 EVB Debug UART port - -- print application output -- burn application to flash by using xmodem send application binary - -You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). - - -### Generate Example Project - -The example project for HIMAX WE1 EVB platform can be generated with the following -command: - -Download related third party data - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads -``` - -Generate hello world project - -``` -make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb TAGS=no_arc_mli -``` - -### Build and Burn Example - -Following the Steps to run hello world example at HIMAX WE1 EVB platform. - -1. Go to the generated example project directory. - - ``` - cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/hello_world/make - ``` - -2. Build the example using - - ``` - make app - ``` - -3. After example build finish, copy ELF file and map file to image generate tool directory. - image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` - - ``` - cp hello_world.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -4. 
Go to flash image generate tool directory. - - ``` - cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -5. run image generate tool, generate flash image file. - - * Before running image generate tool, by typing `sudo chmod +x image_gen` - and `sudo chmod +x sign_tool` to make sure it is executable. - - ``` - image_gen -e hello_world.elf -m himax_we1_evb.map -o out.img - ``` - - -6. Download flash image file to HIMAX WE1 EVB by UART: - - * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) - -After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial -terminal. - -## Deploy to SparkFun Edge - -The following instructions will help you build and deploy this sample on the -[SparkFun Edge development board](https://sparkfun.com/products/15170). - -![Animation on SparkFun Edge](images/animation_on_sparkfun_edge.gif) - -If you're new to using this board, we recommend walking through the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab to get an understanding of the workflow. - -### Compile the binary - -The following command will download the required dependencies and then compile a -binary for the SparkFun Edge: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge hello_world_bin -``` - -The binary will be created in the following location: - -``` -tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin -``` - -### Sign the binary - -The binary must be signed with cryptographic keys to be deployed to the device. -We'll now run some commands that will sign our binary so it can be flashed to -the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is -downloaded when the `Makefile` is run. - -Enter the following command to set up some dummy cryptographic keys we can use -for development: - -``` -cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info0.py \ -tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info.py -``` - -Next, run the following command to create a signed binary: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_image_blob.py \ ---bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin \ ---load-address 0xC000 \ ---magic-num 0xCB \ --o main_nonsecure_ota \ ---version 0x0 -``` - -This will create the file `main_nonsecure_ota.bin`. We'll now run another -command to create a final version of the file that can be used to flash our -device with the bootloader script we will use in the next step: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \ ---load-address 0x20000 \ ---bin main_nonsecure_ota.bin \ --i 6 \ --o main_nonsecure_wire \ ---options 0x1 -``` - -You should now have a file called `main_nonsecure_wire.bin` in the directory -where you ran the commands. This is the file we'll be flashing to the device. - -### Flash the binary - -Next, attach the board to your computer via a USB-to-serial adapter. 
- -**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096), -you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them) -before you continue. - -Once connected, assign the USB device name to an environment variable: - -``` -export DEVICENAME=put your device name here -``` - -Set another variable with the baud rate: - -``` -export BAUD_RATE=921600 -``` - -Now, hold the button marked `14` on the device. While still holding the button, -hit the button marked `RST`. Continue holding the button marked `14` while -running the following command: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/uart_wired_update.py \ --b ${BAUD_RATE} ${DEVICENAME} \ --r 1 \ --f main_nonsecure_wire.bin \ --i 6 -``` - -You should see a long stream of output as the binary is flashed to the device. -Once you see the following lines, flashing is complete: - -``` -Sending Reset Command. -Done. -``` - -If you don't see these lines, flashing may have failed. Try running through the -steps in [Flash the binary](#flash-the-binary) again (you can skip over setting -the environment variables). If you continue to run into problems, follow the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab, which includes more comprehensive instructions for the flashing -process. - -The binary should now be deployed to the device. Hit the button marked `RST` to -reboot the board. You should see the device's four LEDs flashing in sequence. - -Debug information is logged by the board while the program is running. To view -it, establish a serial connection to the board using a baud rate of `115200`. -On OSX and Linux, the following command should work: - -``` -screen ${DEVICENAME} 115200 -``` - -You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, -immediately followed by `Esc`. You can then use the arrow keys to explore the -output, which will contain the results of running inference on various `x` -values: - -``` -x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 -``` - -To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately -followed by the `K` key, then hit the `Y` key. - - -## Deploy to STM32F746 - -The following instructions will help you build and deploy the sample to the -[STM32F7 discovery kit](https://os.mbed.com/platforms/ST-Discovery-F746NG/) -using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). - -![Animation on STM32F746](images/animation_on_STM32F746.gif) - -Before we begin, you'll need the following: - -- STM32F7 discovery kit board -- Mini-USB cable -- ARM Mbed CLI ([installation instructions](https://os.mbed.com/docs/mbed-os/v5.12/tools/installation-and-setup.html)) -- Python 2.7 and pip - -Since Mbed requires a special folder structure for projects, we'll first run a -command to generate a subfolder containing the required source files in this -structure: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_hello_world_mbed_project -``` - -This will result in the creation of a new folder: - -``` -tensorflow/lite/micro/tools/make/gen/mbed_cortex-m4/prj/hello_world/mbed -``` - -This folder contains all of the example's dependencies structured in the correct -way for Mbed to be able to build it. 
- -Change into the directory and run the following commands, making sure you are -using Python 2.7.15. - -First, tell Mbed that the current directory is the root of an Mbed project: - -``` -mbed config root . -``` - -Next, tell Mbed to download the dependencies and prepare to build: - -``` -mbed deploy -``` - -By default, Mbed will build the project using C++98. However, TensorFlow Lite -requires C++11. Run the following Python snippet to modify the Mbed -configuration files so that it uses C++11: - -``` -python -c 'import fileinput, glob; -for filename in glob.glob("mbed-os/tools/profiles/*.json"): - for line in fileinput.input(filename, inplace=True): - print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")' - -``` - -Finally, run the following command to compile: - -``` -mbed compile -m DISCO_F746NG -t GCC_ARM -``` - -This should result in a binary at the following path: - -``` -./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin -``` - -To deploy, plug in your STM board and copy the file to it. On MacOS, you can do -this with the following command: - -``` -cp ./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin /Volumes/DIS_F746NG/ -``` - -Copying the file will initiate the flashing process. Once this is complete, you -should see an animation on the device's screen. - - -``` -screen /dev/tty.usbmodem14403 9600 -``` - -In addition to this animation, debug information is logged by the board while -the program is running. To view it, establish a serial connection to the board -using a baud rate of `9600`. On OSX and Linux, the following command should -work, replacing `/dev/tty.devicename` with the name of your device as it appears -in `/dev`: - -``` -screen /dev/tty.devicename 9600 -``` - -You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, -immediately followed by `Esc`. You can then use the arrow keys to explore the -output, which will contain the results of running inference on various `x` -values: - -``` -x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 -``` - -To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately -followed by the `K` key, then hit the `Y` key. - -### Run the tests on a development machine - -To compile and test this example on a desktop Linux or macOS machine, first -clone the TensorFlow repository from GitHub to a convenient place: - -```bash -git clone --depth 1 https://github.com/tensorflow/tensorflow.git -``` - -Next, `cd` into the source directory from a terminal, and then run the following -command: - -```bash -make -f tensorflow/lite/micro/tools/make/Makefile test_hello_world_test -``` - -This will take a few minutes, and downloads frameworks the code uses. Once the -process has finished, you should see a series of files get compiled, followed by -some logging output from a test, which should conclude with -`~~~ALL TESTS PASSED~~~`. - -If you see this, it means that a small program has been built and run that loads -the trained TensorFlow model, runs some example inputs through it, and got the -expected outputs. - -To understand how TensorFlow Lite does this, you can look at the source in -[hello_world_test.cc](hello_world_test.cc). -It's a fairly small amount of code that creates an interpreter, gets a handle to -a model that's been compiled into the program, and then invokes the interpreter -with the model and sample inputs. - -### Train your own model - -So far you have used an existing trained model to run inference on -microcontrollers. 
If you wish to train your own model, follow the instructions -given in the [train/](train/) directory. - diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ deleted file mode 100644 index 4d53e551431..00000000000 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md~ +++ /dev/null @@ -1,568 +0,0 @@ -# Person detection example - -This example shows how you can use Tensorflow Lite to run a 250 kilobyte neural -network to recognize people in images captured by a camera. It is designed to -run on systems with small amounts of memory such as microcontrollers and DSPs. -This uses the experimental int8 quantized version of the person detection model. - -## Table of contents - -- [Person detection example](#person-detection-example) - - [Table of contents](#table-of-contents) - - [Running on ARC EM SDP](#running-on-arc-em-sdp) - - [Initial setup](#initial-setup) - - [Generate Example Project](#generate-example-project) - - [Build and Run Example](#build-and-run-example) - - [Running on Arduino](#running-on-arduino) - - [Hardware](#hardware) - - [Install the Arduino_TensorFlowLite library](#install-the-arduinotensorflowlite-library) - - [Install other libraries](#install-other-libraries) - - [Load and run the example](#load-and-run-the-example) - - [Running on HIMAX WE1 EVB](#running-on-himax-we1-evb) - - [Initial Setup](#initial-setup) - - [MetaWare Development Toolkit](#metaware-development-toolkit) - - [Make Tool version](#make-tool-version) - - [Serial Terminal Emulation Application](#serial-terminal-emulation-application) - - [Generate Example Project](#generate-example-project-1) - - [Build and Burn Example](#build-and-burn-example) - - [Running on SparkFun Edge](#running-on-sparkfun-edge) - - [Compile the binary](#compile-the-binary) - - [Sign the binary](#sign-the-binary) - - [Flash the binary](#flash-the-binary) - - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - - [Debugging image capture](#debugging-image-capture) - - [Training your own model](#training-your-own-model) - -## Running on ARC EM SDP - -The following instructions will help you to build and deploy this example to -[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) -board. General information and instructions on using the board with TensorFlow -Lite Micro can be found in the common -[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). - -This example uses asymmetric int8 quantization and can therefore leverage -optimized int8 kernels from the embARC MLI library - -The ARC EM SDP board contains a rich set of extension interfaces. You can choose -any compatible camera and modify -[image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc) -file accordingly to use input from your specific camera. By default, results of -running this example are printed to the console. If you would like to instead -implement some target-specific actions, you need to modify -[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc) -accordingly. - -The reference implementations of these files are used by default on the EM SDP. 
- -### Initial setup - -Follow the instructions on the -[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) -to get and install all required tools for work with ARC EM SDP. - -### Generate Example Project - -The example project for ARC EM SDP platform can be generated with the following -command: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project -``` - -### Build and Run Example - -For more detailed information on building and running examples see the -appropriate sections of general descriptions of the -[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). -In the directory with generated project you can also find a -*README_ARC_EMSDP.md* file with instructions and options on building and -running. Here we only briefly mention main steps which are typically enough to -get it started. - -1. You need to - [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) - and open an serial connection. - -2. Go to the generated example project director - - ``` - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make - ``` - -3. Build the example using - - ``` - make app - ``` - -4. To generate artefacts for self-boot of example from the board use - - ``` - make flash - ``` - -5. To run application from the board using microSD card: - - * Copy the content of the created /bin folder into the root of microSD - card. Note that the card must be formatted as FAT32 with default cluster - size (but less than 32 Kbytes) - * Plug in the microSD card into the J11 connector. - * Push the RST button. If a red LED is lit beside RST button, push the CFG - button. - -6. If you have the MetaWare Debugger installed in your environment: - - * To run application from the console using it type `make run`. - * To stop the execution type `Ctrl+C` in the console several times. - -In both cases (step 5 and 6) you will see the application output in the serial -terminal. - -## Running on Arduino - -The following instructions will help you build and deploy this sample -to [Arduino](https://www.arduino.cc/) devices. - -The sample has been tested with the following device: - -- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) - -You will also need the following camera module: - -- [Arducam Mini 2MP Plus](https://www.amazon.com/Arducam-Module-Megapixels-Arduino-Mega2560/dp/B012UXNDOY) - -### Hardware - -Connect the Arducam pins as follows: - -|Arducam pin name|Arduino pin name| -|----------------|----------------| -|CS|D7 (unlabelled, immediately to the right of D6)| -|MOSI|D11| -|MISO|D12| -|SCK|D13| -|GND|GND (either pin marked GND is fine)| -|VCC|3.3 V| -|SDA|A4| -|SCL|A5| - -### Install the Arduino_TensorFlowLite library - -Download the current nightly build of the library: -[person_detection.zip](https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip) - -This example application is included as part of the official TensorFlow Lite -Arduino library. To install it, open the Arduino library manager in -`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. 
- -### Install other libraries - -In addition to the TensorFlow library, you'll also need to install two -libraries: - -* The Arducam library, so our code can interface with the hardware -* The JPEGDecoder library, so we can decode JPEG-encoded images - -The Arducam Arduino library is available from GitHub at -[https://github.com/ArduCAM/Arduino](https://github.com/ArduCAM/Arduino). -To install it, download or clone the repository. Next, copy its `ArduCAM` -subdirectory into your `Arduino/libraries` directory. To find this directory on -your machine, check the *Sketchbook location* in the Arduino IDE's -*Preferences* window. - -After downloading the library, you'll need to edit one of its files to make sure -it is configured for the Arducam Mini 2MP Plus. To do so, open the following -file: - -``` -Arduino/libraries/ArduCAM/memorysaver.h -``` - -You'll see a bunch of `#define` statements listed. Make sure that they are all -commented out, except for `#define OV2640_MINI_2MP_PLUS`, as so: - -``` -//Step 1: select the hardware platform, only one at a time -//#define OV2640_MINI_2MP -//#define OV3640_MINI_3MP -//#define OV5642_MINI_5MP -//#define OV5642_MINI_5MP_BIT_ROTATION_FIXED -#define OV2640_MINI_2MP_PLUS -//#define OV5642_MINI_5MP_PLUS -//#define OV5640_MINI_5MP_PLUS -``` - -Once you save the file, we're done configuring the Arducam library. - -Our next step is to install the JPEGDecoder library. We can do this from within -the Arduino IDE. First, go to the *Manage Libraries...* option in the *Tools* -menu and search for `JPEGDecoder`. You should install version _1.8.0_ of the -library. - -Once the library has installed, we'll need to configure it to disable some -optional components that are not compatible with the Arduino Nano 33 BLE Sense. -Open the following file: - -``` -Arduino/libraries/JPEGDecoder/src/User_Config.h -``` - -Make sure that both `#define LOAD_SD_LIBRARY` and `#define LOAD_SDFAT_LIBRARY` -are commented out, as shown in this excerpt from the file: - -```c++ -// Comment out the next #defines if you are not using an SD Card to store the JPEGs -// Commenting out the line is NOT essential but will save some FLASH space if -// SD Card access is not needed. Note: use of SdFat is currently untested! - -//#define LOAD_SD_LIBRARY // Default SD Card library -//#define LOAD_SDFAT_LIBRARY // Use SdFat library instead, so SD Card SPI can be bit bashed -``` - -Once you've saved the file, you are done installing libraries. - -### Load and run the example - -Go to `File -> Examples`. You should see an -example near the bottom of the list named `TensorFlowLite`. Select -it and click `person_detection` to load the example. Connect your device, then -build and upload the example. - -To test the camera, start by pointing the device's camera at something that is -definitely not a person, or just covering it up. The next time the blue LED -flashes, the device will capture a frame from the camera and begin to run -inference. Since the vision model we are using for person detection is -relatively large, it takes a long time to run inference—around 19 seconds at the -time of writing, though it's possible TensorFlow Lite has gotten faster since -then. - -After 19 seconds or so, the inference result will be translated into another LED -being lit. Since you pointed the camera at something that isn't a person, the -red LED should light up. - -Now, try pointing the device's camera at yourself! The next time the blue LED -flashes, the device will capture another image and begin to run inference. 
After -19 seconds, the green LED should light up! - -Remember, image data is captured as a snapshot before each inference, whenever -the blue LED flashes. Whatever the camera is pointed at during that moment is -what will be fed into the model. It doesn't matter where the camera is pointed -until the next time an image is captured, when the blue LED will flash again. - -If you're getting seemingly incorrect results, make sure you are in an -environment with good lighting. You should also make sure that the camera is -oriented correctly, with the pins pointing downwards, so that the images it -captures are the right way up—the model was not trained to recognize upside-down -people! In addition, it's good to remember that this is a tiny model, which -trades accuracy for small size. It works very well, but it isn't accurate 100% -of the time. - -We can also see the results of inference via the Arduino Serial Monitor. To do -this, open the *Serial Monitor* from the *Tools* menu. You'll see a detailed -log of what is happening while our application runs. It's also interesting to -check the *Show timestamp* box, so you can see how long each part of the process -takes: - -``` -14:17:50.714 -> Starting capture -14:17:50.714 -> Image captured -14:17:50.784 -> Reading 3080 bytes from ArduCAM -14:17:50.887 -> Finished reading -14:17:50.887 -> Decoding JPEG and converting to greyscale -14:17:51.074 -> Image decoded and processed -14:18:09.710 -> Person score: 246 No person score: 66 -``` - -From the log, we can see that it took around 170 ms to capture and read the -image data from the camera module, 180 ms to decode the JPEG and convert it to -greyscale, and 18.6 seconds to run inference. - -## Running on HIMAX WE1 EVB - -The following instructions will help you build and deploy this example to -[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) -board. To undstand more about using this board, please check -[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). - -### Initial Setup - -To use the HIMAX WE1 EVB, please make sure following software are installed: - -#### MetaWare Development Toolkit - -See -[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) -section for instructions on toolchain installation. - -#### Make Tool version - -A `'make'` tool is required for deploying Tensorflow Lite Micro -applications on HIMAX WE1 EVB, See -[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) -section for proper environment. - -#### Serial Terminal Emulation Application - -There are 2 main purposes for HIMAX WE1 EVB Debug UART port - -- print application output -- burn application to flash by using xmodem send application binary - -You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). 
- - -### Generate Example Project - -The example project for HIMAX WE1 EVB platform can be generated with the following -command: - -Download related third party data - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads -``` - -Generate person detection project - -``` -make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_int8_make_project TARGET=himax_we1_evb -``` - -### Build and Burn Example - -Following the Steps to run person detection example at HIMAX WE1 EVB platform. - -1. Go to the generated example project directory. - - ``` - cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/person_detection_int8/make - ``` - -2. Build the example using - - ``` - make app - ``` - -3. After example build finish, copy ELF file and map file to image generate tool directory. - image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` - - ``` - cp person_detection_int8.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -4. Go to flash image generate tool directory. - - ``` - cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -5. run image generate tool, generate flash image file. - - * Before running image generate tool, by typing `sudo chmod +x image_gen` - and `sudo chmod +x sign_tool` to make sure it is executable. - - ``` - image_gen -e person_detection_int8.elf -m himax_we1_evb.map -o out.img - ``` - - -6. Download flash image file to HIMAX WE1 EVB by UART: - - * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) - -After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial -terminal. - -## Running on SparkFun Edge - -The following instructions will help you build and deploy this sample on the -[SparkFun Edge development board](https://sparkfun.com/products/15170). This -sample requires the Sparkfun Himax camera for the Sparkfun Edge board. It is -not available for purchase yet. - -If you're new to using this board, we recommend walking through the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab to get an understanding of the workflow. - -### Compile the binary - -The following command will download the required dependencies and then compile a -binary for the SparkFun Edge: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge person_detection_bin -``` - -The binary will be created in the following location: - -``` -tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/person_detection.bin -``` - -### Sign the binary - -The binary must be signed with cryptographic keys to be deployed to the device. -We'll now run some commands that will sign our binary so it can be flashed to -the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is -downloaded when the `Makefile` is run. 
- -Enter the following command to set up some dummy cryptographic keys we can use -for development: - -``` -cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/keys_info0.py \ -tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/keys_info.py -``` - -Next, run the following command to create a signed binary: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/create_cust_image_blob.py \ ---bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/person_detection.bin \ ---load-address 0xC000 \ ---magic-num 0xCB \ --o main_nonsecure_ota \ ---version 0x0 -``` - -This will create the file `main_nonsecure_ota.bin`. We'll now run another -command to create a final version of the file that can be used to flash our -device with the bootloader script we will use in the next step: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \ ---load-address 0x20000 \ ---bin main_nonsecure_ota.bin \ --i 6 \ --o main_nonsecure_wire \ ---options 0x1 -``` - -You should now have a file called `main_nonsecure_wire.bin` in the directory -where you ran the commands. This is the file we'll be flashing to the device. - -### Flash the binary - -Next, attach the board to your computer via a USB-to-serial adapter. - -**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096), -you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them) -before you continue. - -Once connected, assign the USB device name to an environment variable: - -``` -export DEVICENAME=put your device name here -``` - -Set another variable with the baud rate: - -``` -export BAUD_RATE=921600 -``` - -Now, hold the button marked `14` on the device. While still holding the button, -hit the button marked `RST`. Continue holding the button marked `14` while -running the following command: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.0.0/tools/apollo3_scripts/uart_wired_update.py \ --b ${BAUD_RATE} ${DEVICENAME} \ --r 1 \ --f main_nonsecure_wire.bin \ --i 6 -``` - -You should see a long stream of output as the binary is flashed to the device. -Once you see the following lines, flashing is complete: - -``` -Sending Reset Command. -Done. -``` - -If you don't see these lines, flashing may have failed. Try running through the -steps in [Flash the binary](#flash-the-binary) again (you can skip over setting -the environment variables). If you continue to run into problems, follow the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab, which includes more comprehensive instructions for the flashing -process. - -The binary should now be deployed to the device. Hit the button marked `RST` to -reboot the board. You should see the device's four LEDs flashing in sequence. - -Debug information is logged by the board while the program is running. To view -it, establish a serial connection to the board using a baud rate of `115200`. -On OSX and Linux, the following command should work: - -``` -screen ${DEVICENAME} 115200 -``` - -To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately -followed by the `K` key, then hit the `Y` key. 
- -## Run the tests on a development machine - -To compile and test this example on a desktop Linux or MacOS machine, download -[the TensorFlow source code](https://github.com/tensorflow/tensorflow), `cd` -into the source directory from a terminal, and then run the following command: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile -``` - -This will take a few minutes, and downloads frameworks the code uses like -[CMSIS](https://developer.arm.com/embedded/cmsis) and -[flatbuffers](https://google.github.io/flatbuffers/). Once that process has -finished, run: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile test_person_detection_test -``` - -You should see a series of files get compiled, followed by some logging output -from a test, which should conclude with `~~~ALL TESTS PASSED~~~`. If you see -this, it means that a small program has been built and run that loads a trained -TensorFlow model, runs some example images through it, and got the expected -outputs. This particular test runs images with a and without a person in them, -and checks that the network correctly identifies them. - -To understand how TensorFlow Lite does this, you can look at the `TestInvoke()` -function in -[person_detection_test.cc](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc). -It's a fairly small amount of code, creating an interpreter, getting a handle to -a model that's been compiled into the program, and then invoking the interpreter -with the model and sample inputs. - -## Debugging image capture -When the sample is running, check the LEDs to determine whether the inference is -running correctly. If the red light is stuck on, it means there was an error -communicating with the camera. This is likely due to an incorrectly connected -or broken camera. - -During inference, the blue LED will toggle every time inference is complete. The -orange LED indicates that no person was found, and the green LED indicates a -person was found. The red LED should never turn on, since it indicates an error. - -In order to view the captured image, set the DUMP_IMAGE define in main.cc.  This -causes the board to log raw image info to the console. After the board has been -flashed and reset, dump the log to a text file: - - -``` -screen -L -Logfile ${DEVICENAME} 115200 -``` - -Next, run the raw to bitmap converter to view captured images: - -``` -python3 raw_to_bitmap.py -r GRAY -i -``` - -## Training your own model - -You can train your own model with some easy-to-use scripts. See -[training_a_model.md](training_a_model.md) for instructions. From 39e65d52e400f8c343195e2f8ac34f286648a415 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Thu, 4 Jun 2020 18:19:58 +0800 Subject: [PATCH 0082/1390] remove temp makefile in target --- .../make/targets/himax_we1_evb_makefile.inc~ | 93 ------------------- 1 file changed, 93 deletions(-) delete mode 100644 tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ diff --git a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ deleted file mode 100644 index 733f258fbbb..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc~ +++ /dev/null @@ -1,93 +0,0 @@ -# Settings for himax WE_1 evb. 
-ifeq ($(TARGET), himax_we1_evb) - - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - LD_TOOL := ccac - TARGET_ARCH := arc - #ARC_TOOLCHAIN := mwdt - - BUILD_ARC_MLI := false - ARC_MLI_PRE_COMPILED_TARGET := himax_arcem9d_r16 - -include $(MAKEFILE_DIR)/targets/arc/arc_common.inc - #download SDK & MLI - HIMAX_WE1_SDK_NAME := himax_we1_sdk - #MLI_LIB_DIR = arc_mli_package - #MLI_LIB_DIR = arc_mli_package - #$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) - $(eval $(call add_third_party_download,$(HIMAX_WE1_SDK_URL),$(HIMAX_WE1_SDK_MD5),$(HIMAX_WE1_SDK_NAME),)) - - #export path of toolchain - #export PATH := $(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/image_gen_linux_v3/:$(PATH) - - TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/arcem9d_wei_r16.tcf - LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/memory.lcf - ARCLIB_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/libembarc.a - LIB_HEADER_FILE := $(PWD)/$(MAKEFILE_DIR)/downloads/$(HIMAX_WE1_SDK_NAME)/hx_drv_tflm.h - - - DEFAULT_HEAPSZ := 8192 - DEFAULT_STACKSZ := 8192 - - TCF_FILE_NAME = $(notdir $(TCF_FILE)) - ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE_NAME)) - MAKE_PROJECT_FILES += $(TCF_FILE_NAME) - - LCF_FILE_NAME = $(notdir $(LCF_FILE)) - ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) - MAKE_PROJECT_FILES += $(LCF_FILE_NAME) - - ARCLIB_FILE_NAME = $(notdir $(ARCLIB_FILE)) - ARC_TARGET_FILES_DIRS += $(dir $(ARCLIB_FILE)) - MAKE_PROJECT_FILES += $(ARCLIB_FILE_NAME) - - LIB_HEADER_FILE_NAME = $(notdir $(LIB_HEADER_FILE)) - ARC_TARGET_FILES_DIRS += $(dir $(LIB_HEADER_FILE)) - MAKE_PROJECT_FILES += $(LIB_HEADER_FILE_NAME) - - - - # Need a pointer to the TCF and lcf file - - PLATFORM_FLAGS = \ - -DNDEBUG \ - -g \ - -DCPU_ARC \ - -Hnosdata \ - -DTF_LITE_STATIC_MEMORY \ - -tcf=$(TCF_FILE_NAME) \ - -Hnocopyr \ - -Hpurge \ - -Hcl \ - -fslp-vectorize-aggressive \ - -ffunction-sections \ - -fdata-sections \ - -tcf_core_config \ - - CXXFLAGS += -fno-rtti -DSCRATCH_MEM_Z_SIZE=0x10000 $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - - INCLUDES+= \ - -I $(MAKEFILE_DIR)/downloads/$(WEI_SDK_NAME) \ - -I $(MAKEFILE_DIR)/downloads/kissfft - - GENERATED_PROJECT_INCLUDES += \ - -I. 
\ - -I./third_party/kissfft - - LDFLAGS += \ - -Hheap=8192 \ - -tcf=$(TCF_FILE_NAME) \ - -Hnocopyr \ - -m \ - -Hldopt=-Coutput=$(TARGET).map \ - $(LCF_FILE_NAME) \ - -Hldopt=-Bgrouplib $(ARCLIB_FILE_NAME) - - CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) - CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) - MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) - -endif From bdf6adfc0e78b5d1e21df527200b9cfaad5830c2 Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Thu, 4 Jun 2020 14:52:03 +0000 Subject: [PATCH 0083/1390] Review comments addressed --- tensorflow/compiler/xla/service/cpu/BUILD | 10 +++++++ .../service/cpu/test_target_triple_helper.h | 27 +++++++++++++++++++ .../compiler/xla/service/cpu/tests/BUILD | 4 +++ .../service/cpu/tests/cpu_dyn_shape_test.cc | 3 ++- .../cpu/tests/cpu_eigen_dot_operation_test.cc | 3 ++- .../cpu/tests/cpu_key_value_sort_test.cc | 3 ++- .../cpu/tests/cpu_literal_caching_test.cc | 5 ++-- .../xla/service/cpu/tests/cpu_outfeed_test.cc | 5 ++-- ...ed_reduce_with_no_vector_registers_test.cc | 3 ++- .../xla/tests/local_client_aot_test_helper.cc | 2 ++ 10 files changed, 57 insertions(+), 8 deletions(-) create mode 100644 tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 3460e65b0a2..7be4d3e724a 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -30,6 +30,15 @@ filegroup( ]), ) +cc_library( + name = "test_header_helper", + testonly = True, + hdrs = ["test_target_triple_helper.h"], + deps = [ + "//tensorflow/core:test", + ], +) + filegroup( name = "single_threaded_runtime_srcs", srcs = [ @@ -1071,6 +1080,7 @@ tf_cc_test( deps = [ ":cpu_compiler", ":cpu_transfer_manager", + ":test_header_helper", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h new file mode 100644 index 00000000000..e248f6de8bd --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h @@ -0,0 +1,27 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_ +#define TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_ + +#if (defined(__powerpc__) || defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +static const std::string kTargetCpuForHost="ppc"; +static const std::string kTargetTripleForHost="ppc64le-ibm-linux-gnu"; +#else +static const std::string kTargetCpuForHost=""; +static const std::string kTargetTripleForHost="x86_64-pc-linux"; +#endif + +#endif diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 1ac8509cdb1..18624330a26 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -42,6 +42,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -216,6 +217,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -229,6 +231,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -242,6 +245,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc index 46249caa0c7..7f9fab5dab0 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" namespace xla { namespace cpu { @@ -46,7 +47,7 @@ TEST_F(CpuDynamicShapeTest, DynamicShapeR2) { )"; CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc index 8b7f843582b..40314ae5158 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -45,7 +46,7 @@ class CpuEigenDotOperationTest void CompileAndCheck(std::unique_ptr entry_computation, const string& filecheck_lines) { CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc index f3b7b91b2b5..8bb8acb557d 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" namespace xla { namespace cpu { @@ -48,7 +49,7 @@ CHECK: call void @__xla_cpu_runtime_KeyValueSort TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc index fc670201125..b86e23fc7b6 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" namespace xla { namespace cpu { @@ -64,7 +65,7 @@ CHECK-NOT: private unnamed_addr constant [48 x i8] ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; @@ -112,7 +113,7 @@ CHECK-NOT: private unnamed_addr constant [8 x i8] ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc index ad83c485998..ea5f282fb2b 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" namespace xla { namespace cpu { @@ -46,7 +47,7 @@ CHECK: private unnamed_addr constant [48 x i8] TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; @@ -73,7 +74,7 @@ CHECK: Outfeed TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc index 754885d8744..e59a531b114 100644 --- a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc +++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" namespace xla { namespace { @@ -75,7 +76,7 @@ ENTRY main { // Check that the GetTargetVectorRegisterByteSize is itself working. 
TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64, - GetTargetVectorRegisterByteSize("x86_64-pc-linux")); + GetTargetVectorRegisterByteSize(kTargetTripleForHost)); ASSERT_EQ(vector_register_byte_size_for_x86_64, 16); std::string triple = "i686-none-android"; diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 53c0d84854e..3e9a3ec2314 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -71,6 +71,8 @@ int main(int argc, char** argv) { triple_string = "aarch64-none-linux-gnu"; } else if (target_cpu == "x64_windows") { triple_string = "x86_64-pc-windows-msvc19"; + } else if (target_cpu == "ppc") { + triple_string = "ppc64le-ibm-linux-gnu"; } else if (target_cpu == "local") { triple_string = llvm::sys::getDefaultTargetTriple(); } else { From 17b7e169135127e0e866b50577ad8b213abc1d97 Mon Sep 17 00:00:00 2001 From: Dominic Jack Date: Fri, 5 Jun 2020 08:54:38 +1000 Subject: [PATCH 0084/1390] ensure model initialized on ANY trackable attr set --- tensorflow/python/keras/engine/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 29ff31d56db..2b4ba8af3f5 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -319,7 +319,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): super(Model, self).__setattr__(name, value) return - if all( + if any( isinstance(v, (base_layer.Layer, data_structures.TrackableDataStructure)) or trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): From ac123654efc63ffa17240479a5b926ca6357c766 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Fri, 5 Jun 2020 10:03:47 +0800 Subject: [PATCH 0085/1390] TFLM: update hello world example readme --- .../lite/micro/examples/hello_world/README.md | 4 +- .../micro/examples/hello_world/README.md~ | 575 ++++++++++++++++++ 2 files changed, 577 insertions(+), 2 deletions(-) create mode 100644 tensorflow/lite/micro/examples/hello_world/README.md~ diff --git a/tensorflow/lite/micro/examples/hello_world/README.md b/tensorflow/lite/micro/examples/hello_world/README.md index d3762ada790..26b0f12c83a 100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md +++ b/tensorflow/lite/micro/examples/hello_world/README.md @@ -17,7 +17,7 @@ of the device. 
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - [Deploy to Arduino](#deploy-to-arduino) - [Deploy to ESP32](#deploy-to-esp32) -- [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) +- [Deploy to Himax WE1 EVB](#deploy-to-himax-we1-evb) - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) - [Deploy to STM32F746](#deploy-to-STM32F746) - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) @@ -192,7 +192,7 @@ The previous two commands can be combined: idf.py --port /dev/ttyUSB0 flash monitor ``` -## Deploy to himax WE1 EVB +## Deploy to Himax WE1 EVB The following instructions will help you build and deploy this example to [HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) diff --git a/tensorflow/lite/micro/examples/hello_world/README.md~ b/tensorflow/lite/micro/examples/hello_world/README.md~ new file mode 100644 index 00000000000..d3762ada790 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/README.md~ @@ -0,0 +1,575 @@ +# Hello World Example + +This example is designed to demonstrate the absolute basics of using [TensorFlow +Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers). +It includes the full end-to-end workflow of training a model, converting it for +use with TensorFlow Lite for Microcontrollers for running inference on a +microcontroller. + +The model is trained to replicate a `sine` function and generates a pattern of +data to either blink LEDs or control an animation, depending on the capabilities +of the device. + +![Animation on STM32F746](images/animation_on_STM32F746.gif) + +## Table of contents + +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) +- [Deploy to Arduino](#deploy-to-arduino) +- [Deploy to ESP32](#deploy-to-esp32) +- [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) +- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) +- [Deploy to STM32F746](#deploy-to-STM32F746) +- [Run the tests on a development machine](#run-the-tests-on-a-development-machine) +- [Train your own model](#train-your-own-model) + +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) +board. General information and instructions on using the board with TensorFlow +Lite Micro can be found in the common +[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +### Initial Setup + +Follow the instructions on the +[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) +to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following +command: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project +``` + +### Build and Run Example + +For more detailed information on building and running examples see the +appropriate sections of general descriptions of the +[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). +In the directory with generated project you can also find a +*README_ARC_EMSDP.md* file with instructions and options on building and +running. Here we only briefly mention main steps which are typically enough to +get it started. + +1. 
You need to + [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) + and open an serial connection. + +2. Go to the generated example project director + + ``` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make + ``` + +3. Build the example using + + ``` + make app + ``` + +4. To generate artefacts for self-boot of example from the board use + + ``` + make flash + ``` + +5. To run application from the board using microSD card: + + * Copy the content of the created /bin folder into the root of microSD + card. Note that the card must be formatted as FAT32 with default cluster + size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG + button. + +6. If you have the MetaWare Debugger installed in your environment: + + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial +terminal. + +## Deploy to Arduino + +The following instructions will help you build and deploy this sample +to [Arduino](https://www.arduino.cc/) devices. + +![Animation on Arduino MKRZERO](images/animation_on_arduino_mkrzero.gif) + +The sample has been tested with the following devices: + +- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) +- [Arduino MKRZERO](https://store.arduino.cc/usa/arduino-mkrzero) + +The sample will use PWM to fade an LED on and off according to the model's +output. In the code, the `LED_BUILTIN` constant is used to specify the board's +built-in LED as the one being controlled. However, on some boards, this built-in +LED is not attached to a pin with PWM capabilities. In this case, the LED will +blink instead of fading. + +### Install the Arduino_TensorFlowLite library + +This example application is included as part of the official TensorFlow Lite +Arduino library. To install it, open the Arduino library manager in +`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. + +### Load and run the example + +Once the library has been added, go to `File -> Examples`. You should see an +example near the bottom of the list named `TensorFlowLite:hello_world`. Select +it and click `hello_world` to load the example. + +Use the Arduino IDE to build and upload the example. Once it is running, +you should see the built-in LED on your device flashing. + +The Arduino Desktop IDE includes a plotter that we can use to display the sine +wave graphically. To view it, go to `Tools -> Serial Plotter`. You will see one +datapoint being logged for each inference cycle, expressed as a number between 0 +and 255. + +## Deploy to ESP32 + +The following instructions will help you build and deploy this sample +to [ESP32](https://www.espressif.com/en/products/hardware/esp32/overview) +devices using the [ESP IDF](https://github.com/espressif/esp-idf). 
+ +The sample has been tested on ESP-IDF version 4.0 with the following devices: +- [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html) +- [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md) + +### Install the ESP IDF + +Follow the instructions of the +[ESP-IDF get started guide](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html) +to setup the toolchain and the ESP-IDF itself. + +The next steps assume that the +[IDF environment variables are set](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html#step-4-set-up-the-environment-variables) : + + * The `IDF_PATH` environment variable is set + * `idf.py` and Xtensa-esp32 tools (e.g. `xtensa-esp32-elf-gcc`) are in `$PATH` + +### Generate the examples +The example project can be generated with the following command: +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=esp generate_hello_world_esp_project +``` + +### Building the example + +Go the the example project directory +``` +cd tensorflow/lite/micro/tools/make/gen/esp_xtensa-esp32/prj/hello_world/esp-idf +``` + +Then build with `idf.py` +``` +idf.py build +``` + +### Load and run the example + +To flash (replace `/dev/ttyUSB0` with the device serial port): +``` +idf.py --port /dev/ttyUSB0 flash +``` + +Monitor the serial output: +``` +idf.py --port /dev/ttyUSB0 monitor +``` + +Use `Ctrl+]` to exit. + +The previous two commands can be combined: +``` +idf.py --port /dev/ttyUSB0 flash monitor +``` + +## Deploy to himax WE1 EVB + +The following instructions will help you build and deploy this example to +[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) +board. To undstand more about using this board, please check +[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). + +### Initial Setup + +To use the HIMAX WE1 EVB, please make sure following software are installed: + +#### MetaWare Development Toolkit + +See +[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) +section for instructions on toolchain installation. + +#### Make Tool version + +A `'make'` tool is required for deploying Tensorflow Lite Micro +applications on HIMAX WE1 EVB, See +[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) +section for proper environment. + +#### Serial Terminal Emulation Application + +There are 2 main purposes for HIMAX WE1 EVB Debug UART port + +- print application output +- burn application to flash by using xmodem send application binary + +You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). + + +### Generate Example Project + +The example project for HIMAX WE1 EVB platform can be generated with the following +command: + +Download related third party data + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads +``` + +Generate hello world project + +``` +make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb TAGS=no_arc_mli +``` + +### Build and Burn Example + +Following the Steps to run hello world example at HIMAX WE1 EVB platform. + +1. Go to the generated example project directory. 
+ + ``` + cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/hello_world/make + ``` + +2. Build the example using + + ``` + make app + ``` + +3. After example build finish, copy ELF file and map file to image generate tool directory. + image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` + + ``` + cp hello_world.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +4. Go to flash image generate tool directory. + + ``` + cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ + ``` + +5. run image generate tool, generate flash image file. + + * Before running image generate tool, by typing `sudo chmod +x image_gen` + and `sudo chmod +x sign_tool` to make sure it is executable. + + ``` + image_gen -e hello_world.elf -m himax_we1_evb.map -o out.img + ``` + + +6. Download flash image file to HIMAX WE1 EVB by UART: + + * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) + +After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial +terminal. + +## Deploy to SparkFun Edge + +The following instructions will help you build and deploy this sample on the +[SparkFun Edge development board](https://sparkfun.com/products/15170). + +![Animation on SparkFun Edge](images/animation_on_sparkfun_edge.gif) + +If you're new to using this board, we recommend walking through the +[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) +codelab to get an understanding of the workflow. + +### Compile the binary + +The following command will download the required dependencies and then compile a +binary for the SparkFun Edge: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge hello_world_bin +``` + +The binary will be created in the following location: + +``` +tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin +``` + +### Sign the binary + +The binary must be signed with cryptographic keys to be deployed to the device. +We'll now run some commands that will sign our binary so it can be flashed to +the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is +downloaded when the `Makefile` is run. + +Enter the following command to set up some dummy cryptographic keys we can use +for development: + +``` +cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info0.py \ +tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info.py +``` + +Next, run the following command to create a signed binary: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_image_blob.py \ +--bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin \ +--load-address 0xC000 \ +--magic-num 0xCB \ +-o main_nonsecure_ota \ +--version 0x0 +``` + +This will create the file `main_nonsecure_ota.bin`. 
We'll now run another +command to create a final version of the file that can be used to flash our +device with the bootloader script we will use in the next step: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \ +--load-address 0x20000 \ +--bin main_nonsecure_ota.bin \ +-i 6 \ +-o main_nonsecure_wire \ +--options 0x1 +``` + +You should now have a file called `main_nonsecure_wire.bin` in the directory +where you ran the commands. This is the file we'll be flashing to the device. + +### Flash the binary + +Next, attach the board to your computer via a USB-to-serial adapter. + +**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096), +you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them) +before you continue. + +Once connected, assign the USB device name to an environment variable: + +``` +export DEVICENAME=put your device name here +``` + +Set another variable with the baud rate: + +``` +export BAUD_RATE=921600 +``` + +Now, hold the button marked `14` on the device. While still holding the button, +hit the button marked `RST`. Continue holding the button marked `14` while +running the following command: + +``` +python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/uart_wired_update.py \ +-b ${BAUD_RATE} ${DEVICENAME} \ +-r 1 \ +-f main_nonsecure_wire.bin \ +-i 6 +``` + +You should see a long stream of output as the binary is flashed to the device. +Once you see the following lines, flashing is complete: + +``` +Sending Reset Command. +Done. +``` + +If you don't see these lines, flashing may have failed. Try running through the +steps in [Flash the binary](#flash-the-binary) again (you can skip over setting +the environment variables). If you continue to run into problems, follow the +[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) +codelab, which includes more comprehensive instructions for the flashing +process. + +The binary should now be deployed to the device. Hit the button marked `RST` to +reboot the board. You should see the device's four LEDs flashing in sequence. + +Debug information is logged by the board while the program is running. To view +it, establish a serial connection to the board using a baud rate of `115200`. +On OSX and Linux, the following command should work: + +``` +screen ${DEVICENAME} 115200 +``` + +You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, +immediately followed by `Esc`. You can then use the arrow keys to explore the +output, which will contain the results of running inference on various `x` +values: + +``` +x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 +``` + +To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately +followed by the `K` key, then hit the `Y` key. + + +## Deploy to STM32F746 + +The following instructions will help you build and deploy the sample to the +[STM32F7 discovery kit](https://os.mbed.com/platforms/ST-Discovery-F746NG/) +using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). 
+ +![Animation on STM32F746](images/animation_on_STM32F746.gif) + +Before we begin, you'll need the following: + +- STM32F7 discovery kit board +- Mini-USB cable +- ARM Mbed CLI ([installation instructions](https://os.mbed.com/docs/mbed-os/v5.12/tools/installation-and-setup.html)) +- Python 2.7 and pip + +Since Mbed requires a special folder structure for projects, we'll first run a +command to generate a subfolder containing the required source files in this +structure: + +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_hello_world_mbed_project +``` + +This will result in the creation of a new folder: + +``` +tensorflow/lite/micro/tools/make/gen/mbed_cortex-m4/prj/hello_world/mbed +``` + +This folder contains all of the example's dependencies structured in the correct +way for Mbed to be able to build it. + +Change into the directory and run the following commands, making sure you are +using Python 2.7.15. + +First, tell Mbed that the current directory is the root of an Mbed project: + +``` +mbed config root . +``` + +Next, tell Mbed to download the dependencies and prepare to build: + +``` +mbed deploy +``` + +By default, Mbed will build the project using C++98. However, TensorFlow Lite +requires C++11. Run the following Python snippet to modify the Mbed +configuration files so that it uses C++11: + +``` +python -c 'import fileinput, glob; +for filename in glob.glob("mbed-os/tools/profiles/*.json"): + for line in fileinput.input(filename, inplace=True): + print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")' + +``` + +Finally, run the following command to compile: + +``` +mbed compile -m DISCO_F746NG -t GCC_ARM +``` + +This should result in a binary at the following path: + +``` +./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin +``` + +To deploy, plug in your STM board and copy the file to it. On MacOS, you can do +this with the following command: + +``` +cp ./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin /Volumes/DIS_F746NG/ +``` + +Copying the file will initiate the flashing process. Once this is complete, you +should see an animation on the device's screen. + + +``` +screen /dev/tty.usbmodem14403 9600 +``` + +In addition to this animation, debug information is logged by the board while +the program is running. To view it, establish a serial connection to the board +using a baud rate of `9600`. On OSX and Linux, the following command should +work, replacing `/dev/tty.devicename` with the name of your device as it appears +in `/dev`: + +``` +screen /dev/tty.devicename 9600 +``` + +You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, +immediately followed by `Esc`. You can then use the arrow keys to explore the +output, which will contain the results of running inference on various `x` +values: + +``` +x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 +``` + +To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately +followed by the `K` key, then hit the `Y` key. + +### Run the tests on a development machine + +To compile and test this example on a desktop Linux or macOS machine, first +clone the TensorFlow repository from GitHub to a convenient place: + +```bash +git clone --depth 1 https://github.com/tensorflow/tensorflow.git +``` + +Next, `cd` into the source directory from a terminal, and then run the following +command: + +```bash +make -f tensorflow/lite/micro/tools/make/Makefile test_hello_world_test +``` + +This will take a few minutes, and downloads frameworks the code uses. 
Once the +process has finished, you should see a series of files get compiled, followed by +some logging output from a test, which should conclude with +`~~~ALL TESTS PASSED~~~`. + +If you see this, it means that a small program has been built and run that loads +the trained TensorFlow model, runs some example inputs through it, and got the +expected outputs. + +To understand how TensorFlow Lite does this, you can look at the source in +[hello_world_test.cc](hello_world_test.cc). +It's a fairly small amount of code that creates an interpreter, gets a handle to +a model that's been compiled into the program, and then invokes the interpreter +with the model and sample inputs. + +### Train your own model + +So far you have used an existing trained model to run inference on +microcontrollers. If you wish to train your own model, follow the instructions +given in the [train/](train/) directory. + From 1c26e6abd76fe700ecf87d892ceed1dc5bfa90d3 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Fri, 5 Jun 2020 10:12:46 +0800 Subject: [PATCH 0086/1390] TFLM: delete temp readme file in hello world example --- .../micro/examples/hello_world/README.md~ | 575 ------------------ 1 file changed, 575 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/hello_world/README.md~ diff --git a/tensorflow/lite/micro/examples/hello_world/README.md~ b/tensorflow/lite/micro/examples/hello_world/README.md~ deleted file mode 100644 index d3762ada790..00000000000 --- a/tensorflow/lite/micro/examples/hello_world/README.md~ +++ /dev/null @@ -1,575 +0,0 @@ -# Hello World Example - -This example is designed to demonstrate the absolute basics of using [TensorFlow -Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers). -It includes the full end-to-end workflow of training a model, converting it for -use with TensorFlow Lite for Microcontrollers for running inference on a -microcontroller. - -The model is trained to replicate a `sine` function and generates a pattern of -data to either blink LEDs or control an animation, depending on the capabilities -of the device. - -![Animation on STM32F746](images/animation_on_STM32F746.gif) - -## Table of contents - -- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) -- [Deploy to Arduino](#deploy-to-arduino) -- [Deploy to ESP32](#deploy-to-esp32) -- [Deploy to himax WE1 EVB](#deploy-to-himax-we1-evb) -- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) -- [Deploy to STM32F746](#deploy-to-STM32F746) -- [Run the tests on a development machine](#run-the-tests-on-a-development-machine) -- [Train your own model](#train-your-own-model) - -## Deploy to ARC EM SDP - -The following instructions will help you to build and deploy this example to -[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) -board. General information and instructions on using the board with TensorFlow -Lite Micro can be found in the common -[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). - -### Initial Setup - -Follow the instructions on the -[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) -to get and install all required tools for work with ARC EM SDP. 
- -### Generate Example Project - -The example project for ARC EM SDP platform can be generated with the following -command: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project -``` - -### Build and Run Example - -For more detailed information on building and running examples see the -appropriate sections of general descriptions of the -[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). -In the directory with generated project you can also find a -*README_ARC_EMSDP.md* file with instructions and options on building and -running. Here we only briefly mention main steps which are typically enough to -get it started. - -1. You need to - [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) - and open an serial connection. - -2. Go to the generated example project director - - ``` - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make - ``` - -3. Build the example using - - ``` - make app - ``` - -4. To generate artefacts for self-boot of example from the board use - - ``` - make flash - ``` - -5. To run application from the board using microSD card: - - * Copy the content of the created /bin folder into the root of microSD - card. Note that the card must be formatted as FAT32 with default cluster - size (but less than 32 Kbytes) - * Plug in the microSD card into the J11 connector. - * Push the RST button. If a red LED is lit beside RST button, push the CFG - button. - -6. If you have the MetaWare Debugger installed in your environment: - - * To run application from the console using it type `make run`. - * To stop the execution type `Ctrl+C` in the console several times. - -In both cases (step 5 and 6) you will see the application output in the serial -terminal. - -## Deploy to Arduino - -The following instructions will help you build and deploy this sample -to [Arduino](https://www.arduino.cc/) devices. - -![Animation on Arduino MKRZERO](images/animation_on_arduino_mkrzero.gif) - -The sample has been tested with the following devices: - -- [Arduino Nano 33 BLE Sense](https://store.arduino.cc/usa/nano-33-ble-sense-with-headers) -- [Arduino MKRZERO](https://store.arduino.cc/usa/arduino-mkrzero) - -The sample will use PWM to fade an LED on and off according to the model's -output. In the code, the `LED_BUILTIN` constant is used to specify the board's -built-in LED as the one being controlled. However, on some boards, this built-in -LED is not attached to a pin with PWM capabilities. In this case, the LED will -blink instead of fading. - -### Install the Arduino_TensorFlowLite library - -This example application is included as part of the official TensorFlow Lite -Arduino library. To install it, open the Arduino library manager in -`Tools -> Manage Libraries...` and search for `Arduino_TensorFlowLite`. - -### Load and run the example - -Once the library has been added, go to `File -> Examples`. You should see an -example near the bottom of the list named `TensorFlowLite:hello_world`. Select -it and click `hello_world` to load the example. - -Use the Arduino IDE to build and upload the example. Once it is running, -you should see the built-in LED on your device flashing. - -The Arduino Desktop IDE includes a plotter that we can use to display the sine -wave graphically. To view it, go to `Tools -> Serial Plotter`. 
You will see one -datapoint being logged for each inference cycle, expressed as a number between 0 -and 255. - -## Deploy to ESP32 - -The following instructions will help you build and deploy this sample -to [ESP32](https://www.espressif.com/en/products/hardware/esp32/overview) -devices using the [ESP IDF](https://github.com/espressif/esp-idf). - -The sample has been tested on ESP-IDF version 4.0 with the following devices: -- [ESP32-DevKitC](http://esp-idf.readthedocs.io/en/latest/get-started/get-started-devkitc.html) -- [ESP-EYE](https://github.com/espressif/esp-who/blob/master/docs/en/get-started/ESP-EYE_Getting_Started_Guide.md) - -### Install the ESP IDF - -Follow the instructions of the -[ESP-IDF get started guide](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html) -to setup the toolchain and the ESP-IDF itself. - -The next steps assume that the -[IDF environment variables are set](https://docs.espressif.com/projects/esp-idf/en/latest/get-started/index.html#step-4-set-up-the-environment-variables) : - - * The `IDF_PATH` environment variable is set - * `idf.py` and Xtensa-esp32 tools (e.g. `xtensa-esp32-elf-gcc`) are in `$PATH` - -### Generate the examples -The example project can be generated with the following command: -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=esp generate_hello_world_esp_project -``` - -### Building the example - -Go the the example project directory -``` -cd tensorflow/lite/micro/tools/make/gen/esp_xtensa-esp32/prj/hello_world/esp-idf -``` - -Then build with `idf.py` -``` -idf.py build -``` - -### Load and run the example - -To flash (replace `/dev/ttyUSB0` with the device serial port): -``` -idf.py --port /dev/ttyUSB0 flash -``` - -Monitor the serial output: -``` -idf.py --port /dev/ttyUSB0 monitor -``` - -Use `Ctrl+]` to exit. - -The previous two commands can be combined: -``` -idf.py --port /dev/ttyUSB0 flash monitor -``` - -## Deploy to himax WE1 EVB - -The following instructions will help you build and deploy this example to -[HIMAX WE1 EVB](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_board_brief) -board. To undstand more about using this board, please check -[HIMAX WE1 EVB user guide](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide). - -### Initial Setup - -To use the HIMAX WE1 EVB, please make sure following software are installed: - -#### MetaWare Development Toolkit - -See -[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](/tensorflow/lite/micro/tools/make/targets/arc/README.md#install-the-synopsys-designware-arc-metaware-development-toolkit) -section for instructions on toolchain installation. - -#### Make Tool version - -A `'make'` tool is required for deploying Tensorflow Lite Micro -applications on HIMAX WE1 EVB, See -[Check make tool version](/tensorflow/lite/micro/tools/make/targets/arc/README.md#make-tool) -section for proper environment. - -#### Serial Terminal Emulation Application - -There are 2 main purposes for HIMAX WE1 EVB Debug UART port - -- print application output -- burn application to flash by using xmodem send application binary - -You can use any terminal emulation program (like [PuTTY](https://www.putty.org/) or [minicom](https://linux.die.net/man/1/minicom)). 
- - -### Generate Example Project - -The example project for HIMAX WE1 EVB platform can be generated with the following -command: - -Download related third party data - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=himax_we1_evb third_party_downloads -``` - -Generate hello world project - -``` -make -f tensorflow/lite/micro/tools/make/Makefile generate_hello_world_make_project TARGET=himax_we1_evb TAGS=no_arc_mli -``` - -### Build and Burn Example - -Following the Steps to run hello world example at HIMAX WE1 EVB platform. - -1. Go to the generated example project directory. - - ``` - cd tensorflow/lite/micro/tools/make/gen/himax_we1_evb_arc/prj/hello_world/make - ``` - -2. Build the example using - - ``` - make app - ``` - -3. After example build finish, copy ELF file and map file to image generate tool directory. - image generate tool directory located at `'tensorflow/lite/micro/tools/make/downloads/himax_we1_sdk/image_gen_linux_v3/'` - - ``` - cp hello_world.elf himax_we1_evb.map ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -4. Go to flash image generate tool directory. - - ``` - cd ../../../../../downloads/himax_we1_sdk/image_gen_linux_v3/ - ``` - -5. run image generate tool, generate flash image file. - - * Before running image generate tool, by typing `sudo chmod +x image_gen` - and `sudo chmod +x sign_tool` to make sure it is executable. - - ``` - image_gen -e hello_world.elf -m himax_we1_evb.map -o out.img - ``` - - -6. Download flash image file to HIMAX WE1 EVB by UART: - - * more detail about download image through UART can be found at [HIMAX WE1 EVB update Flash image](https://github.com/HimaxWiseEyePlus/bsp_tflu/tree/master/HIMAX_WE1_EVB_user_guide#flash-image-update) - -After these steps, press reset button on the HIMAX WE1 EVB, you will see application output in the serial -terminal. - -## Deploy to SparkFun Edge - -The following instructions will help you build and deploy this sample on the -[SparkFun Edge development board](https://sparkfun.com/products/15170). - -![Animation on SparkFun Edge](images/animation_on_sparkfun_edge.gif) - -If you're new to using this board, we recommend walking through the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab to get an understanding of the workflow. - -### Compile the binary - -The following command will download the required dependencies and then compile a -binary for the SparkFun Edge: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=sparkfun_edge hello_world_bin -``` - -The binary will be created in the following location: - -``` -tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin -``` - -### Sign the binary - -The binary must be signed with cryptographic keys to be deployed to the device. -We'll now run some commands that will sign our binary so it can be flashed to -the SparkFun Edge. The scripts we are using come from the Ambiq SDK, which is -downloaded when the `Makefile` is run. 
- -Enter the following command to set up some dummy cryptographic keys we can use -for development: - -``` -cp tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info0.py \ -tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/keys_info.py -``` - -Next, run the following command to create a signed binary: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_image_blob.py \ ---bin tensorflow/lite/micro/tools/make/gen/sparkfun_edge_cortex-m4/bin/hello_world.bin \ ---load-address 0xC000 \ ---magic-num 0xCB \ --o main_nonsecure_ota \ ---version 0x0 -``` - -This will create the file `main_nonsecure_ota.bin`. We'll now run another -command to create a final version of the file that can be used to flash our -device with the bootloader script we will use in the next step: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/create_cust_wireupdate_blob.py \ ---load-address 0x20000 \ ---bin main_nonsecure_ota.bin \ --i 6 \ --o main_nonsecure_wire \ ---options 0x1 -``` - -You should now have a file called `main_nonsecure_wire.bin` in the directory -where you ran the commands. This is the file we'll be flashing to the device. - -### Flash the binary - -Next, attach the board to your computer via a USB-to-serial adapter. - -**Note:** If you're using the [SparkFun Serial Basic Breakout](https://www.sparkfun.com/products/15096), -you should [install the latest drivers](https://learn.sparkfun.com/tutorials/sparkfun-serial-basic-ch340c-hookup-guide#drivers-if-you-need-them) -before you continue. - -Once connected, assign the USB device name to an environment variable: - -``` -export DEVICENAME=put your device name here -``` - -Set another variable with the baud rate: - -``` -export BAUD_RATE=921600 -``` - -Now, hold the button marked `14` on the device. While still holding the button, -hit the button marked `RST`. Continue holding the button marked `14` while -running the following command: - -``` -python3 tensorflow/lite/micro/tools/make/downloads/AmbiqSuite-Rel2.2.0/tools/apollo3_scripts/uart_wired_update.py \ --b ${BAUD_RATE} ${DEVICENAME} \ --r 1 \ --f main_nonsecure_wire.bin \ --i 6 -``` - -You should see a long stream of output as the binary is flashed to the device. -Once you see the following lines, flashing is complete: - -``` -Sending Reset Command. -Done. -``` - -If you don't see these lines, flashing may have failed. Try running through the -steps in [Flash the binary](#flash-the-binary) again (you can skip over setting -the environment variables). If you continue to run into problems, follow the -[AI on a microcontroller with TensorFlow Lite and SparkFun Edge](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow) -codelab, which includes more comprehensive instructions for the flashing -process. - -The binary should now be deployed to the device. Hit the button marked `RST` to -reboot the board. You should see the device's four LEDs flashing in sequence. - -Debug information is logged by the board while the program is running. To view -it, establish a serial connection to the board using a baud rate of `115200`. -On OSX and Linux, the following command should work: - -``` -screen ${DEVICENAME} 115200 -``` - -You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, -immediately followed by `Esc`. 
You can then use the arrow keys to explore the -output, which will contain the results of running inference on various `x` -values: - -``` -x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 -``` - -To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately -followed by the `K` key, then hit the `Y` key. - - -## Deploy to STM32F746 - -The following instructions will help you build and deploy the sample to the -[STM32F7 discovery kit](https://os.mbed.com/platforms/ST-Discovery-F746NG/) -using [ARM Mbed](https://github.com/ARMmbed/mbed-cli). - -![Animation on STM32F746](images/animation_on_STM32F746.gif) - -Before we begin, you'll need the following: - -- STM32F7 discovery kit board -- Mini-USB cable -- ARM Mbed CLI ([installation instructions](https://os.mbed.com/docs/mbed-os/v5.12/tools/installation-and-setup.html)) -- Python 2.7 and pip - -Since Mbed requires a special folder structure for projects, we'll first run a -command to generate a subfolder containing the required source files in this -structure: - -``` -make -f tensorflow/lite/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_hello_world_mbed_project -``` - -This will result in the creation of a new folder: - -``` -tensorflow/lite/micro/tools/make/gen/mbed_cortex-m4/prj/hello_world/mbed -``` - -This folder contains all of the example's dependencies structured in the correct -way for Mbed to be able to build it. - -Change into the directory and run the following commands, making sure you are -using Python 2.7.15. - -First, tell Mbed that the current directory is the root of an Mbed project: - -``` -mbed config root . -``` - -Next, tell Mbed to download the dependencies and prepare to build: - -``` -mbed deploy -``` - -By default, Mbed will build the project using C++98. However, TensorFlow Lite -requires C++11. Run the following Python snippet to modify the Mbed -configuration files so that it uses C++11: - -``` -python -c 'import fileinput, glob; -for filename in glob.glob("mbed-os/tools/profiles/*.json"): - for line in fileinput.input(filename, inplace=True): - print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")' - -``` - -Finally, run the following command to compile: - -``` -mbed compile -m DISCO_F746NG -t GCC_ARM -``` - -This should result in a binary at the following path: - -``` -./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin -``` - -To deploy, plug in your STM board and copy the file to it. On MacOS, you can do -this with the following command: - -``` -cp ./BUILD/DISCO_F746NG/GCC_ARM/mbed.bin /Volumes/DIS_F746NG/ -``` - -Copying the file will initiate the flashing process. Once this is complete, you -should see an animation on the device's screen. - - -``` -screen /dev/tty.usbmodem14403 9600 -``` - -In addition to this animation, debug information is logged by the board while -the program is running. To view it, establish a serial connection to the board -using a baud rate of `9600`. On OSX and Linux, the following command should -work, replacing `/dev/tty.devicename` with the name of your device as it appears -in `/dev`: - -``` -screen /dev/tty.devicename 9600 -``` - -You will see a lot of output flying past! To stop the scrolling, hit `Ctrl+A`, -immediately followed by `Esc`. 
You can then use the arrow keys to explore the -output, which will contain the results of running inference on various `x` -values: - -``` -x_value: 1.1843798*2^2, y_value: -1.9542645*2^-1 -``` - -To stop viewing the debug output with `screen`, hit `Ctrl+A`, immediately -followed by the `K` key, then hit the `Y` key. - -### Run the tests on a development machine - -To compile and test this example on a desktop Linux or macOS machine, first -clone the TensorFlow repository from GitHub to a convenient place: - -```bash -git clone --depth 1 https://github.com/tensorflow/tensorflow.git -``` - -Next, `cd` into the source directory from a terminal, and then run the following -command: - -```bash -make -f tensorflow/lite/micro/tools/make/Makefile test_hello_world_test -``` - -This will take a few minutes, and downloads frameworks the code uses. Once the -process has finished, you should see a series of files get compiled, followed by -some logging output from a test, which should conclude with -`~~~ALL TESTS PASSED~~~`. - -If you see this, it means that a small program has been built and run that loads -the trained TensorFlow model, runs some example inputs through it, and got the -expected outputs. - -To understand how TensorFlow Lite does this, you can look at the source in -[hello_world_test.cc](hello_world_test.cc). -It's a fairly small amount of code that creates an interpreter, gets a handle to -a model that's been compiled into the program, and then invokes the interpreter -with the model and sample inputs. - -### Train your own model - -So far you have used an existing trained model to run inference on -microcontrollers. If you wish to train your own model, follow the instructions -given in the [train/](train/) directory. - From d6676205f20e6a9476f6e0eca8f5b00367f9c623 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Fri, 5 Jun 2020 10:16:04 +0800 Subject: [PATCH 0087/1390] TFLM: remove temp file in person detection example --- .../himax_we1_evb/image_provider.cc~ | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ deleted file mode 100644 index d5b4d136642..00000000000 --- a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc~ +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/micro/examples/person_detection_experimental/image_provider.h" - -#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h" - -#include "hx_drv_tflm.h" - -hx_drv_sensor_image_config_t g_pimg_config; - - -TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int image_width, - int image_height, int channels, int8_t* image_data) { - static bool is_initialized = false; - - if (!is_initialized) { - if(hx_drv_sensor_initial(&g_pimg_config)!= HX_DRV_LIB_PASS) - { - return kTfLiteError; - } - is_initialized = true; - } - - hx_drv_sensor_capture(&g_pimg_config); - - hx_drv_image_rescale((uint8_t*)g_pimg_config.raw_address, g_pimg_config.img_width, g_pimg_config.img_height, - image_data, image_data, image_height); - - - return kTfLiteOk; -} From 4f222bed159eeb2743fb8c97fa5e36d5bea4e5da Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Fri, 5 Jun 2020 07:49:19 +0000 Subject: [PATCH 0088/1390] Fixed copyright --- tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h index e248f6de8bd..a28d48b7eb0 100644 --- a/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h +++ b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
From 66eb52eaee54576c7c1b5fe887d5d0400b557f86 Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Fri, 5 Jun 2020 08:19:55 +0000 Subject: [PATCH 0089/1390] Fixed build error in one of the xla tests --- tensorflow/compiler/xla/service/cpu/tests/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 18624330a26..9036c5c9024 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -138,6 +138,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", From 4d7aadeda14cdc41606faaf2bf397a3904847d7d Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 5 Jun 2020 16:49:23 +0800 Subject: [PATCH 0090/1390] Update tf_generated_ops.td --- tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 1c9297485b4..467a119c174 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -8651,11 +8651,11 @@ particular, begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0) end = [2, 4, x, x, -3, x] strides = [1, 1, x, x, -1, 1] -begin_mask = 1<<4 | 1 << 5 = 48 +begin_mask = 1<<4 | 1<<5 = 48 end_mask = 1<<5 = 32 ellipsis_mask = 1<<3 = 8 -new_axis_mask = 1<<2 4 -shrink_axis_mask = 1<<0 +new_axis_mask = 1<<2 = 4 +shrink_axis_mask = 1<<0 = 1 ``` In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of From d8bb6569bc6c820b098fc0b56c3c1a7a318422a9 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Fri, 5 Jun 2020 18:55:32 -0400 Subject: [PATCH 0091/1390] correct summing total blocks --- tensorflow/python/keras/applications/efficientnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/applications/efficientnet.py b/tensorflow/python/keras/applications/efficientnet.py index e1413b08533..e3c1a261e80 100644 --- a/tensorflow/python/keras/applications/efficientnet.py +++ b/tensorflow/python/keras/applications/efficientnet.py @@ -334,7 +334,7 @@ def EfficientNet( blocks_args = copy.deepcopy(blocks_args) b = 0 - blocks = float(sum(args['repeats'] for args in blocks_args)) + blocks = float(sum(round_repeats(args['repeats']) for args in blocks_args)) for (i, args) in enumerate(blocks_args): assert args['repeats'] > 0 # Update block input and output filters based on depth multiplier. 
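The `round_repeats` change in the patch above matters because the per-block drop_connect rate is scaled by the running block index divided by this total; if the total ignores depth scaling, later blocks get drop rates above the configured maximum. Below is a minimal, self-contained sketch of that arithmetic — the rounding helper, block args, and depth coefficient are illustrative stand-ins, not the actual EfficientNet configuration or the Keras implementation.

```python
# Illustrative sketch (not the Keras source) of why the drop_connect
# denominator must count the depth-scaled blocks.
import math

def round_repeats(repeats, depth_coefficient):
  # Same rounding rule that EfficientNet depth scaling applies per block group.
  return int(math.ceil(depth_coefficient * repeats))

blocks_args = [{'repeats': 1}, {'repeats': 2}, {'repeats': 2}]  # hypothetical
depth_coefficient = 1.8
drop_connect_rate = 0.2

naive_total = float(sum(a['repeats'] for a in blocks_args))           # 5.0
scaled_total = float(sum(round_repeats(a['repeats'], depth_coefficient)
                         for a in blocks_args))                       # 10.0

b = scaled_total - 1  # index of the last block that actually gets built
print(drop_connect_rate * b / naive_total)   # 0.36 -- exceeds the configured 0.2
print(drop_connect_rate * b / scaled_total)  # 0.18 -- stays within the schedule
```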
From 8b81960fcb60222b03f04751e5f99a36b24d27ca Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 6 Jun 2020 12:00:15 +0800 Subject: [PATCH 0092/1390] Update api_def_StridedSlice.pbtxt --- tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt index 9a89a4e8e75..2714e31ac28 100644 --- a/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt @@ -123,11 +123,11 @@ particular, begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0) end = [2, 4, x, x, -3, x] strides = [1, 1, x, x, -1, 1] -begin_mask = 1<<4 | 1 << 5 = 48 +begin_mask = 1<<4 | 1<<5 = 48 end_mask = 1<<5 = 32 ellipsis_mask = 1<<3 = 8 -new_axis_mask = 1<<2 4 -shrink_axis_mask = 1<<0 +new_axis_mask = 1<<2 = 4 +shrink_axis_mask = 1<<0 = 1 ``` In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of From b8f57874ba13cf24a54878c3f4f6cd3f387a0c44 Mon Sep 17 00:00:00 2001 From: cclauss Date: Sun, 7 Jun 2020 15:56:59 +0200 Subject: [PATCH 0093/1390] Fix SyntaxWarnings on Python >= 3.8 --- tensorflow/python/kernel_tests/matrix_band_part_op_test.py | 2 +- tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py | 2 +- .../python/ops/ragged/ragged_batch_gather_with_default_op.py | 3 +-- tensorflow/python/ops/random_ops.py | 4 ++-- tensorflow/tools/docs/doc_generator_visitor.py | 5 ++--- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py index fdb7e4a1a4e..25b502cf814 100644 --- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py @@ -56,7 +56,7 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_): band_np = np.triu(band_np, -lower) if upper >= 0: band_np = np.tril(band_np, upper) - if batch_shape_ is not (): + if batch_shape_ != (): band_np = np.tile(band_np, batch_shape_ + (1, 1)) for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]: with self.cached_session(use_gpu=False): diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py index b7a159e2eff..889ea0dbd6c 100644 --- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py @@ -107,7 +107,7 @@ class MatrixSolveLsOpTest(test_lib.TestCase): np_ans = _SolveWithNumpy(x, y, l2_regularizer=l2_regularizer) np_r = np.dot(np.conj(a.T), b - np.dot(a, np_ans)) np_r_norm = np.sqrt(np.sum(np.conj(np_r) * np_r)) - if batch_shape is not (): + if batch_shape != (): a = np.tile(a, batch_shape + (1, 1)) b = np.tile(b, batch_shape + (1, 1)) np_ans = np.tile(np_ans, batch_shape + (1, 1)) diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py index 377fd84f96e..06690f86a50 100644 --- a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py +++ b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py @@ -81,8 +81,7 @@ def batch_gather_with_default(params, return_dtype=True)) # TODO(hterry): lift this restriction and support default_values of # of rank > 1 - if (default_value.shape.ndims is not 0 - and default_value.shape.ndims is not 1): + if 
default_value.shape.ndims not in (0, 1): raise ValueError('"default_value" must be a scalar or vector') upper_bounds = None if indices.shape.ndims is None: diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index 1af91ed0dd3..9932a76b678 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -288,8 +288,8 @@ def random_uniform(shape, shape = tensor_util.shape_tensor(shape) # In case of [0,1) floating results, minval and maxval is unused. We do an # `is` comparison here since this is cheaper than isinstance or __eq__. - minval_is_zero = minval is 0 # pylint: disable=literal-comparison - maxval_is_one = maxval is 1 # pylint: disable=literal-comparison + minval_is_zero = minval == 0 + maxval_is_one = maxval == 1 if not minval_is_zero or not maxval_is_one or dtype.is_integer: minval = ops.convert_to_tensor(minval, dtype=dtype, name="min") maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max") diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py index ec2102a5935..ac5b09346ec 100644 --- a/tensorflow/tools/docs/doc_generator_visitor.py +++ b/tensorflow/tools/docs/doc_generator_visitor.py @@ -240,10 +240,9 @@ class DocGeneratorVisitor(object): # We cannot use the duplicate mechanism for some constants, since e.g., # id(c1) == id(c2) with c1=1, c2=1. This is unproblematic since constants # have no usable docstring and won't be documented automatically. - if (py_object is not None and + if (py_object not in (None, ()) not isinstance(py_object, six.integer_types + six.string_types + - (six.binary_type, six.text_type, float, complex, bool)) - and py_object is not ()): # pylint: disable=literal-comparison + (six.binary_type, six.text_type, float, complex, bool))): object_id = id(py_object) if object_id in reverse_index: master_name = reverse_index[object_id] From e474fc8ebaff4100c427a75926518fce8eb9807a Mon Sep 17 00:00:00 2001 From: Sean Settle Date: Sun, 7 Jun 2020 19:14:41 -0700 Subject: [PATCH 0094/1390] Re-generated the full Dockerfiles --- .../devel-cpu-arm64v8-jupyter.Dockerfile | 134 ++++++++++++++++++ .../arm64v8/devel-cpu-arm64v8.Dockerfile | 108 ++++++++++++++ .../devel-cpu-ppc64le-jupyter.Dockerfile | 2 +- .../ppc64le/devel-cpu-ppc64le.Dockerfile | 2 +- .../devel-gpu-ppc64le-jupyter.Dockerfile | 2 +- .../ppc64le/devel-gpu-ppc64le.Dockerfile | 2 +- 6 files changed, 246 insertions(+), 4 deletions(-) create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile new file mode 100644 index 00000000000..704b1b344aa --- /dev/null +++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile @@ -0,0 +1,134 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG UBUNTU_VERSION=18.04 + +FROM ubuntu:${UBUNTU_VERSION} AS base + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git \ + libcurl3-dev \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libzmq3-dev \ + pkg-config \ + rsync \ + software-properties-common \ + sudo \ + unzip \ + zip \ + zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV CI_BUILD_PYTHON python + +# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version +ARG CACHE_STOP=1 +# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1 +ARG CHECKOUT_TF_SRC=0 +# In case of Python 2.7+ we need to add passwd entries for user and group id +RUN chmod a+w /etc/passwd /etc/group +RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true + +# See http://bugs.python.org/issue19846 +ENV LANG C.UTF-8 + +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip + +RUN python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python + +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + openjdk-8-jdk \ + python3-dev \ + virtualenv \ + swig + +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-h5py \ + python3-keras-preprocessing \ + python3-matplotlib \ + python3-mock \ + python3-numpy \ + python3-scipy \ + python3-sklearn \ + python3-pandas \ + python3-portpicker + +RUN python3 -m pip --no-cache-dir install \ + enum34 + +# Build and install bazel +ENV BAZEL_VERSION 3.0.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \ + unzip bazel-$BAZEL_VERSION-dist.zip && \ + bash ./compile.sh && \ + cp output/bazel /usr/local/bin/ && \ + rm -rf /bazel && \ + cd - + +COPY bashrc /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc + +RUN python3 -m pip install --no-cache-dir jupyter matplotlib +# Pin ipykernel and nbformat; see https://github.com/ipython/ipykernel/issues/422 +RUN python3 -m pip install --no-cache-dir jupyter_http_over_ws ipykernel==5.1.1 nbformat==4.4.0 +RUN jupyter serverextension enable --py jupyter_http_over_ws + +RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/ +RUN mkdir /.local && chmod a+rwx /.local +RUN apt-get install -y --no-install-recommends wget +# some examples require git to fetch dependencies +RUN apt-get install -y --no-install-recommends git +WORKDIR /tf/tensorflow-tutorials +RUN wget 
https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/classification.ipynb +RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/overfit_and_underfit.ipynb +RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/regression.ipynb +RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/save_and_load.ipynb +RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/text_classification.ipynb +RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/text_classification_with_hub.ipynb +COPY readme-for-jupyter.md README.md +RUN apt-get autoremove -y && apt-get remove -y wget +WORKDIR /tf +EXPOSE 8888 + +RUN python3 -m ipykernel.kernelspec + +CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"] diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile new file mode 100644 index 00000000000..10e4512cf29 --- /dev/null +++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile @@ -0,0 +1,108 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. 
+ +ARG UBUNTU_VERSION=18.04 + +FROM ubuntu:${UBUNTU_VERSION} AS base + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git \ + libcurl3-dev \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libzmq3-dev \ + pkg-config \ + rsync \ + software-properties-common \ + sudo \ + unzip \ + zip \ + zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV CI_BUILD_PYTHON python + +# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version +ARG CACHE_STOP=1 +# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1 +ARG CHECKOUT_TF_SRC=0 +# In case of Python 2.7+ we need to add passwd entries for user and group id +RUN chmod a+w /etc/passwd /etc/group +RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true + +# See http://bugs.python.org/issue19846 +ENV LANG C.UTF-8 + +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip + +RUN python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python + +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + openjdk-8-jdk \ + python3-dev \ + virtualenv \ + swig + +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-h5py \ + python3-keras-preprocessing \ + python3-matplotlib \ + python3-mock \ + python3-numpy \ + python3-scipy \ + python3-sklearn \ + python3-pandas \ + python3-portpicker + +RUN python3 -m pip --no-cache-dir install \ + enum34 + +# Build and install bazel +ENV BAZEL_VERSION 3.0.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \ + unzip bazel-$BAZEL_VERSION-dist.zip && \ + bash ./compile.sh && \ + cp output/bazel /usr/local/bin/ && \ + rm -rf /bazel && \ + cd - + +COPY bashrc /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile index 53ccffd1403..905faca7893 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile @@ -90,7 +90,7 @@ RUN python3 -m pip --no-cache-dir install \ portpicker \ enum34 - # Build and install bazel +# Build and install bazel ENV BAZEL_VERSION 3.0.0 WORKDIR / RUN mkdir /bazel && \ diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile index 1bbe7129479..378c5f8279b 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile @@ -90,7 +90,7 @@ RUN python3 -m pip --no-cache-dir install \ portpicker \ enum34 - # Build and install bazel +# Build and install bazel ENV BAZEL_VERSION 3.0.0 WORKDIR / RUN mkdir /bazel && \ diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile index 0700a354d3c..083ce05d2b2 100644 --- 
a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile @@ -132,7 +132,7 @@ RUN python3 -m pip --no-cache-dir install \ portpicker \ enum34 - # Build and install bazel +# Build and install bazel ENV BAZEL_VERSION 3.0.0 WORKDIR / RUN mkdir /bazel && \ diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile index b6d8ff8b90e..2c13e47c257 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile @@ -132,7 +132,7 @@ RUN python3 -m pip --no-cache-dir install \ portpicker \ enum34 - # Build and install bazel +# Build and install bazel ENV BAZEL_VERSION 3.0.0 WORKDIR / RUN mkdir /bazel && \ From 7fea4ccf1c91ff52ef1fbd36db6a1743d147e253 Mon Sep 17 00:00:00 2001 From: Sean Settle Date: Sun, 7 Jun 2020 20:34:46 -0700 Subject: [PATCH 0095/1390] Re-generated the full Dockerfiles --- .../dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile | 2 +- .../dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile | 2 +- .../partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile index 704b1b344aa..168e57d363a 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile @@ -80,7 +80,6 @@ RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \ python3-pil \ python3-h5py \ - python3-keras-preprocessing \ python3-matplotlib \ python3-mock \ python3-numpy \ @@ -90,6 +89,7 @@ RUN apt-get update && apt-get install -y \ python3-portpicker RUN python3 -m pip --no-cache-dir install \ + keras_preprocessing \ enum34 # Build and install bazel diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile index 10e4512cf29..70d6df8df14 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile @@ -80,7 +80,6 @@ RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \ python3-pil \ python3-h5py \ - python3-keras-preprocessing \ python3-matplotlib \ python3-mock \ python3-numpy \ @@ -90,6 +89,7 @@ RUN apt-get update && apt-get install -y \ python3-portpicker RUN python3 -m pip --no-cache-dir install \ + keras_preprocessing \ enum34 # Build and install bazel diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile index b833657aa69..2f923e84737 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile @@ -10,7 +10,6 @@ RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \ python3-pil \ python3-h5py \ - python3-keras-preprocessing \ python3-matplotlib \ 
python3-mock \ python3-numpy \ @@ -20,6 +19,7 @@ RUN apt-get update && apt-get install -y \ python3-portpicker RUN python3 -m pip --no-cache-dir install \ + keras_preprocessing \ enum34 # Build and install bazel From b9db1ee4174f26a94ac332ff8f60c9e0152403a8 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Mon, 8 Jun 2020 16:53:21 +0800 Subject: [PATCH 0096/1390] sync third_party_downloads to avoid conflict --- .../micro/tools/make/third_party_downloads.inc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 75a51e0df10..85016dc49b6 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -62,14 +62,14 @@ RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" -IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_4_14.zip" -IMAGE_RECOGNITION_MODEL_MD5 := "2b886156e7ef4d6e53d0f1a4bc800e56" +IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_05_27.zip" +IMAGE_RECOGNITION_MODEL_MD5 := "1f4607b05ac45b8a6146fb883dbc2d7b" -PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2019_11_21.zip" -PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" +PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2020_05_27.zip" +PERSON_MODEL_MD5 := "55b85f76e2995153e660391d4a209ef1" -PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" -PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" +PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_05_27.zip" +PERSON_MODEL_INT8_MD5 := "a0ede2d058aa2a1d413893455dd55352" EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" @@ -77,9 +77,15 @@ EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" +ZEPHYR_URL := "https://github.com/antmicro/zephyr/archive/55e36b9.zip" +ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b" + XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" +ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-bcb5aaa99756f1b5c1295b079ebdd60996bc75a5.tar.gz" +ETHOSU_MD5 := "d2073c8d88fc167fd5c46b5dcda58ea1" + HIMAX_WE1_SDK_URL ="https://www.himax.com.tw/we-i/himax_we1_sdk_v02.zip" HIMAX_WE1_SDK_MD5 ="9a4b2f29b16052764e437b64bdcba816" From 60f6c58f9ea92c53c7434129045bc72761a5080a Mon Sep 17 00:00:00 2001 From: Steenu Johnson Date: Mon, 8 Jun 2020 17:57:31 +0530 Subject: [PATCH 0097/1390] This commit creates a new 
op CSVDatasetV2 with an additional parameter exclude_cols for backwards compatibility. Other changes include: Making the new op forward compatible. Adding error info to the docstring. Adding couple of tests to verify errors are being raised. Signed-off-by: Steenu Johnson --- .../data/experimental/csv_dataset_op.cc | 28 ++++++---- .../core/ops/experimental_dataset_ops.cc | 56 ++++++++++++++++--- .../kernel_tests/csv_dataset_test.py | 31 ++++++++++ .../python/data/experimental/ops/readers.py | 49 +++++++++++----- 4 files changed, 133 insertions(+), 31 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc index 62d27294f04..d2023ecec6e 100644 --- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc @@ -28,7 +28,9 @@ namespace { class CSVDatasetOp : public DatasetOpKernel { public: - explicit CSVDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) { + explicit CSVDatasetOp(OpKernelConstruction* ctx) + : DatasetOpKernel(ctx), + op_version_(ctx->def().op() == "CSVDataset" ? 1 : 2) { OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); } @@ -62,8 +64,11 @@ class CSVDatasetOp : public DatasetOpKernel { OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, errors::InvalidArgument("`select_cols` must be a vector.")); - const Tensor* exclude_cols_tensor; - OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); + const Tensor* exclude_cols_tensor = new const Tensor(); + if (op_version_ > 1) { + OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); + } + OP_REQUIRES(ctx, exclude_cols_tensor->dims() == 1, errors::InvalidArgument("`exclude_cols` must be a vector")); @@ -138,7 +143,7 @@ class CSVDatasetOp : public DatasetOpKernel { } OP_REQUIRES(ctx, select_cols.empty() || exclude_cols.empty(), errors::InvalidArgument( - "Either select_cols or exlcude_cols should be empty")); + "Either select_cols or exclude_cols should be empty")); for (int i = 1; i < exclude_cols.size(); i++) { OP_REQUIRES(ctx, exclude_cols[i - 1] < exclude_cols[i], errors::InvalidArgument( @@ -238,8 +243,8 @@ class CSVDatasetOp : public DatasetOpKernel { std::make_pair(2, buffer_size), std::make_pair(3, header), std::make_pair(4, delim), std::make_pair(5, use_quote_delim), std::make_pair(6, na_value), std::make_pair(7, select_cols), - std::make_pair(8, exclude_cols)}, // Single tensor inputs - {std::make_pair(9, record_defaults)}, // Tensor list inputs + std::make_pair(9, exclude_cols)}, // Single tensor inputs + {std::make_pair(8, record_defaults)}, // Tensor list inputs {}, output)); return Status::OK(); @@ -386,12 +391,12 @@ class CSVDatasetOp : public DatasetOpKernel { Status result; while (!end_of_record) { // Read till we reach \n, \r or EOF - bool exclude = num_excluded_parsed < excluded.size() && - excluded[num_excluded_parsed] == num_parsed; + bool explicit_exclude = num_excluded_parsed < excluded.size() && + excluded[num_excluded_parsed] == num_parsed; bool include = select_all || (num_selected_parsed < selected.size() && selected[num_selected_parsed] == num_parsed) || - (!excluded.empty() && !exclude); + (!excluded.empty() && !explicit_exclude); // Don't fail fast, so that the next call to GetNext may still return // a valid record @@ -400,7 +405,7 @@ class CSVDatasetOp : public DatasetOpKernel { num_parsed++; if (include) 
num_selected_parsed++; - if (exclude) num_excluded_parsed++; + if (explicit_exclude) num_excluded_parsed++; } return result; @@ -894,6 +899,8 @@ class CSVDatasetOp : public DatasetOpKernel { const io::ZlibCompressionOptions options_; }; // class Dataset + const int op_version_; + DataTypeVector output_types_; std::vector output_shapes_; }; // class CSVDatasetOp @@ -901,6 +908,7 @@ class CSVDatasetOp : public DatasetOpKernel { REGISTER_KERNEL_BUILDER(Name("CSVDataset").Device(DEVICE_CPU), CSVDatasetOp); REGISTER_KERNEL_BUILDER(Name("ExperimentalCSVDataset").Device(DEVICE_CPU), CSVDatasetOp); +REGISTER_KERNEL_BUILDER(Name("CSVDatasetV2").Device(DEVICE_CPU), CSVDatasetOp); } // namespace } // namespace experimental diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index 9910472a2c6..33aa416fe56 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -154,7 +154,6 @@ REGISTER_OP("CSVDataset") .Input("use_quote_delim: bool") .Input("na_value: string") .Input("select_cols: int64") - .Input("exclude_cols: int64") .Input("record_defaults: output_types") .Output("handle: variant") .Attr("output_types: list({float,double,int32,int64,string}) >= 1") @@ -175,10 +174,8 @@ REGISTER_OP("CSVDataset") TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // `select_cols` must be a vector TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &unused)); - //`exclude_cols` must be a vecotr - TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 1, &unused)); // `record_defaults` must be lists of scalars - for (size_t i = 9; i < c->num_inputs(); ++i) { + for (size_t i = 8; i < c->num_inputs(); ++i) { shape_inference::ShapeHandle v; TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(i), 1, &v)); if (c->Rank(c->input(i)) == 1 && c->Value(c->Dim(v, 0)) > 1) { @@ -190,7 +187,7 @@ REGISTER_OP("CSVDataset") return shape_inference::ScalarShape(c); }); -REGISTER_OP("ExperimentalCSVDataset") +REGISTER_OP("CSVDatasetV2") .Input("filenames: string") .Input("compression_type: string") .Input("buffer_size: int64") @@ -199,8 +196,8 @@ REGISTER_OP("ExperimentalCSVDataset") .Input("use_quote_delim: bool") .Input("na_value: string") .Input("select_cols: int64") - .Input("exclude_cols: int64") .Input("record_defaults: output_types") + .Input("exclude_cols: int64") .Output("handle: variant") .Attr("output_types: list({float,double,int32,int64,string}) >= 1") .Attr("output_shapes: list(shape) >= 1") @@ -221,9 +218,52 @@ REGISTER_OP("ExperimentalCSVDataset") // `select_cols` must be a vector TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &unused)); // `exclude_cols` must be a vector - TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 1, &unused)); + TF_RETURN_IF_ERROR( + c->WithRank(c->input(c->num_inputs() - 1), 1, &unused)); // `record_defaults` must be lists of scalars - for (size_t i = 9; i < c->num_inputs(); ++i) { + for (size_t i = 8; i < c->num_inputs() - 1; ++i) { + shape_inference::ShapeHandle v; + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(i), 1, &v)); + if (c->Rank(c->input(i)) == 1 && c->Value(c->Dim(v, 0)) > 1) { + return errors::InvalidArgument( + "Shape of a default must be a length-0 or length-1 vector, or a " + "scalar."); + } + } + return shape_inference::ScalarShape(c); + }); + +REGISTER_OP("ExperimentalCSVDataset") + .Input("filenames: string") + .Input("compression_type: string") + .Input("buffer_size: int64") + .Input("header: bool") + .Input("field_delim: string") + .Input("use_quote_delim: bool") + 
.Input("na_value: string") + .Input("select_cols: int64") + .Input("record_defaults: output_types") + .Output("handle: variant") + .Attr("output_types: list({float,double,int32,int64,string}) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetDoNotOptimize() // TODO(b/123753214): Source dataset ops must + // disable constant folding. + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // `filenames` must be a scalar or a vector. + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused)); + // `compression_type`, `buffer_size`, `header`, `field_delim`, + // `use_quote_delim`, `na_value` must be scalars + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + // `select_cols` must be a vector + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &unused)); + // `record_defaults` must be lists of scalars + for (size_t i = 8; i < c->num_inputs(); ++i) { shape_inference::ShapeHandle v; TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(i), 1, &v)); if (c->Rank(c->input(i)) == 1 && c->Value(c->Dim(v, 0)) > 1) { diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 42a96812bb4..20c983cf2a9 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -424,6 +424,37 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, exclude_cols=[1, 2]) + @combinations.generate(test_base.default_test_combinations()) + def testCsvDataset_withSelectandExcludeCol(self): + record_defaults = [['']] + inputs = [['1,2,3', '5,6,7']] + self._test_dataset( + inputs, + expected_err_re='Either select_cols or exclude_cols should be empty', + record_defaults=record_defaults, + select_cols=[0], + exclude_cols=[1, 2]) + + @combinations.generate(test_base.default_test_combinations()) + def testCsvDataset_withExcludeColandRecordDefaultsTooLow(self): + record_defaults = [['']] + inputs = [['1,2,3', '5,6,7']] + self._test_dataset( + inputs, + expected_err_re='Expect 1 fields but have more in record', + record_defaults=record_defaults, + exclude_cols=[0]) + + @combinations.generate(test_base.default_test_combinations()) + def testCsvDataset_withExcludeColandRecordDefaultsTooHigh(self): + record_defaults = [['']]*3 + inputs = [['1,2,3', '5,6,7']] + self._test_dataset( + inputs, + expected_err_re='Expect 3 fields but have 2 in record', + record_defaults=record_defaults, + exclude_cols=[0]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleNewLines(self): # In this case, we expect it to behave differently from diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index fcf92f5aaf9..14a507580ad 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -41,6 +41,7 @@ from tensorflow.python.ops import gen_experimental_dataset_ops from tensorflow.python.ops import io_ops from tensorflow.python.platform import gfile from tensorflow.python.util.tf_export import tf_export 
+from tensorflow.python.compat import compat _ACCEPTABLE_CSV_TYPES = (dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.string) @@ -661,7 +662,9 @@ class CsvDatasetV2(dataset_ops.DatasetSource): column if it is optional, or `DType` or empty `Tensor` if required. If both this and `select_columns` are specified, these must have the same lengths, and `column_defaults` is assumed to be sorted in order of - increasing column index. + increasing column index. If both this and 'exclude_cols' are specified, + the sum of lengths of record_defaults and exclude_cols should equal + the total number of columns in the CSV file. compression_type: (Optional.) A `tf.string` scalar evaluating to one of `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression. @@ -683,6 +686,13 @@ class CsvDatasetV2(dataset_ops.DatasetSource): exclude_cols:(Optional.) A sorted list of column indices to exclude from the input data. If specified, only the complement of this set of column will be parsed. Defaults to parsing all columns. + + Raises: + InvalidArgumentError: If exclude_cols is not None and + len(exclude_cols) + len(record_defaults) does not match the total + number of columns in the file(s) + + """ self._filenames = ops.convert_to_tensor( filenames, dtype=dtypes.string, name="filenames") @@ -722,18 +732,31 @@ class CsvDatasetV2(dataset_ops.DatasetSource): ) self._element_spec = tuple( tensor_spec.TensorSpec([], d.dtype) for d in self._record_defaults) - variant_tensor = gen_experimental_dataset_ops.csv_dataset( - filenames=self._filenames, - record_defaults=self._record_defaults, - buffer_size=self._buffer_size, - header=self._header, - output_shapes=self._flat_shapes, - field_delim=self._field_delim, - use_quote_delim=self._use_quote_delim, - na_value=self._na_value, - select_cols=self._select_cols, - exclude_cols=self._exclude_cols, - compression_type=self._compression_type) + if compat.forward_compatible(2020, 6, 25): + variant_tensor = gen_experimental_dataset_ops.csv_dataset_v2( + filenames=self._filenames, + record_defaults=self._record_defaults, + buffer_size=self._buffer_size, + header=self._header, + output_shapes=self._flat_shapes, + field_delim=self._field_delim, + use_quote_delim=self._use_quote_delim, + na_value=self._na_value, + select_cols=self._select_cols, + exclude_cols=self._exclude_cols, + compression_type=self._compression_type) + else: + variant_tensor = gen_experimental_dataset_ops.csv_dataset( + filenames=self._filenames, + record_defaults=self._record_defaults, + buffer_size=self._buffer_size, + header=self._header, + output_shapes=self._flat_shapes, + field_delim=self._field_delim, + use_quote_delim=self._use_quote_delim, + na_value=self._na_value, + select_cols=self._select_cols, + compression_type=self._compression_type) super(CsvDatasetV2, self).__init__(variant_tensor) @property From bacc5e5927360ad119cdb9967d53df4c95c53fd4 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Mon, 8 Jun 2020 14:38:12 +0200 Subject: [PATCH 0098/1390] Fix build issues. 
--- tensorflow/lite/micro/micro_allocator.cc | 3 ++- tensorflow/lite/micro/micro_allocator_test.cc | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 8fac421750d..18c82bca57d 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -94,7 +94,7 @@ TfLiteStatus AllocateVariables( return kTfLiteOk; } - +#if !defined(__clang__) // Helper function to check flatbuffer metadata correctness. This function is // not called by default. Hence it's not linked in to the final binary code. TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, @@ -157,6 +157,7 @@ TfLiteStatus CheckOfflinePlannedOffsets(const Model* model, } return kTfLiteOk; } +#endif // A helper class to construct AllocationInfo array. This array contains the // lifetime of tensors / scratch_buffer and will be used to calculate the memory diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 67052ce12d9..ac581305340 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -238,7 +238,7 @@ TF_LITE_MICRO_TEST(TestFinishComplexTensorAllocation) { TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { int version = 1; int subgraph = 0; - int nbr_tensors = 4; + constexpr int nbr_tensors = 4; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = {version, subgraph, nbr_tensors, // header @@ -283,7 +283,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { } TF_LITE_MICRO_TEST(OfflinePlannerBasic) { - int nbr_tensors = 4; + constexpr int nbr_tensors = 4; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = {1, 0, nbr_tensors, 0, // t0 @@ -328,7 +328,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { } TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { - int nbr_tensors = 4; + constexpr int nbr_tensors = 4; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = { 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors @@ -374,7 +374,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { } TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { - int nbr_tensors = 5; + constexpr int nbr_tensors = 5; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = { 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors From 0c8343f7f3a8066c507b97ff84d8b298655cc5f4 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Mon, 8 Jun 2020 11:21:49 -0500 Subject: [PATCH 0099/1390] Address review comments --- tensorflow/core/platform/tf32_utils.cc | 10 ++++++---- tensorflow/core/platform/tf32_utils.h | 2 +- tensorflow/python/framework/config.py | 7 +++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc index 715b5996dc3..4456e768c0a 100644 --- a/tensorflow/core/platform/tf32_utils.cc +++ b/tensorflow/core/platform/tf32_utils.cc @@ -14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/platform/tf32_utils.h" +#include namespace tensorflow { -// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing. -static bool tf32_enabled = false; +// Whether TensorFloat-32 should be used where supported. 
+// TODO(nluehr): Maybe enable by default after TF32 Ampere testing. +static std::atomic tf32_allowed{false}; -void allow_tf32_execution(bool allow) { tf32_enabled = allow; } +void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; } -bool tf32_execution_allowed() { return tf32_enabled; } +bool tf32_execution_allowed() { return tf32_allowed; } } // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h index a0ce58f9bbd..7a158d00ad3 100644 --- a/tensorflow/core/platform/tf32_utils.h +++ b/tensorflow/core/platform/tf32_utils.h @@ -18,7 +18,7 @@ limitations under the License. namespace tensorflow { -void allow_tf32_execution(bool allow); +void allow_tf32_execution(bool allowed); bool tf32_execution_allowed(); diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 042af4d1023..a356e6d9a16 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -23,6 +23,8 @@ from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export + +# No tf_export until TF is built against CUDA11 which is required for TF32. def tensor_float32_execution_allowed(): """Get if TensorFloat-32 operations are enabled on supported hardware. @@ -31,7 +33,8 @@ def tensor_float32_execution_allowed(): """ return _pywrap_tf32_execution.is_allowed() -def allow_tensor_float_32_execution(allow): +# No tf_export until TF is built against CUDA11 which is required for TF32. +def allow_tensor_float_32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. @@ -47,7 +50,7 @@ def allow_tensor_float_32_execution(allow): Args: allow: whether to allow TensorFloat-32 execution """ - _pywrap_tf32_execution.allow(allow) + _pywrap_tf32_execution.allow(allowed) @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): From ed41bb08250cad9f3ddbb6c7fb83e1216ee06031 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 8 Jun 2020 17:57:27 +0100 Subject: [PATCH 0100/1390] Fix for CI failure. 
Change-Id: I66a5b5ab559207071ea62619e9e612fda9a73202 --- tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt | 4 ++++ .../tools/api/golden/v1/tensorflow.lite.constants.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt index c3199b24d98..9538fe382a0 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt @@ -1,6 +1,10 @@ path: "tensorflow.lite.OpsSet" tf_class { is_instance: "" + member { + name: "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + mtype: "" + } member { name: "SELECT_TF_OPS" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt index 27c227dac64..7f62da6662a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "GRAPHVIZ_DOT" mtype: "" } + member { + name: "INT16" + mtype: "" + } member { name: "INT32" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt index c3199b24d98..9538fe382a0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt @@ -1,6 +1,10 @@ path: "tensorflow.lite.OpsSet" tf_class { is_instance: "" + member { + name: "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + mtype: "" + } member { name: "SELECT_TF_OPS" mtype: "" From ad4323e93479caf12a11b6e089174be2a7ec7462 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 7 May 2020 17:34:50 +0000 Subject: [PATCH 0101/1390] Update tf.map_fn to specify that at least one tensor must be present Signed-off-by: Yong Tang Typo fix Signed-off-by: Yong Tang --- tensorflow/python/ops/map_fn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 6f59bcf5599..287070af07c 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -267,7 +267,7 @@ def map_fn(fn, elems: A tensor or (possibly nested) sequence of tensors, each of which will be unstacked along their first dimension. `fn` will be applied to the nested sequence of the resulting slices. `elems` may include ragged and - sparse tensors. + sparse tensors. `elems` must consist of at least one tensor. dtype: Deprecated: Equivalent to `fn_output_signature`. parallel_iterations: (optional) The number of iterations allowed to run in parallel. When graph building, the default value is 10. While executing @@ -296,7 +296,7 @@ def map_fn(fn, TypeError: if `fn` is not callable or the structure of the output of `fn` and `fn_output_signature` do not match. ValueError: if the lengths of the output of `fn` and `fn_output_signature` - do not match. + do not match, or if the `elems` does not contain any tensor. Examples: From dbc7faeecdbdc223c67b03284ba5dc7d25668d3c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 8 Jun 2020 20:20:22 +0100 Subject: [PATCH 0102/1390] Addressed reviewer's comment. 
Change-Id: I5bda332514d8070731b807b750ee7a423d6b4d78 --- tensorflow/lite/python/convert.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index b1095a469f6..52edb700195 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -94,10 +94,15 @@ class OpsSet(enum.Enum): # quantized implementations. TFLITE_BUILTINS_INT8 = "TFLITE_BUILTINS_INT8" - # Convert model using only TensorFlow Lite operations with quantized int8 weights - # and int16 activations. + # Convert model using only TensorFlow Lite operations with quantized int8 weights, + # int16 activations and int64 bias. # Specifying this will throw an error for operations that do not yet have # quantized implementations. + # This quantization mode should be used in models for super-resolution, + # audio signal processing or image de-noising. It improves accuracy + # significantly, but only slightly increases the model size. + # WARNING: These ops are currently experimental and have not yet been finalized. + # They are only compatible with CPU execution, and have not been optimized for production. EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): From 13d3b343498d499f87230f2e596b738be5cf1109 Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Tue, 9 Jun 2020 09:28:23 +0800 Subject: [PATCH 0103/1390] modify example main API usage --- .../himax_we1_evb/main_functions.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc index 552b52c9c51..f0c7a405974 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc @@ -72,21 +72,20 @@ void setup() { // incur some penalty in code space for op implementations that are not // needed by this graph. // - // tflite::ops::micro::AllOpsResolver resolver; + // tflite::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<12> micro_op_resolver; - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), - 1, 3); + static tflite::MicroMutableOpResolver<5> micro_op_resolver; + micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D(), 1, 3); + tflite::ops::micro::Register_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D(), - 1, 2); + tflite::ops::micro::Register_AVERAGE_POOL_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, tflite::ops::micro::Register_RESHAPE()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX(), 1, 3); + tflite::ops::micro::Register_SOFTMAX()); // Build an interpreter to run the model with. 
// NOLINTNEXTLINE(runtime-global-variables) From 4644b47db199ccda856c81ca88d6a3c58c41890f Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 9 Jun 2020 05:13:35 +0000 Subject: [PATCH 0104/1390] Remove duplicate macros --- .../core/kernels/non_max_suppression_op.cu.cc | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tensorflow/core/kernels/non_max_suppression_op.cu.cc b/tensorflow/core/kernels/non_max_suppression_op.cu.cc index 53559b20419..c2cae2ab212 100644 --- a/tensorflow/core/kernels/non_max_suppression_op.cu.cc +++ b/tensorflow/core/kernels/non_max_suppression_op.cu.cc @@ -28,25 +28,6 @@ limitations under the License. #include "tensorflow/core/util/gpu_launch_config.h" #include "tensorflow/stream_executor/stream_executor.h" -#define TF_RETURN_IF_CUDA_ERROR(result) \ - do { \ - cudaError_t error(result); \ - if (!SE_PREDICT_TRUE(error == cudaSuccess)) { \ - return errors::Internal("Cuda call failed with ", \ - cudaGetErrorString(error)); \ - } \ - } while (0) - -#define TF_OP_REQUIRES_CUDA_SUCCESS(context, result) \ - do { \ - cudaError_t error(result); \ - if (!SE_PREDICT_TRUE(error == cudaSuccess)) { \ - context->SetStatus(errors::Internal("Cuda call failed with", \ - cudaGetErrorString(error))); \ - return; \ - } \ - } while (0) - struct __align__(16) Box { float x1, y1, x2, y2; }; From 8681b1bf543a33c62a9bec29625594d1fd9b921a Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 9 Jun 2020 15:11:45 +0000 Subject: [PATCH 0105/1390] Adding 3d Pooling using latest MIOpen API --- tensorflow/cc/gradients/nn_grad_test.cc | 8 - tensorflow/core/kernels/cudnn_pooling_gpu.cc | 31 ++ tensorflow/python/eager/backprop_test.py | 3 - tensorflow/python/kernel_tests/BUILD | 1 - tensorflow/python/kernel_tests/pool_test.py | 4 - .../ops/parallel_for/control_flow_ops_test.py | 2 - tensorflow/stream_executor/rocm/rocm_dnn.cc | 443 +++++++++++------- tensorflow/stream_executor/rocm/rocm_dnn.h | 68 +++ 8 files changed, 360 insertions(+), 200 deletions(-) diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index 942ec08f451..f5a09e09dcd 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -259,9 +259,6 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { RunTest(x, x_init_value, y, y_shape); } -// TODO(rocm): -// Re-enable this test once 3D pooling is supported on ROCm platform -#ifndef TENSORFLOW_USE_ROCM TEST_F(NNGradTest, MaxPool3DGradHelper) { TensorShape x_shape({1, 3, 3, 3, 1}); TensorShape y_shape({1, 1, 1, 1, 1}); @@ -274,7 +271,6 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } -#endif TEST_F(NNGradTest, AvgPoolGradHelper) { TensorShape x_shape({1, 2, 2, 1}); @@ -287,9 +283,6 @@ TEST_F(NNGradTest, AvgPoolGradHelper) { RunTest(x, x_shape, y, y_shape); } -// TODO(rocm): -// Re-enable this test once 3D pooling is supported on ROCm platform -#ifndef TENSORFLOW_USE_ROCM TEST_F(NNGradTest, AvgPool3DGradHelper) { TensorShape x_shape({1, 3, 3, 3, 1}); TensorShape y_shape({1, 1, 1, 1, 1}); @@ -300,7 +293,6 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) { auto y = AvgPool3D(scope_, x, ksize, strides, "SAME"); RunTest(x, x_shape, y, y_shape); } -#endif TEST_F(NNGradTest, LRN) { TensorShape x_shape({1, 1, 2, 1}); diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc index eb7d16e3074..60088133d5d 100644 --- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc +++ 
b/tensorflow/core/kernels/cudnn_pooling_gpu.cc @@ -98,10 +98,25 @@ void DnnPooling3dOp::Compute(OpKernelContext* context, auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); +#if TENSORFLOW_USE_ROCM + static int64 PoolingScratchSize = GetDnnWorkspaceLimit( + // default value is in bytes despite the name of the environment variable + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB + ); + + DnnScratchAllocator scratch_allocator(PoolingScratchSize, context); + bool status = + stream + ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc, + &output_data, &scratch_allocator) + .ok(); +#else bool status = stream ->ThenPoolForward(pooling_desc, input_desc, input_data, output_desc, &output_data) .ok(); +#endif + OP_REQUIRES(context, status, errors::Internal("dnn PoolForward launch failed")); @@ -225,12 +240,28 @@ void DnnPooling3dGradOp::Compute( auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); +#if TENSORFLOW_USE_ROCM + static int64 PoolingScratchSize = GetDnnWorkspaceLimit( + // default value is in bytes despite the name of the environment variable + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB + ); + + DnnScratchAllocator scratch_allocator(PoolingScratchSize, context); + bool status = stream + ->ThenPoolBackward(pooling_desc, orig_input_desc, + orig_input_data, orig_output_desc, + orig_output_data, output_backprop_data, + &input_backprop_data, &scratch_allocator) + .ok(); +#else bool status = stream ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data, orig_output_desc, orig_output_data, output_backprop_data, &input_backprop_data) .ok(); +#endif + OP_REQUIRES(context, status, errors::Internal("dnn PoolBackward launch failed")); diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index b28aaa3a626..bb909433fa7 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -1455,9 +1455,6 @@ class BackpropTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testMaxPooling3DGradient(self): - if test.is_built_with_rocm(): - self.skipTest('Pooling with 3D tensors is not supported in ROCm') - def forward(a): r = max_pooling3d(a, pool_size=pool_size, strides=strides, padding='SAME') return r diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index a04c874c9d6..ae4e0244357 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -2995,7 +2995,6 @@ cuda_py_test( name = "pooling_ops_3d_test", size = "medium", srcs = ["pooling_ops_3d_test.py"], - tags = ["no_rocm"], deps = [ "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py index 0f0eaa25402..01680f5c1f0 100644 --- a/tensorflow/python/kernel_tests/pool_test.py +++ b/tensorflow/python/kernel_tests/pool_test.py @@ -219,8 +219,6 @@ class PoolingTest(test.TestCase): strides=strides) def testPool3D(self): - if test.is_built_with_rocm(): - self.skipTest("Pooling with 3D tensors is not supported in ROCm") with self.session(use_gpu=test.is_gpu_available()): for padding in ["SAME", "VALID"]: for pooling_type in ["MAX", "AVG"]: @@ -363,8 +361,6 @@ class PoolingTest(test.TestCase): @test_util.run_deprecated_v1 def 
testGradient3D(self): - if test.is_built_with_rocm(): - self.skipTest("Pooling with 3D tensors is not supported in ROCm") with self.session(use_gpu=test.is_gpu_available()): for padding in ["SAME", "VALID"]: for pooling_type in ["AVG", "MAX"]: diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 243471553d9..a6546fee742 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -488,8 +488,6 @@ class NNTest(PForTestCase): self._test_loop_fn(loop_fn, 3) def test_max_pool3d(self): - if test.is_built_with_rocm(): - self.skipTest("Pooling with 3D tensors is not supported in ROCm") with backprop.GradientTape(persistent=True) as g: x = random_ops.random_uniform([3, 3, 2, 12, 12, 3]) g.watch(x) diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc index e0ead6d57e8..9c09784b3f1 100644 --- a/tensorflow/stream_executor/rocm/rocm_dnn.cc +++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc @@ -263,7 +263,8 @@ namespace wrap { __macro(miopenFindConvolutionForwardAlgorithm) \ __macro(miopenCreateTensorDescriptor) \ __macro(miopenDestroyTensorDescriptor) \ - __macro(miopenSet2dPoolingDescriptor) \ + __macro(miopenSetNdPoolingDescriptor) \ + __macro(miopenSetPoolingIndexType) \ __macro(miopenSetLRNDescriptor) \ __macro(miopenLRNGetWorkSpaceSize) \ __macro(miopenCreateConvolutionDescriptor) \ @@ -290,7 +291,7 @@ namespace wrap { __macro(miopenSetTensorDescriptor) \ __macro(miopenGetTensorDescriptorSize) \ __macro(miopenPoolingForward) \ - __macro(miopenPoolingGetWorkSpaceSize) \ + __macro(miopenPoolingGetWorkSpaceSizeV2 \ __macro(miopenPoolingBackward) \ __macro(miopenLRNForward) \ __macro(miopenLRNBackward) \ @@ -605,6 +606,11 @@ MIOpenSupport::MIOpenSupport(GpuExecutor* parent) : parent_(parent) { // swich to Find Mode if env var TF_ROCM_USE_IMMEDIATE_MODE is set tensorflow::ReadBoolFromEnvVar("TF_ROCM_USE_IMMEDIATE_MODE", false, &use_immediate_mode_); + + bool enable_pooling_cache = false; + tensorflow::ReadBoolFromEnvVar("TF_ROCM_BW_POOL_CACHE", false, + &enable_pooling_cache); + if (enable_pooling_cache) m_pooling_cache_allowed = true; } port::Status MIOpenSupport::Init() { @@ -844,17 +850,19 @@ class ScopedPoolingDescriptor { std::transform(shape64.cbegin(), shape64.cend(), shape.begin(), &CheckedNarrowing); - if (nd != 2) { - LOG(FATAL) << "miopen requires pooling dimensions be 2" - << ToString(status); - } - - status = wrap::miopenSet2dPoolingDescriptor( + status = wrap::miopenSetNdPoolingDescriptor( handle_, (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum ? miopenPoolingMax : miopenPoolingAverage), - shape[0], shape[1], padding[0], padding[1], strides[0], strides[1]); + nd, shape.data(), padding.data(), strides.data()); + + // Note: The index type has to be uint32 type for now because MIOpen + // API assumes all input indexes to be the same type. 
Since a tensor + // descriptor can only use int32 type, the index type here need to be + // aligned with the tensor index type of the (input) tensor descritptor + status = wrap::miopenSetPoolingIndexType(handle_, miopenIndexUint32); + if (status != miopenStatusSuccess) { LOG(FATAL) << "could not set miopen pooling descriptor: " << ToString(status); @@ -4009,10 +4017,94 @@ bool MIOpenSupport::DoPoolForward( const DeviceMemory& input_data, const dnn::BatchDescriptor& output_dimensions, DeviceMemory* output_data, ScratchAllocator* workspace_allocator) { - LOG(ERROR) << "miopen does not support pooling for dobule type yet"; + LOG(ERROR) << "miopen does not support pooling for double type yet"; return false; } +bool PoolingWorkspaceDescriptor::IsSame( + const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type) { + return dtype == _type && + input_dims == + input_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX) && + output_dims == + output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX) && + op.mode() == pooling_dimensions.mode() && + op.window() == pooling_dimensions.window() && + op.padding() == pooling_dimensions.padding() && + op.strides() == pooling_dimensions.strides(); +} + +bool PoolingWorkspaceCache::find( + const void* p, const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type, + PoolingWorkspaceDescriptor*& pdesc) { + pdesc = 0; + auto it = cache.find(p); + if (it == cache.end()) { + return false; + } + if (!it->second.IsSame(input_dimensions, output_dimensions, + pooling_dimensions, _type)) { + return false; + } + pdesc = &it->second; + return true; +} + +void PoolingWorkspaceCache::insert( + const void* p, const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type, + std::unique_ptr>& workspace, size_t wsp_size, + hipStream_t hip_stream) { + PoolingWorkspaceDescriptor* desc = 0; + auto it = cache.find(p); + if (it != cache.end()) { + // replacing an entry with the same pointer but different attributes + // (if everything matches, the caller is expected to reuse the entry) + desc = &it->second; + hipStreamSynchronize(hip_stream); + memory_used -= desc->workspace_size; + } else { + cache[p] = PoolingWorkspaceDescriptor(); + desc = &cache[p]; + } + desc->input_dims = input_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX); + desc->output_dims = + output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX); + desc->op = pooling_dimensions; + desc->dtype = _type; + desc->timestamp = timestamp; + timestamp++; + desc->workspace = std::move(workspace); + desc->workspace_size = wsp_size; + memory_used += wsp_size; + trim(hip_stream); +} + +void PoolingWorkspaceCache::trim(hipStream_t hip_stream) { + if (memory_used < memory_budget && cache.size() < trim_size) return; + bool must_sync = true; + while (true) { + int new_size = cache.size() - (cache.size() >> 2); + std::vector old_entries; + for (auto& x : cache) + if (x.second.timestamp + new_size < timestamp) + old_entries.push_back(x.first); + if (old_entries.empty()) break; + if (must_sync) hipStreamSynchronize(hip_stream); + must_sync = true; + for (auto x : old_entries) { + memory_used -= cache[x].workspace_size; + cache.erase(x); + } + if (memory_used < memory_budget || cache.size() < 10) break; + } +} + bool 
MIOpenSupport::DoPoolForward( Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, const dnn::BatchDescriptor& input_dimensions, @@ -4020,7 +4112,6 @@ bool MIOpenSupport::DoPoolForward( const dnn::BatchDescriptor& output_dimensions, DeviceMemory* output_data, ScratchAllocator* workspace_allocator) { auto miopen = miopen_->GetHandle(parent_, stream); - // Alpha is the scaling factor for input. float alpha = 1.0; // Beta is the scaling factor for output. @@ -4030,10 +4121,48 @@ bool MIOpenSupport::DoPoolForward( ScopedTensorDescriptor dest_desc{output_dimensions, miopenFloat}; ScopedPoolingDescriptor pooling_desc{pooling_dimensions}; + bool do_backward = false; + uint8* workspace = 0; + size_t workspace_size = 0; + std::unique_ptr> wsp_mem; + if (m_pooling_cache_enabled) { + do_backward = true; + auto status = wrap::miopenPoolingGetWorkSpaceSizeV2( + pooling_desc.handle(), dest_desc.handle(), &workspace_size); + if (status != miopenStatusSuccess) { + LOG(ERROR) + << "failed to obtain workspace size for backward pooling on stream: " + << ToString(status); + return false; + } + if (workspace_size != 0) { + PoolingWorkspaceDescriptor* pdesc = 0; + bool cache_hit = + m_pooling_cache_allowed && + m_pooling_cache.find(input_data.opaque(), input_dimensions, + output_dimensions, pooling_dimensions, + miopenFloat, pdesc); + if (cache_hit) { + // reusing the same buffer + workspace = reinterpret_cast( + pdesc->workspace->mutable_device_memory()->opaque()); + } else { + wsp_mem = stream->AllocateTemporaryArray(workspace_size) + .ConsumeValueOrDie(); + workspace = reinterpret_cast( + wsp_mem->mutable_device_memory()->opaque()); + m_pooling_cache.insert(input_data.opaque(), input_dimensions, + output_dimensions, pooling_dimensions, + miopenFloat, wsp_mem, workspace_size, + AsGpuStreamValue(stream)); + } + } + } + auto status = wrap::miopenPoolingForward( miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque(), - false, nullptr, 0); + do_backward, workspace, workspace_size); if (status != miopenStatusSuccess) { LOG(ERROR) << "failed to enqueue forward pooling on stream: " << ToString(status); @@ -4072,6 +4201,118 @@ bool MIOpenSupport::DoPoolForward( return true; } +template +bool MIOpenSupport::DoPoolBackwardImpl( + Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory& output_data, const DeviceMemory& input_diff_data, + DeviceMemory* output_diff_data, ScratchAllocator* workspace_allocator) { + auto miopen = miopen_->GetHandle(parent_, stream); + if (m_pooling_cache_allowed) m_pooling_cache_enabled = true; + // Alpha is the scaling factor for input. + float alpha = 1.0; + // Beta is the scaling factor for output. + float beta = 0.0; + + auto type = + std::is_same::value + ? miopenFloat + : (std::is_same::value ? 
miopenHalf + : (miopenDataType_t)-1); + + ScopedTensorDescriptor src_desc{input_dimensions, type}; + ScopedTensorDescriptor dest_desc{output_dimensions, type}; + ScopedPoolingDescriptor pooling_desc{pooling_dimensions}; + + uint8* workspace_ptr = 0; + DeviceMemory workspace; + PoolingWorkspaceDescriptor* pdesc = 0; + + size_t workspace_size_in_bytes = 0; + auto status = wrap::miopenPoolingGetWorkSpaceSizeV2( + pooling_desc.handle(), dest_desc.handle(), &workspace_size_in_bytes); + if (status != miopenStatusSuccess) { + LOG(ERROR) + << "failed to obtain workspace size for backward pooling on stream: " + << ToString(status); + return false; + } + + // Allocate the workspace. + if (workspace_size_in_bytes > 0) { + bool cache_hit = m_pooling_cache_allowed && + m_pooling_cache.find(input_data.opaque(), input_dimensions, + output_dimensions, pooling_dimensions, + type, pdesc); + if (cache_hit) { + assert(pdesc != 0); + workspace_ptr = reinterpret_cast( + pdesc->workspace->mutable_device_memory()->opaque()); + VLOG(1) << "Pooling cache hit"; + } else { + VLOG(1) << "Pooling cache miss"; + assert(workspace_allocator); + auto allocated = + workspace_allocator->AllocateBytes(workspace_size_in_bytes); + if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) { + LOG(ERROR) << "Failed to allocate backward pooling workspace"; + return false; + } + DeviceMemory dest2; // duplicated dest from forward: + int64 dest2_size = 0; + + // miopen requires the strides and dims to be ordered as BDYX. + std::vector dims64 = + output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX); + // miopen does not use strides and must have 4D tensor. + // std::vector dims(pooling_dimensions.ndims() + 2); + + dest2_size = sizeof(T); + for (auto& x : dims64) dest2_size *= x; + + if (dest2_size > 0) { + assert(workspace_allocator); + auto allocated = workspace_allocator->AllocateBytes(dest2_size); + if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) { + LOG(ERROR) << "Failed to allocate backward pooling workspace"; + return false; + } + } else { + LOG(ERROR) << "Failed to calculate tensor size to chain forward and " + "backward pooling"; + } + + status = wrap::miopenPoolingForward( + miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(), + input_data.opaque(), &beta, dest_desc.handle(), dest2.opaque(), true, + workspace.opaque(), workspace_size_in_bytes); + + if (status != miopenStatusSuccess) { + LOG(ERROR) + << "failed to enqueue forward pooling (before backward) on stream: " + << ToString(status); + return false; + } + workspace_ptr = reinterpret_cast(workspace.opaque()); + } + } + status = wrap::miopenPoolingBackward( + miopen.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(), + output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(), + src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(), + output_diff_data->opaque(), workspace_ptr); + + if (status != miopenStatusSuccess) { + LOG(ERROR) << "failed to enqueue backward pooling on stream: " + << ToString(status); + return false; + } + + return true; +} + bool MIOpenSupport::DoPoolBackward( Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions, const dnn::BatchDescriptor& input_dimensions, @@ -4094,91 +4335,10 @@ bool MIOpenSupport::DoPoolBackward( const DeviceMemory& input_diff_data, DeviceMemory* output_diff_data, ScratchAllocator* workspace_allocator) { - auto miopen = miopen_->GetHandle(parent_, stream); - - // Alpha is the scaling factor for input. 
- float alpha = 1.0; - // Beta is the scaling factor for output. - float beta = 0.0; - - ScopedTensorDescriptor src_desc{input_dimensions, miopenFloat}; - ScopedTensorDescriptor dest_desc{output_dimensions, miopenFloat}; - ScopedPoolingDescriptor pooling_desc{pooling_dimensions}; - - DeviceMemory workspace; - size_t workspace_size_in_bytes = 0; - auto status = wrap::miopenPoolingGetWorkSpaceSize(dest_desc.handle(), - &workspace_size_in_bytes); - - if (status != miopenStatusSuccess) { - LOG(ERROR) - << "failed to obtain workspace size for backward pooling on stream: " - << ToString(status); - return false; - } - - // Allocate the workspace. - if (workspace_size_in_bytes > 0) { - assert(workspace_allocator); - auto allocated = - workspace_allocator->AllocateBytes(workspace_size_in_bytes); - if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) { - LOG(ERROR) << "Failed to allocate backward pooling workspace"; - return false; - } - } - - DeviceMemory dest2; // duplicated dest from forward: - int dest2_size = 0; - - // miopen requires the strides and dims to be ordered as BDYX. - std::vector dims64 = - output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX); - - // miopen does not use strides and must have 4D tensor. - std::vector dims(4); - - std::transform(dims64.cbegin(), dims64.cend(), dims.begin(), - &CheckedNarrowing); - - dest2_size = dims[0] * dims[1] * dims[2] * dims[3] * sizeof(float); - - if (dest2_size > 0) { - assert(workspace_allocator); - auto allocated = workspace_allocator->AllocateBytes(dest2_size); - if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) { - LOG(ERROR) << "Failed to allocate backward pooling workspace"; - return false; - } - } else { - LOG(ERROR) << "Failed to calculate tensor size to chain forward and " - "backward pooling"; - } - - status = wrap::miopenPoolingForward( - miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(), - input_data.opaque(), &beta, dest_desc.handle(), dest2.opaque(), true, - workspace.opaque(), workspace_size_in_bytes); - - if (status != miopenStatusSuccess) { - LOG(ERROR) - << "failed to enqueue forward pooling (before backward) on stream: " - << ToString(status); - return false; - } - - status = wrap::miopenPoolingBackward( - miopen.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(), - dest2.opaque(), dest_desc.handle(), input_diff_data.opaque(), - src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(), - output_diff_data->opaque(), workspace.opaque()); - - if (status != miopenStatusSuccess) { - LOG(ERROR) << "failed to enqueue backward pooling on stream: " - << ToString(status); - return false; - } - return true; + return DoPoolBackwardImpl(stream, pooling_dimensions, input_dimensions, + input_data, output_dimensions, output_data, + input_diff_data, output_diff_data, + workspace_allocator); } bool MIOpenSupport::DoPoolBackward( @@ -4190,91 +4350,10 @@ bool MIOpenSupport::DoPoolBackward( const DeviceMemory& input_diff_data, DeviceMemory* output_diff_data, ScratchAllocator* workspace_allocator) { - auto miopen = miopen_->GetHandle(parent_, stream); - - // Alpha is the scaling factor for input. - float alpha = 1.0; - // Beta is the scaling factor for output. 
- float beta = 0.0; - - ScopedTensorDescriptor src_desc{input_dimensions, miopenHalf}; - ScopedTensorDescriptor dest_desc{output_dimensions, miopenHalf}; - ScopedPoolingDescriptor pooling_desc{pooling_dimensions}; - - DeviceMemory workspace; - size_t workspace_size_in_bytes = 0; - auto status = wrap::miopenPoolingGetWorkSpaceSize(dest_desc.handle(), - &workspace_size_in_bytes); - - if (status != miopenStatusSuccess) { - LOG(ERROR) - << "failed to obtain workspace size for backward pooling on stream: " - << ToString(status); - return false; - } - - // Allocate the workspace. - if (workspace_size_in_bytes > 0) { - assert(workspace_allocator); - auto allocated = - workspace_allocator->AllocateBytes(workspace_size_in_bytes); - if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) { - LOG(ERROR) << "Failed to allocate backward pooling workspace"; - return false; - } - } - - DeviceMemory dest2; // duplicated dest from forward: - int dest2_size = 0; - - // miopen requires the strides and dims to be ordered as BDYX. - std::vector dims64 = - output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX); - - // miopen does not use strides and must have 4D tensor. - std::vector dims(4); - - std::transform(dims64.cbegin(), dims64.cend(), dims.begin(), - &CheckedNarrowing); - - dest2_size = dims[0] * dims[1] * dims[2] * dims[3] * sizeof(float); - - if (dest2_size > 0) { - assert(workspace_allocator); - auto allocated = workspace_allocator->AllocateBytes(dest2_size); - if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) { - LOG(ERROR) << "Failed to allocate backward pooling workspace"; - return false; - } - } else { - LOG(ERROR) << "Failed to calculate tensor size to chain forward and " - "backward pooling"; - } - - status = wrap::miopenPoolingForward( - miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(), - input_data.opaque(), &beta, dest_desc.handle(), dest2.opaque(), true, - workspace.opaque(), workspace_size_in_bytes); - - if (status != miopenStatusSuccess) { - LOG(ERROR) - << "failed to enqueue forward pooling (before backward) on stream: " - << ToString(status); - return false; - } - - status = wrap::miopenPoolingBackward( - miopen.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(), - dest2.opaque(), dest_desc.handle(), input_diff_data.opaque(), - src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(), - output_diff_data->opaque(), workspace.opaque()); - - if (status != miopenStatusSuccess) { - LOG(ERROR) << "failed to enqueue backward pooling on stream: " - << ToString(status); - return false; - } - return true; + return DoPoolBackwardImpl(stream, pooling_dimensions, input_dimensions, + input_data, output_dimensions, output_data, + input_diff_data, output_diff_data, + workspace_allocator); } bool MIOpenSupport::DoNormalizeWithDimensions( diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.h b/tensorflow/stream_executor/rocm/rocm_dnn.h index 40e156b5f74..4f568702d96 100644 --- a/tensorflow/stream_executor/rocm/rocm_dnn.h +++ b/tensorflow/stream_executor/rocm/rocm_dnn.h @@ -20,6 +20,7 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_ #include "absl/synchronization/mutex.h" +#include "rocm/include/miopen/miopen.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/lib/status.h" @@ -38,6 +39,39 @@ class MIOpenCTCLossDescriptor; // Opaque and unique identifier for the MIOpen plugin. 
extern const PluginId kMIOpenPlugin; +struct PoolingWorkspaceDescriptor { + std::vector input_dims; + std::vector output_dims; + dnn::PoolingDescriptor op; + int dtype; + uint64_t timestamp; + std::unique_ptr> workspace; + size_t workspace_size; + bool IsSame(const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type); +}; + +struct PoolingWorkspaceCache { + std::map cache; + const int trim_size = 1000; + const uint64_t memory_budget = 2e7; + uint64_t timestamp = 0; + uint64_t memory_used = 0; + bool find(const void* p, const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type, + PoolingWorkspaceDescriptor*& pdesc); + void insert(const void* p, const dnn::BatchDescriptor& input_dimensions, + const dnn::BatchDescriptor& output_dimensions, + const dnn::PoolingDescriptor& pooling_dimensions, int _type, + std::unique_ptr>& workspace, + size_t wsp_size, hipStream_t hip_stream); + + private: + void trim(hipStream_t hip_stream); +}; + // miopen-library based DNN support. For details on overridden interface // functions, see dnn.h. class MIOpenSupport : public dnn::DnnSupport { @@ -664,6 +698,10 @@ class MIOpenSupport : public dnn::DnnSupport { // Provide access to the MIOpen handle. std::unique_ptr miopen_; + PoolingWorkspaceCache m_pooling_cache; + bool m_pooling_cache_allowed = false; + bool m_pooling_cache_enabled = false; + template bool DoBatchNormalizationForwardImpl( Stream* stream, dnn::DataType input_data_type, @@ -847,6 +885,36 @@ class MIOpenSupport : public dnn::DnnSupport { ScratchAllocator* scratch_allocator, std::vector* out_algorithms); + port::Status DoCtcLossImpl( + Stream* stream, const MIOpenRnnStateTensorDescriptor& probs_desc, + const DeviceMemoryBase probs_data, absl::Span labels_data, + absl::Span labels_lengths_data, + absl::Span input_lengths_data, DeviceMemoryBase costs_data, + const MIOpenRnnStateTensorDescriptor& grads_desc, + DeviceMemoryBase grads_data, const MIOpenCTCLossDescriptor& ctc_loss_desc, + DeviceMemory scratch_memory); + + port::Status DoPrepareForCtcLoss( + Stream* stream, dnn::DataType element_type, + const dnn::RnnStateTensorDescriptor& probs_desc, + const dnn::RnnStateTensorDescriptor& grads_desc, + absl::Span labels_data, + absl::Span labels_lengths_data, + absl::Span input_lengths_data, + ScratchAllocator* scratch_allocator, + DeviceMemory* scratch_memory) override; + + template + bool DoPoolBackwardImpl(Stream* stream, + const dnn::PoolingDescriptor& pooling_dimensions, + const dnn::BatchDescriptor& input_dimensions, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_dimensions, + const DeviceMemory& output_data, + const DeviceMemory& input_diff_data, + DeviceMemory* output_diff_data, + ScratchAllocator* workspace_allocator = nullptr); + SE_DISALLOW_COPY_AND_ASSIGN(MIOpenSupport); }; From 507c7549317221bcf5b418a66fd0212cd4a7443b Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 17:47:16 +0100 Subject: [PATCH 0106/1390] Fix for pylint errors. 
Change-Id: Idd96d7a41fd459c86ab0f6fbb63e5d543509145d --- tensorflow/lite/python/convert.py | 3 ++- tensorflow/lite/python/lite.py | 27 ++++++++++--------- tensorflow/lite/python/lite_test.py | 10 ++++--- tensorflow/lite/python/optimize/calibrator.py | 3 ++- .../lite/python/optimize/calibrator_test.py | 3 ++- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 52edb700195..68e23634b2e 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -103,7 +103,8 @@ class OpsSet(enum.Enum): # significantly, but only slightly increases the model size. # WARNING: These ops are currently experimental and have not yet been finalized. # They are only compatible with CPU execution, and have not been optimized for production. - EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = \ + "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): return self.value diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 26c6f0855af..bed48860b00 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -251,7 +251,8 @@ class QuantizationMode(object): self.post_training_fp16()) def activations_type(self): - return constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + return constants.INT16 if self._is_int16x8_target_required() \ + else constants.INT8 def converter_flags(self, inference_ty=None, inference_input_ty=None): """Flags to the converter.""" @@ -262,7 +263,8 @@ class QuantizationMode(object): if self.training_time_int8_allow_float(): return { - "inference_type": inference_ty if inference_ty else self.activations_type(), + "inference_type": inference_ty if inference_ty else \ + self.activations_type(), "inference_input_type": inference_input_ty if inference_input_ty else constants.FLOAT, "post_training_quantize": False, # disable dynamic range quantization @@ -359,15 +361,15 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) def _is_allow_float(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) def _any_optimization_enabled(self): return bool( @@ -441,7 +443,8 @@ class TFLiteConverterBase(object): return _get_grappler_config(optimizers) def _calibrate_quantize_model(self, result, inference_input_type, - inference_output_type, activations_type, allow_float): + inference_output_type, activations_type, + allow_float): """Calibrate and quantize the model.""" if not isinstance(self.representative_dataset, RepresentativeDataset): self.representative_dataset = RepresentativeDataset( @@ -458,8 +461,8 @@ class TFLiteConverterBase(object): return _mlir_quantize(calibrated) else: return calibrate_quantize.calibrate_and_quantize( - self.representative_dataset.input_gen, inference_input_type, - inference_output_type, allow_float, activations_type) + self.representative_dataset.input_gen, inference_input_type, + inference_output_type, allow_float, 
activations_type) def _is_unknown_shapes_allowed(self): # Unknown dimensions are only allowed with the new converter. @@ -1992,7 +1995,7 @@ class TocoConverter(object): @classmethod @_deprecation.deprecated( - None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") + None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") def from_keras_model_file(cls, model_file, input_arrays=None, diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 044b1211e17..cae49cb147f 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -882,11 +882,15 @@ class FromSessionTest(TestModels, parameterized.TestCase): @parameterized.named_parameters( # Quantize model to Int8: with enable mlir - ('UseTfliteBuiltinsIntEnableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + ('UseTfliteBuiltinsIntEnableMLIR', + [lite.OpsSet.TFLITE_BUILTINS_INT8], True), # Quantize model to Int8: with disable mlir - ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + ('UseTfliteBuiltinsIntDisableMLIR', + [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir - ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + ('UseTfliteBuiltinsInt16DisableMLIR', + [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index 90c43fcddfa..2b08ec690ff 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -78,7 +78,8 @@ class Calibrator(object): computation, useful when targeting an integer-only backend. If False, an error will be thrown if an operation cannot be quantized, otherwise the model will fallback to float ops. - activations_type: A tf.dtype representing the desired type for activations. + activations_type: A tf.dtype representing the desired type for + activations. resize_input: A boolean. True if the shape of the sample data is different from the input. """ diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index f778c8a555d..d79d76b09ed 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -96,7 +96,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): ('UseActivationTypeInt8 - EnableMlirQuantizer', constants.INT8), # Activation type Int16 ('UseActivationTypeInt16 - DisableEnableMlirQuantizer', constants.INT16)) - def test_calibration_with_quantization_multiple_inputs(self, activations_type): + def test_calibration_with_quantization_multiple_inputs(self, + activations_type): # Load multi add model from test data. # This model has 4 inputs of size (1, 8, 8, 3). model_path = resource_loader.get_path_to_datafile( From 3fc256a0dc31eae6711d2f7680493925f0fa4091 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 17:51:37 +0100 Subject: [PATCH 0107/1390] Fix for pylint. 
Change-Id: If2674380c25eb8973e73a407b75660088098e6da --- tensorflow/lite/python/lite_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index cae49cb147f..e6661c82894 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -883,14 +883,15 @@ class FromSessionTest(TestModels, parameterized.TestCase): @parameterized.named_parameters( # Quantize model to Int8: with enable mlir ('UseTfliteBuiltinsIntEnableMLIR', - [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + [lite.OpsSet.TFLITE_BUILTINS_INT8], True), # Quantize model to Int8: with disable mlir ('UseTfliteBuiltinsIntDisableMLIR', - [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir ('UseTfliteBuiltinsInt16DisableMLIR', - [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], - False)) + [lite.OpsSet.\ + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From 15b97dd44a42ae1ab4cc4192a5f7c820ac90b9ac Mon Sep 17 00:00:00 2001 From: nammbash Date: Tue, 9 Jun 2020 11:25:45 -0700 Subject: [PATCH 0108/1390] CPU Feature Guard MKL brand needs to change to oneDNNchange to one dnn --- tensorflow/core/platform/cpu_feature_guard.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index c5a5c287283..a020d3fd70e 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -138,8 +138,16 @@ void InfoAboutUnusedCPUFeatures() { CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions); #endif // __FMA__ #endif // else of if defined(_MSC_VER) && !defined(__clang__) - if (!missing_instructions.empty()) { - LOG(INFO) << "This TensorFlow binary is optimized with Intel(R) MKL-DNN " + + string intel_library_official_name( + "Intel(R) oneAPI Deep Neural Network Library (oneDNN) "); +#ifndef INTEL_MKL + intel_library_official_name = "oneAPI Deep Neural Network Library (oneDNN) "; +#endif + + if (!missing_instructions.empty()) { + LOG(INFO) << "This TensorFlow binary is optimized with " + << intel_library_official_name << "to use the following CPU instructions in performance-" << "critical operations: " << missing_instructions << std::endl << "To enable them in other operations, rebuild TensorFlow " From 47674cac85b61f7a438c6970a6a7ca49946a2622 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 22:27:57 +0100 Subject: [PATCH 0109/1390] Fix for pylint Change-Id: If03f60a3eebc7aed61c10870c545fe6035bcb2a3 --- tensorflow/lite/python/lite.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index bed48860b00..06796ba820b 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -361,15 +361,15 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 
+ ])) def _is_allow_float(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) def _any_optimization_enabled(self): return bool( From 84afc268a77f543fe64ecb45832701278a9eb129 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Fri, 5 Jun 2020 11:47:09 +0100 Subject: [PATCH 0110/1390] Documentation on the new experimental option for 16x8. --- .../performance/post_training_quantization.md | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index af7d9dbf02d..c48a2820d2f 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -151,6 +151,49 @@ The disadvantages of float16 quantization are as follows: to float32 when run on the CPU. (Note that the GPU delegate will not perform this dequantization, since it can operate on float16 data.) +### Integer only: 16-bit activations with 8-bit weights (experimental) + +This is an experimental quantization scheme. It is similar to the "integer only" +scheme, but activations are quantized based on their range to 16-bits, weights are +quantized in 8-bit integer and bias is quantized into 64-bit integer. +This is referred to as 16x8 quantization further. + +The main advantage of this quantization is that it can improve accuracy +significantly, but only slightly increase model size. + +
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
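+# Calibration data is needed so the converter can determine activation ranges.
+# A minimal sketch: `representative_dataset_gen` is an assumed, user-defined
+# generator that yields sample input tensors for the model.
+converter.representative_dataset = representative_dataset_gen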
+converter.target_spec.supported_ops = [tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]
+tflite_quant_model = converter.convert()
+
+ +If 16x8 quantization is not supported for some operators in the model, +then the model still can be quantized, but unsupported operators kept in float. +The following option should be added to the target_spec to allow this. +
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8,
+tf.lite.OpsSet.TFLITE_BUILTINS]
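+# As in the previous snippet, calibration data is assumed to be provided by a
+# user-defined generator (hypothetical name: representative_dataset_gen).
+converter.representative_dataset = representative_dataset_gen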
+tflite_quant_model = converter.convert()
+
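+
+To verify what the converter actually produced (a minimal sketch, assuming the
+`tflite_quant_model` bytes from one of the snippets above and that `tf` is already
+imported), the tensor types in the quantized model can be listed with the TFLite
+interpreter:
+<pre>
+interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
+interpreter.allocate_tensors()
+# Print the set of tensor dtypes present in the converted model.
+print({t['dtype'] for t in interpreter.get_tensor_details()})
+</pre>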
+ +Examples of the use cases where accuracy improvements provided by this quantization scheme include: +* super-resolution, +* audio signal processing such as noise cancelling and beamforming, +* image de-noising, +* HDR reconstruction from a single image. + +The disadvantage of this quantization is: + +* Currently inference is noticeably slower than 8-bit full integer due to the lack of optimized kernel implementation. +* Currently it is incompatible with the existing hardware accelerated TFLite delegates. + +Note: This is an experimental feature. + ### Model accuracy Since weights are quantized post training, there could be an accuracy loss, From dcfc2175c79ee6c610770b597c8d637daa1649bc Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 22:35:57 +0100 Subject: [PATCH 0111/1390] Small change of comment per reviewer's note. Change-Id: I1233b95282befebfa0e6c06173f5e928aef60b22 --- tensorflow/lite/python/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 68e23634b2e..ec70f793f21 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -98,9 +98,9 @@ class OpsSet(enum.Enum): # int16 activations and int64 bias. # Specifying this will throw an error for operations that do not yet have # quantized implementations. - # This quantization mode should be used in models for super-resolution, + # This quantization mode may be used in models for super-resolution, # audio signal processing or image de-noising. It improves accuracy - # significantly, but only slightly increases the model size. + # significantly, but only slightly increases the model size. # WARNING: These ops are currently experimental and have not yet been finalized. # They are only compatible with CPU execution, and have not been optimized for production. 
EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = \ From aaf693d29bb5a52d6dfd8c106e45b2ff513e6d84 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 2 Jun 2020 08:07:53 -0700 Subject: [PATCH 0112/1390] Added code to build multiinstance/multinode container --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 16 ++++-- .../ci_build/linux/mkl/build-dev-container.sh | 26 ++++++++++ .../linux/mkl/install_openmpi_horovod.sh | 49 +++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100755 tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 45ccf67d707..3893f61d940 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -15,6 +15,11 @@ ARG CONFIG_BFLOAT16_BUILD="" ARG ENABLE_SECURE_BUILD ARG BAZEL_VERSION="" ARG ENABLE_DNNL1="" +ARG ENABLE_HOROVOD="" +ARG OPENMPI_VERSION="" +ARG OPENMPI_DOWNLOAD_URL="" + +ENV DEBIAN_FRONTEND=noninteractive # Upgrade Bazel version if argument is passed RUN if [ "${BAZEL_VERSION}" != "" ]; then \ @@ -45,9 +50,6 @@ RUN ${PYTHON} set-build-env.py -p ${TARGET_PLATFORM} -f /root/.mkl.bazelrc \ # Pull the compiler flags we just wrote into root user's .bazelrc file RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc -# Install futures>=0.17.1 for Python2.7 compatibility mode -RUN ${PIP} install future>=0.17.1 - RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ @@ -55,6 +57,14 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ rm -rf /root/.cache # Clean up Bazel cache when done. +#Install OpenMPI/Horovod +COPY install_openmpi_horovod.sh . 
+RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ + chmod +x install_openmpi_horovod.sh && \ + ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} install_openmpi_horovod.sh && \ + rm -rf install_openmpi_horovod.sh; \ + fi + # TensorBoard EXPOSE 6006 # IPython diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index eceef65aa38..da647153cdb 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -64,6 +64,9 @@ ENABLE_SECURE_BUILD=${ENABLE_SECURE_BUILD:-no} BAZEL_VERSION=${BAZEL_VERSION} BUILD_PY2_CONTAINERS=${BUILD_PY2_CONTAINERS:-no} ENABLE_DNNL1=${ENABLE_DNNL1:-no} +ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} +OPENMPI_VERSION=${OPENMPI_VERSION} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -82,6 +85,9 @@ debug "TMP_DIR=${TMP_DIR}" debug "BAZEL_VERSION=${BAZEL_VERSION}" debug "BUILD_PY2_CONTAINERS=${BUILD_PY2_CONTAINERS}" debug "ENABLE_DNNL1=${ENABLE_DNNL1}" +debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" +debug "OPENMPI_VERSION=${OPENMPI_VERSION}" +debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" function build_container() { @@ -131,6 +137,13 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg BAZEL_VERSION=${BAZEL_VERSION}") fi + # Add build arg for installing OpenMPI/Horovod + if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_HOROVOD=${ENABLE_HOROVOD}") + TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_VERSION=${OPENMPI_VERSION}") + TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}") + fi + # Perform docker build debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}" CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ." @@ -188,6 +201,19 @@ function test_container() die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" fi + # Test to check if horovod is installed successfully + debug "Test horovod in the container..." + if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") + ${HOROVOD_TEST_CMD} + if [[ $? == "0" ]]; then + echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" + else + die "FAIL: HOROVOD installation test in ${TEMP_IMAGE_NAME}" + fi + fi + + # Stop the running docker container sleep 1 "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID} diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh new file mode 100755 index 00000000000..d1b297726ed --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# install OpenMPI, OpenSSH and Horovod + +set -e + +apt-get clean && apt-get update -y + +# Install Open MPI +OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +echo "Installing OpenMPI version ${OPENMPI_VERSION}..." +echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." 
+ +mkdir /tmp/openmpi +cd /tmp/openmpi +curl -fSsL -O ${OPENMPI_DOWNLOAD_URL} +tar zxf ${OPENMPI_VERSION}.tar.gz +cd ${OPENMPI_VERSION} +./configure --enable-mpirun-prefix-by-default +make -j $(nproc) all +make install +ldconfig +cd / +rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real +echo '#!/bin/bash' > /usr/local/bin/mpirun +echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun +chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf + +#Check mpi version +echo 'OpenMPI version:' +mpirun --version + +# Install OpenSSH for MPI to communicate between containers +apt-get install -y --no-install-recommends --fix-missing openssh-client openssh-server libnuma-dev +mkdir -p /var/run/sshd +# Allow OpenSSH to talk to containers without asking for confirmation +cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new +echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new +mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +#Install Horovod +HOROVOD_WITH_TENSORFLOW=1 +python3 -m pip install --no-cache-dir horovod==0.19.1 From ff359d4a48aeb1905f767d32e7da1a2d01d4ce6a Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 2 Jun 2020 13:46:16 -0700 Subject: [PATCH 0113/1390] Setting default values for OpenMPI versions --- .../tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 2 +- .../ci_build/linux/mkl/install_openmpi_horovod.sh | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 3893f61d940..1fd54ff703f 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -61,7 +61,7 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ COPY install_openmpi_horovod.sh . RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} install_openmpi_horovod.sh && \ + ./install_openmpi_horovod.sh ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index d1b297726ed..4c8b04f6024 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -5,9 +5,16 @@ set -e apt-get clean && apt-get update -y +# Set default +if [[ $# -gt 1 ]]; then + OPENMPI_VERSION="${1}" + OPENMPI_DOWNLOAD_URL="${2}" +else + OPENMPI_VERSION=openmpi-2.1.1 + OPENMPI_DOWNLOAD_URL=https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz +fi + # Install Open MPI -OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} -OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} echo "Installing OpenMPI version ${OPENMPI_VERSION}..." echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." 
From aa5bfd35fa5292d820493483da8540f8a6386c5f Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 12:15:24 -0700 Subject: [PATCH 0114/1390] Added license to the shell script --- .../linux/mkl/install_openmpi_horovod.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 4c8b04f6024..0f5a670f0f2 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -1,5 +1,20 @@ #!/usr/bin/env bash -# install OpenMPI, OpenSSH and Horovod +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Install OpenMPI, OpenSSH and Horovod in Intel(R) MKL support +# Usage: install_openmpi_horovod.sh [openmpi version] [openmpi download url] set -e From 615d3ce1af92614a3285807caaf42f50acd66fae Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 13:14:39 -0700 Subject: [PATCH 0115/1390] Added install futures for backward compatibility --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 1fd54ff703f..f4ab7ba21c4 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -50,6 +50,9 @@ RUN ${PYTHON} set-build-env.py -p ${TARGET_PLATFORM} -f /root/.mkl.bazelrc \ # Pull the compiler flags we just wrote into root user's .bazelrc file RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc +# Install futures>=0.17.1 for Python2.7 compatibility mode +RUN ${PIP} install future>=0.17.1 + RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ From 7a048082c1c7aa9057c8448b77ee4cde069ec3a7 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 13:19:53 -0700 Subject: [PATCH 0116/1390] Removed extra line --- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index da647153cdb..7278724ff64 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -202,8 +202,8 @@ function test_container() fi # Test to check if horovod is installed successfully - debug "Test horovod in the container..." if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + debug "Test horovod in the container..." 
HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") ${HOROVOD_TEST_CMD} if [[ $? == "0" ]]; then @@ -213,7 +213,6 @@ function test_container() fi fi - # Stop the running docker container sleep 1 "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID} From 50f0ba885bc112b24b437c8a974c6a8deaace96b Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 20:05:47 -0700 Subject: [PATCH 0117/1390] Added parameter to pass horovod version and fixed comments --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 5 ++-- .../ci_build/linux/mkl/build-dev-container.sh | 3 +++ .../linux/mkl/install_openmpi_horovod.sh | 26 ++++++++----------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index f4ab7ba21c4..8a5a0a42050 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -18,6 +18,7 @@ ARG ENABLE_DNNL1="" ARG ENABLE_HOROVOD="" ARG OPENMPI_VERSION="" ARG OPENMPI_DOWNLOAD_URL="" +ARG HOROVOD_VERSION="" ENV DEBIAN_FRONTEND=noninteractive @@ -60,11 +61,11 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ rm -rf /root/.cache # Clean up Bazel cache when done. -#Install OpenMPI/Horovod +# Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ./install_openmpi_horovod.sh ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} && \ + ./install_openmpi_horovod.sh OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 7278724ff64..e9d7f1ff388 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -67,6 +67,7 @@ ENABLE_DNNL1=${ENABLE_DNNL1:-no} ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} +HOROVOD_VERSION=${HOROVOD_VERSION} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -88,6 +89,7 @@ debug "ENABLE_DNNL1=${ENABLE_DNNL1}" debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" debug "OPENMPI_VERSION=${OPENMPI_VERSION}" debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" +debug "HOROVOD_VERSION=${HOROVOD_VERSION}" function build_container() { @@ -142,6 +144,7 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_HOROVOD=${ENABLE_HOROVOD}") TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_VERSION=${OPENMPI_VERSION}") TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}") + TF_DOCKER_BUILD_ARGS+=("--build-arg HOROVOD_VERSION=${HOROVOD_VERSION}") fi # Perform docker build diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 0f5a670f0f2..b8d9739ceb6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,25 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# Install OpenMPI, OpenSSH and Horovod in Intel(R) MKL support -# Usage: install_openmpi_horovod.sh [openmpi version] [openmpi download url] +# Install OpenMPI, OpenSSH and Horovod during Intel(R) MKL container build +# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] [HOROVOD_VERSION=] set -e apt-get clean && apt-get update -y # Set default -if [[ $# -gt 1 ]]; then - OPENMPI_VERSION="${1}" - OPENMPI_DOWNLOAD_URL="${2}" -else - OPENMPI_VERSION=openmpi-2.1.1 - OPENMPI_DOWNLOAD_URL=https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz -fi +OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +HOROVOD_VERSION=${HOROVOD_VERSION:-0.19.1} # Install Open MPI -echo "Installing OpenMPI version ${OPENMPI_VERSION}..." -echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." +echo "Installing OpenMPI version ${OPENMPI_VERSION} ..." +echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL} ..." mkdir /tmp/openmpi cd /tmp/openmpi @@ -54,7 +50,7 @@ chmod a+x /usr/local/bin/mpirun # Configure OpenMPI to run good defaults: echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf -#Check mpi version +# Check mpi version echo 'OpenMPI version:' mpirun --version @@ -66,6 +62,6 @@ cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.ne echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config -#Install Horovod +# Install Horovod HOROVOD_WITH_TENSORFLOW=1 -python3 -m pip install --no-cache-dir horovod==0.19.1 +python3 -m pip install --no-cache-dir horovod==${HOROVOD_VERSION} From 1ed2ab4638c56b3cce6d0f85efeaea3600b75214 Mon Sep 17 00:00:00 2001 From: justkw Date: Thu, 4 Jun 2020 09:22:07 -0700 Subject: [PATCH 0118/1390] Adding parameter to use --nightly_flag to install specific packages if building the nightly build --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 5 +++-- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 8a5a0a42050..a78d13c7755 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -19,6 +19,7 @@ ARG ENABLE_HOROVOD="" ARG OPENMPI_VERSION="" ARG OPENMPI_DOWNLOAD_URL="" ARG HOROVOD_VERSION="" +ARG TF_NIGHTLY_FLAG="" ENV DEBIAN_FRONTEND=noninteractive @@ -56,8 +57,8 @@ RUN ${PIP} install future>=0.17.1 RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ - bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ - ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \ + bazel-bin/tensorflow/tools/pip_package/build_pip_package "${TF_NIGHTLY_FLAG}" "${WHL_DIR}" && \ + ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/*.whl && \ rm -rf /root/.cache # Clean up Bazel cache when done. 
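For illustration only, a sketch that is not part of the patch: when the new TF_NIGHTLY_FLAG build argument is set to --nightly_flag, build_pip_package builds the nightly pip package rather than the regular tensorflow wheel. In CI this argument is supplied through build-dev-container.sh, as shown below; the manual invocation and image tag here are hypothetical examples.

# Hypothetical manual build of a nightly devel-mkl image:
docker build --build-arg ENABLE_HOROVOD=yes \
    --build-arg TF_NIGHTLY_FLAG=--nightly_flag \
    -f Dockerfile.devel-mkl -t tf-devel-mkl-nightly-example .
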
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index e9d7f1ff388..83b3ebaf9c9 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -68,6 +68,7 @@ ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} +IS_NIGHTLY=${IS_NIGHTLY:-no} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -90,6 +91,7 @@ debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" debug "OPENMPI_VERSION=${OPENMPI_VERSION}" debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" debug "HOROVOD_VERSION=${HOROVOD_VERSION}" +debug "IS_NIGHTLY=${IS_NIGHTLY}" function build_container() { @@ -147,6 +149,11 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg HOROVOD_VERSION=${HOROVOD_VERSION}") fi + # Add build arg --nightly_flag for the nightly build + if [[ ${IS_NIGHTLY} == "yes" ]]; then + TF_DOCKER_BUILD_ARGS+=("--build-arg TF_NIGHTLY_FLAG=--nightly_flag") + fi + # Perform docker build debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}" CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ." From 83decf0d382b76771e2b1ad4fa43d208d5e40eb0 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 07:50:14 -0700 Subject: [PATCH 0119/1390] Support multiple OS --- .../ci_build/linux/mkl/install_openmpi_horovod.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index b8d9739ceb6..6044927d2ce 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -55,7 +55,18 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -apt-get install -y --no-install-recommends --fix-missing openssh-client openssh-server libnuma-dev +( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + libnuma-dev \ + openssh-server \ + openssh-client \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* ) || \ + ( yum -y update && yum -y install \ + numactl-devel \ + openssh-server \ + openssh-clients \ + yum clean all ) || \ + ( echo "Unsupported Linux distribution. Aborting!" 
&& exit 1 ) mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new From 371c2e6f4f3f233041eda2d292a13824d98d769f Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 08:52:49 -0700 Subject: [PATCH 0120/1390] Bug fix --- .../tools/ci_build/linux/mkl/install_openmpi_horovod.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 6044927d2ce..276d9945ab6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -58,13 +58,13 @@ mpirun --version ( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ libnuma-dev \ openssh-server \ - openssh-client \ + openssh-client && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ) || \ ( yum -y update && yum -y install \ numactl-devel \ openssh-server \ - openssh-clients \ + openssh-clients && \ yum clean all ) || \ ( echo "Unsupported Linux distribution. Aborting!" && exit 1 ) mkdir -p /var/run/sshd From da18384ad585b2d88a08119268c9a7134ee36bf5 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 09:51:25 -0700 Subject: [PATCH 0121/1390] Bug fix --- tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 276d9945ab6..b765dbd70a6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -58,7 +58,7 @@ mpirun --version ( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ libnuma-dev \ openssh-server \ - openssh-client && \ + openssh-clients && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ) || \ ( yum -y update && yum -y install \ From ab86bb82faabec7b1d29c61df1cae0b45d0b0e8e Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 9 Jun 2020 07:53:05 -0700 Subject: [PATCH 0122/1390] Bug fix --- .../linux/mkl/install_openmpi_horovod.sh | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index b765dbd70a6..aec40543a17 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -14,15 +14,14 @@ # limitations under the License. 
# ============================================================================== # Install OpenMPI, OpenSSH and Horovod during Intel(R) MKL container build -# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] [HOROVOD_VERSION=] +# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] +# [HOROVOD_VERSION=] set -e -apt-get clean && apt-get update -y - # Set default OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} -OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.gz} HOROVOD_VERSION=${HOROVOD_VERSION:-0.19.1} # Install Open MPI @@ -55,18 +54,20 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ - libnuma-dev \ - openssh-server \ - openssh-clients && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* ) || \ - ( yum -y update && yum -y install \ - numactl-devel \ - openssh-server \ - openssh-clients && \ - yum clean all ) || \ - ( echo "Unsupported Linux distribution. Aborting!" && exit 1 ) +apt-get clean && apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + openssh-client openssh-server libnuma-dev && \ + rm -rf /var/lib/apt/lists/* +if [[ $? == "0" ]]; then + echo "PASS: OpenSSH installation" +else + yum -y update && yum -y install numactl-devel openssh-server openssh-clients && \ + yum clean all + if [[ $? == "0" ]]; then + echo "PASS: OpenSSH installation" + else + echo "Unsupported Linux distribution. Aborting!" && exit 1 + fi +fi mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new From ae408bb512e614469e24ccf0db6c031f6aeac030 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 9 Jun 2020 18:00:47 -0700 Subject: [PATCH 0123/1390] remvoe trailing white space --- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 83b3ebaf9c9..6e789a54e87 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -214,7 +214,7 @@ function test_container() # Test to check if horovod is installed successfully if [[ ${ENABLE_HOROVOD} == "yes" ]]; then debug "Test horovod in the container..." - HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") + HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") ${HOROVOD_TEST_CMD} if [[ $? == "0" ]]; then echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" From 50403ea24fa2a4a9eda2a74cfc1879ab396eb8eb Mon Sep 17 00:00:00 2001 From: Prashant Kumar Date: Tue, 2 Jun 2020 09:40:47 +0000 Subject: [PATCH 0124/1390] [MLIR]Add conversions dot op from LHLO to Affine Add conversions from MLIR Lhlo dot op to affine loops.These conversions are run as part of -lhlo-legalize-to-affine pass. 
Signed-off-by: Prashant Kumar --- .../xla/tests/lhlo-legalize-to-affine.mlir | 36 ++++++++++ .../xla/transforms/lhlo_legalize_to_affine.cc | 70 ++++++++++++++++--- .../xla/transforms/map_xla_to_scalar_op.h | 24 +++++++ 3 files changed, 122 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir index aaf65b5a38a..483204cf0d5 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir @@ -143,3 +143,39 @@ func @int_sub_op(%lhs: memref<7xi32>, %rhs: memref<7xi32>, : (memref<7xi32>, memref<7xi32>, memref<7xi32>) -> () return } + +// Dot tests. +// CHECK-LABEL: func @float_dot_op +func @float_dot_op(%lhs: memref<7x3xf32>, %rhs: + memref<3x4xf32>, %result: memref<7x4xf32> ) -> () { + // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 7 { + // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 4 { + // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 3 { + // CHECK-NEXT: %[[LHS:.*]] = affine.load %{{.*}}[%[[I]], %[[K]]] : memref<7x3xf32> + // CHECK-NEXT: %[[RHS:.*]] = affine.load %{{.*}}[%[[K]], %[[J]]] : memref<3x4xf32> + // CHECK-NEXT: %[[RESULT:.*]] = affine.load %{{.*}}[%[[I]], %[[J]]] : memref<7x4xf32> + // CHECK-NEXT: %[[MULT:.*]] = mulf %[[LHS]], %[[RHS]] : f32 + // CHECK-NEXT: %[[ADD:.*]] = addf %[[MULT]], %[[RESULT]] : f32 + // CHECK-NEXT: affine.store %[[ADD]], %{{.*}}[%[[I]], %[[J]]] : memref<7x4xf32> + // CHECK: return + "xla_lhlo.dot"(%lhs, %rhs, %result) : + (memref<7x3xf32>, memref<3x4xf32>, memref<7x4xf32>) -> () + return +} +// CHECK-LABEL: func @int_dot_op +func @int_dot_op(%lhs: memref<7x3xi32>, %rhs: + memref<3x4xi32>, %result: memref<7x4xi32> ) -> () { + // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 7 { + // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 4 { + // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 3 { + // CHECK-NEXT: %[[LHS:.*]] = affine.load %{{.*}}[%[[I]], %[[K]]] : memref<7x3xi32> + // CHECK-NEXT: %[[RHS:.*]] = affine.load %{{.*}}[%[[K]], %[[J]]] : memref<3x4xi32> + // CHECK-NEXT: %[[RESULT:.*]] = affine.load %{{.*}}[%[[I]], %[[J]]] : memref<7x4xi32> + // CHECK-NEXT: %[[MULT:.*]] = muli %[[LHS]], %[[RHS]] : i32 + // CHECK-NEXT: %[[ADD:.*]] = addi %[[MULT]], %[[RESULT]] : i32 + // CHECK-NEXT: affine.store %[[ADD]], %{{.*}}[%[[I]], %[[J]]] : memref<7x4xi32> + // CHECK: return + "xla_lhlo.dot"(%lhs, %rhs, %result) : + (memref<7x3xi32>, memref<3x4xi32>, memref<7x4xi32>) -> () + return +} diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index f7f5537f882..4cc77292494 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -17,13 +17,13 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h" @@ -31,6 +31,59 @@ namespace mlir { namespace xla_lhlo { namespace { +struct DotOpConverter : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + // Supports only rank-2 tensors for LHS and RHS. + LogicalResult matchAndRewrite(DotOp op, + PatternRewriter& rewriter) const override { + Value lhs = op.lhs(); + Value rhs = op.rhs(); + MemRefType lhs_type = lhs.getType().cast(); + MemRefType rhs_type = rhs.getType().cast(); + Type element_type = lhs_type.getElementType(); + ArrayRef shape_lhs = lhs_type.getShape(); + ArrayRef shape_rhs = rhs_type.getShape(); + + if ((lhs_type.getRank() != 2) || (rhs_type.getRank() != 2)) { + return failure(); + } + SmallVector lhs_indices, rhs_indices, result_indices; + const auto& loc = op.getLoc(); + + // Create the canonical ijk form of matmul. + auto forOp = rewriter.create(loc, 0, shape_lhs[0]); + lhs_indices.push_back(forOp.getInductionVar()); + result_indices.push_back(forOp.getInductionVar()); + + rewriter.setInsertionPointToStart(forOp.getBody()); + forOp = rewriter.create(loc, 0, shape_rhs.back()); + result_indices.push_back(forOp.getInductionVar()); + rhs_indices.resize(2); + rhs_indices[1] = forOp.getInductionVar(); + + rewriter.setInsertionPointToStart(forOp.getBody()); + forOp = rewriter.create(loc, 0, shape_rhs.front()); + lhs_indices.push_back(forOp.getInductionVar()); + rhs_indices[0] = forOp.getInductionVar(); + + // Construct the innermost loop body. 
+ rewriter.setInsertionPointToStart(forOp.getBody()); + auto l = rewriter.create(loc, lhs, lhs_indices); + auto r = rewriter.create(loc, rhs, rhs_indices); + auto result = + rewriter.create(loc, op.output(), result_indices); + Value op_result = xla_lhlo::XlaOpToStdScalarOp::map( + op, element_type, {l, r, result}, &rewriter); + if (op_result == nullptr) { + return failure(); + } + rewriter.create(loc, op_result, op.output(), result_indices); + rewriter.eraseOp(op); + return success(); + } +}; + template struct BinaryOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -77,7 +130,8 @@ void populateLHLOToAffineConversionPattern(MLIRContext* context, BinaryOpConverter, BinaryOpConverter, BinaryOpConverter, - BinaryOpConverter>(context); + BinaryOpConverter, + DotOpConverter>(context); // clang-format on } diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index c317dc36b3c..07d4d3dd138 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -287,6 +287,30 @@ inline Value MapLhloOpToStdScalarOp( return nullptr; } +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + // Dot Op converter from lhlo to affine only accepts float and integer types. + const auto& lhs = args[0]; + const auto& rhs = args[1]; + const auto& result = args[2]; + Type element_type = lhs.getType(); + if (element_type.isa()) { + Value float_mul = MapLhloOpToStdScalarOpImpl{}( + loc, result_types, {lhs, rhs}, b); + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, {float_mul, result}, b); + } + if (element_type.isa()) { + Value int_mul = MapLhloOpToStdScalarOpImpl{}( + loc, result_types, {lhs, rhs}, b); + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, {int_mul, result}, b); + } + return nullptr; +} + template <> inline Value MapLhloOpToStdScalarOp( Location loc, ArrayRef result_types, ArrayRef args, From 77e5fb550de9aeed3a3454de06f1a7571d5e5ba3 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Wed, 10 Jun 2020 10:47:54 +0200 Subject: [PATCH 0125/1390] Fix compile errors by using the new MicroAllocator::Create() --- tensorflow/lite/micro/micro_allocator_test.cc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index a71115dc67b..f52742ca723 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -288,9 +288,9 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator allocator(&context, model, arena, arena_size, - micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( + &context, model, arena, arena_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); // Since all of the tensors are online planned and the model structure is // identical to that in TestAllocationForModelsWithBranches, @@ -337,9 +337,9 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator allocator(&context, 
model, arena, arena_size, - micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( + &context, model, arena, arena_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); @@ -382,9 +382,9 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator allocator(&context, model, arena, arena_size, - micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( + &context, model, arena, arena_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); @@ -430,9 +430,9 @@ TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator allocator(&context, model, arena, arena_size, - micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); + tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( + &context, model, arena, arena_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); From 764e2641e05e58ccd7ffdfcff0307f21fbb3fed0 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Wed, 10 Jun 2020 09:15:38 -0700 Subject: [PATCH 0126/1390] Addressing review comments --- tensorflow/core/kernels/BUILD | 1 - tensorflow/core/kernels/mkl_conv_ops_test.cc | 23 +-- tensorflow/core/kernels/mkl_relu_op_test.cc | 200 ++++++++----------- tensorflow/core/util/mkl_util.h | 15 ++ 4 files changed, 103 insertions(+), 136 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index a68eeb4479b..5abedf441a1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8124,7 +8124,6 @@ tf_cc_test_mkl( "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", - "//tensorflow/core:test_main", "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/core/kernels/mkl_conv_ops_test.cc b/tensorflow/core/kernels/mkl_conv_ops_test.cc index 9d11b0fb006..7e7b78e004d 100644 --- a/tensorflow/core/kernels/mkl_conv_ops_test.cc +++ b/tensorflow/core/kernels/mkl_conv_ops_test.cc @@ -103,20 +103,6 @@ static Tensor GetFilterSizesTensor(const Conv2DDimensions& dims) { dims.input_depth, dims.filter_count}); } -#if defined(INTEL_MKL_DNN_ONLY) -static Tensor NonMklTensor() { - MklDnnShape non_mkl_shape; - non_mkl_shape.SetMklTensor(false); - - auto size = static_cast(non_mkl_shape.GetSerializeBufferSize()); - Tensor tensor(DT_UINT8, {size}); - - non_mkl_shape.SerializeMklDnnShape(tensor.flat().data(), - size * sizeof(uint8)); - return tensor; -} -#endif - static Graph* DefaultConv2D(const Conv2DDimensions& dims) { auto* graph = new Graph(OpRegistry::Global()); @@ -148,7 +134,8 @@ static Graph* MklConv2D(const Conv2DDimensions& dims) { Node* input = test::graph::Constant(graph, input_t, "input"); Node* 
filter = test::graph::Constant(graph, filter_t, "filter"); - Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + Node* not_mkl_shape = + test::graph::Constant(graph, GetMklMetaTensor(), "not_mkl"); Node* conv2d; TF_CHECK_OK(NodeBuilder(graph->NewName("mkl_conv_2d"), "_MklConv2D") @@ -207,7 +194,8 @@ static Graph* MklConv2DBwdInput(const Conv2DDimensions& dims) { Node* out_backprop = test::graph::Constant(graph, out_backprop_t, "out_backprop"); - Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + Node* not_mkl_shape = + test::graph::Constant(graph, GetMklMetaTensor(), "not_mkl"); Node* conv2d_bwd_input; TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_input"), @@ -271,7 +259,8 @@ static Graph* MklConv2DBwdFilter(const Conv2DDimensions& dims) { Node* out_backprop = test::graph::Constant(graph, out_backprop_t, "out_backprop"); - Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl"); + Node* not_mkl_shape = + test::graph::Constant(graph, GetMklMetaTensor(), "not_mkl"); Node* conv2d_bwd_filter; TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_filter"), diff --git a/tensorflow/core/kernels/mkl_relu_op_test.cc b/tensorflow/core/kernels/mkl_relu_op_test.cc index 7a3dffef0de..30f75cd23df 100644 --- a/tensorflow/core/kernels/mkl_relu_op_test.cc +++ b/tensorflow/core/kernels/mkl_relu_op_test.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#undef INTEL_MKL - #ifdef INTEL_MKL +#include "absl/strings/match.h" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -27,6 +26,10 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/stacktrace_handler.h" +#include "tensorflow/core/platform/str_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/public/session.h" @@ -37,133 +40,72 @@ limitations under the License. // Compare performance of default Tensorflow convolution kernels (Eigen) with // MKL kernels on CPU. -// Before running these benchmarks configure OpenMP environment variables: -// export KMP_BLOCKTIME=0 -// export OMP_NUM_THREADS=${num_threads} - namespace tensorflow { -static Tensor NonMklTensor() { - MklDnnShape non_mkl_shape; - non_mkl_shape.SetMklTensor(false); - auto size = static_cast(non_mkl_shape.GetSerializeBufferSize()); - Tensor tensor(DT_UINT8, {size}); +static Graph* Activation(const string& op_name, const string& kind, + const TensorShape& shape) { + auto* graph = new Graph(OpRegistry::Global()); + const string node_name = kind + "_" + op_name; + const bool isForwardOp = !tensorflow::str_util::EndsWith(op_name, "Grad"); + const bool isDefault = (kind == "Default"); - non_mkl_shape.SerializeMklDnnShape(tensor.flat().data(), - size * sizeof(uint8)); - return tensor; + Tensor input_t(DT_FLOAT, shape); + input_t.flat().setRandom(); + Node* input = test::graph::Constant(graph, input_t, "input"); + Node* not_mkl_shape = + test::graph::Constant(graph, GetMklMetaTensor(), "not_mkl"); + + if (isForwardOp) { + // Default forward op. 
+ if (isDefault) { + TF_CHECK_OK(NodeBuilder(graph->NewName(node_name), op_name) + .Input(input) + .Attr("T", DT_FLOAT) + .Finalize(graph, nullptr)); + return graph; + } + // MKL forward op. + TF_CHECK_OK(NodeBuilder(graph->NewName(node_name), "_Mkl" + op_name) + .Input(input) + .Input(not_mkl_shape) + .Attr("T", DT_FLOAT) + .Attr("_kernel", "MklLayoutDependentOp") + .Finalize(graph, nullptr)); + return graph; + } + + // Default backward op. + Tensor grad_t(DT_FLOAT, shape); + grad_t.flat().setRandom(); + Node* grad = test::graph::Constant(graph, grad_t, "grad"); + if (isDefault) { + TF_CHECK_OK(NodeBuilder(graph->NewName(node_name), op_name) + .Input(grad) + .Input(input) + .Attr("T", DT_FLOAT) + .Finalize(graph, nullptr)); + return graph; + } + + // MKL backward op. + TF_CHECK_OK(NodeBuilder(graph->NewName(node_name), "_Mkl" + op_name) + .Input(grad) + .Input(input) + .Input(not_mkl_shape) + .Input(not_mkl_shape) + .Attr("T", DT_FLOAT) + .Attr("_kernel", "MklLayoutDependentOp") + .Finalize(graph, nullptr)); + return graph; } -static Tensor GetRandomTensor(const TensorShape& shape) { - Tensor tensor(DT_FLOAT, TensorShape(shape)); - tensor.flat() = tensor.flat().setRandom(); - return tensor; -} - -#define CREATE_DEFAULT_FWD_OP(NODE_NAME, OP_NAME) \ - static Graph* NODE_NAME(const TensorShape& shape) { \ - auto* graph = new Graph(OpRegistry::Global()); \ - Tensor input_t = GetRandomTensor(shape); \ - Node* input = test::graph::Constant(graph, input_t, "input"); \ - Node* op; \ - TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ - .Input(input) \ - .Attr("T", DT_FLOAT) \ - .Finalize(graph, &op)); \ - return graph; \ - } -CREATE_DEFAULT_FWD_OP(Default_Tanh, Tanh) -CREATE_DEFAULT_FWD_OP(Default_Elu, Elu) -CREATE_DEFAULT_FWD_OP(Default_Relu, Relu) -CREATE_DEFAULT_FWD_OP(Default_Relu6, Relu6) -CREATE_DEFAULT_FWD_OP(Default_LeakyRelu, LeakyRelu) - -#define CREATE_DEFAULT_BWD_OP(NODE_NAME, OP_NAME) \ - static Graph* NODE_NAME(const TensorShape& shape) { \ - auto* graph = new Graph(OpRegistry::Global()); \ - Tensor input_t = GetRandomTensor(shape); \ - Node* input = test::graph::Constant(graph, input_t, "input"); \ - Tensor grad_t = GetRandomTensor(shape); \ - Node* grad = test::graph::Constant(graph, grad_t, "grad"); \ - Node* op; \ - TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ - .Input(grad) \ - .Input(input) \ - .Attr("T", DT_FLOAT) \ - .Finalize(graph, &op)); \ - return graph; \ - } -CREATE_DEFAULT_BWD_OP(Default_TanhGrad, TanhGrad) -CREATE_DEFAULT_BWD_OP(Default_EluGrad, EluGrad) -CREATE_DEFAULT_BWD_OP(Default_ReluGrad, ReluGrad) -CREATE_DEFAULT_BWD_OP(Default_Relu6Grad, Relu6Grad) -CREATE_DEFAULT_BWD_OP(Default_LeakyReluGrad, LeakyReluGrad) - -#define CREATE_MKL_FWD_OP(NODE_NAME, OP_NAME) \ - static Graph* NODE_NAME(const TensorShape& shape) { \ - auto* graph = new Graph(OpRegistry::Global()); \ - \ - Tensor input_t = GetRandomTensor(shape); \ - Node* input = test::graph::Constant(graph, input_t, "input"); \ - \ - Node* not_mkl_shape = \ - test::graph::Constant(graph, NonMklTensor(), "not_mkl"); \ - \ - Node* op; \ - TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ - .Input(input) \ - .Input(not_mkl_shape) \ - .Attr("T", DT_FLOAT) \ - .Attr("_kernel", "MklLayoutDependentOp") \ - .Finalize(graph, &op)); \ - \ - return graph; \ - } - -CREATE_MKL_FWD_OP(Mkl_Tanh, _MklTanh) -CREATE_MKL_FWD_OP(Mkl_Elu, _MklElu) -CREATE_MKL_FWD_OP(Mkl_Relu, _MklRelu) -CREATE_MKL_FWD_OP(Mkl_Relu6, _MklRelu6) -CREATE_MKL_FWD_OP(Mkl_LeakyRelu, _MklLeakyRelu) - -#define 
CREATE_MKL_BWD_OP(NODE_NAME, OP_NAME) \ - static Graph* NODE_NAME(const TensorShape& shape) { \ - auto* graph = new Graph(OpRegistry::Global()); \ - \ - Tensor input_t = GetRandomTensor(shape); \ - Node* input = test::graph::Constant(graph, input_t, "input"); \ - Tensor grad_t = GetRandomTensor(shape); \ - Node* grad = test::graph::Constant(graph, grad_t, "grad"); \ - \ - Node* not_mkl_shape = \ - test::graph::Constant(graph, NonMklTensor(), "not_mkl"); \ - \ - Node* op; \ - TF_CHECK_OK(NodeBuilder(graph->NewName(#NODE_NAME), #OP_NAME) \ - .Input(grad) \ - .Input(input) \ - .Input(not_mkl_shape) \ - .Input(not_mkl_shape) \ - .Attr("T", DT_FLOAT) \ - .Attr("_kernel", "MklLayoutDependentOp") \ - .Finalize(graph, &op)); \ - \ - return graph; \ - } - -CREATE_MKL_BWD_OP(Mkl_TanhGrad, _MklTanhGrad) -CREATE_MKL_BWD_OP(Mkl_EluGrad, _MklEluGrad) -CREATE_MKL_BWD_OP(Mkl_ReluGrad, _MklReluGrad) -CREATE_MKL_BWD_OP(Mkl_Relu6Grad, _MklRelu6Grad) -CREATE_MKL_BWD_OP(Mkl_LeakyReluGrad, _MklLeakyReluGrad) - #define BM_Activation(op, kind, A, B, C, D, type) \ static void BM_##op##_##kind##_##type##_##A##_##B##_##C##_##D(int iters) { \ int64 num_computed_elements = (A) * (B) * (C) * (D); \ int64 flops_per_iter = num_computed_elements; \ testing::ItemsProcessed(static_cast(iters) * flops_per_iter); \ \ - test::Benchmark(#type, kind##_##op({A, B, C, D})).Run(iters); \ + test::Benchmark(#type, Activation(#op, #kind, {A, B, C, D})).Run(iters); \ } \ BENCHMARK(BM_##op##_##kind##_##type##_##A##_##B##_##C##_##D) @@ -190,4 +132,26 @@ TEST_ALL_SIZES(LeakyReluGrad) } // namespace tensorflow +// -------------------------------------------------------------------------- + +GTEST_API_ int main(int argc, char** argv) { + // Sets OpenMP environment variables. + // TODO(intel-tf): Remove this when OpenMP is removed. 
+ tensorflow::setenv("KMP_BLOCKTIME", "0", true /*overwrite*/); + tensorflow::setenv("OMP_NUM_THREADS", + std::to_string(tensorflow::port::MaxParallelism()).c_str(), + true /*overwrite*/); + + tensorflow::testing::InstallStacktraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + for (int i = 1; i < argc; i++) { + if (absl::StartsWith(argv[i], "--benchmarks=")) { + const char* pattern = argv[i] + strlen("--benchmarks="); + tensorflow::testing::Benchmark::Run(pattern); + return 0; + } + } + return RUN_ALL_TESTS(); +} + #endif // INTEL_MKL diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 7f6272b09c1..884f23b23c7 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1028,6 +1028,21 @@ inline void ForwardMklMetaDataInToOut(OpKernelContext* context, } } +// ------------------------------------------------------------------- +// Common utility functions used by MKL unit tests + +inline Tensor GetMklMetaTensor() { + MklDnnShape non_mkl_shape; + non_mkl_shape.SetMklTensor(false); + + auto size = static_cast(non_mkl_shape.GetSerializeBufferSize()); + Tensor tensor(DT_UINT8, {size}); + + non_mkl_shape.SerializeMklDnnShape(tensor.flat().data(), + size * sizeof(uint8)); + return tensor; +} + // ------------------------------------------------------------------- /// Return MKL-DNN data type (memory::data_type) for input type T From 565490c3387dceab0c1eb0c9480a28215a30f779 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Wed, 10 Jun 2020 12:45:28 -0700 Subject: [PATCH 0127/1390] Fixing unit test failure in eigen build --- tensorflow/core/kernels/BUILD | 1 + tensorflow/core/kernels/mkl_relu_op_test.cc | 22 --------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 5abedf441a1..a68eeb4479b 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8124,6 +8124,7 @@ tf_cc_test_mkl( "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", + "//tensorflow/core:test_main", "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/core/kernels/mkl_relu_op_test.cc b/tensorflow/core/kernels/mkl_relu_op_test.cc index 30f75cd23df..0949b30cab7 100644 --- a/tensorflow/core/kernels/mkl_relu_op_test.cc +++ b/tensorflow/core/kernels/mkl_relu_op_test.cc @@ -132,26 +132,4 @@ TEST_ALL_SIZES(LeakyReluGrad) } // namespace tensorflow -// -------------------------------------------------------------------------- - -GTEST_API_ int main(int argc, char** argv) { - // Sets OpenMP environment variables. - // TODO(intel-tf): Remove this when OpenMP is removed. 
- tensorflow::setenv("KMP_BLOCKTIME", "0", true /*overwrite*/); - tensorflow::setenv("OMP_NUM_THREADS", - std::to_string(tensorflow::port::MaxParallelism()).c_str(), - true /*overwrite*/); - - tensorflow::testing::InstallStacktraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - for (int i = 1; i < argc; i++) { - if (absl::StartsWith(argv[i], "--benchmarks=")) { - const char* pattern = argv[i] + strlen("--benchmarks="); - tensorflow::testing::Benchmark::Run(pattern); - return 0; - } - } - return RUN_ALL_TESTS(); -} - #endif // INTEL_MKL From eef7be73987ab556e954260cbef3e82ccbcee8e8 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Wed, 10 Jun 2020 12:52:23 -0700 Subject: [PATCH 0128/1390] Adding comment back --- tensorflow/core/kernels/mkl_relu_op_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/mkl_relu_op_test.cc b/tensorflow/core/kernels/mkl_relu_op_test.cc index 0949b30cab7..6c0ad6facab 100644 --- a/tensorflow/core/kernels/mkl_relu_op_test.cc +++ b/tensorflow/core/kernels/mkl_relu_op_test.cc @@ -39,6 +39,9 @@ limitations under the License. // Compare performance of default Tensorflow convolution kernels (Eigen) with // MKL kernels on CPU. +// Before running these benchmarks configure OpenMP environment variables: +// export KMP_BLOCKTIME=0 +// export OMP_NUM_THREADS=${num_threads} namespace tensorflow { From 02c6b9edaf14fb7ca35c2b4f0bc80b74cc8a551a Mon Sep 17 00:00:00 2001 From: jerryyin Date: Wed, 10 Jun 2020 21:11:38 +0000 Subject: [PATCH 0129/1390] [ROCm][mlir] Disable mlir saved model test --- .../mlir/tensorflow/tests/tf_saved_model/build_defs.bzl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl index 594afa10453..95ad05aa1e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl @@ -4,8 +4,6 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "lit_test") def tf_saved_model_test(name, data, tags = None): """Create a SavedModel test.""" - if tags == None: - tags = ["no_rocm"] native.py_binary( name = name, testonly = 1, @@ -26,5 +24,5 @@ def tf_saved_model_test(name, data, tags = None): name = name + ".py", data = [name] + data, driver = "@llvm-project//mlir:run_lit.sh", - tags = tags, + tags = tags + ["no_rocm"], ) From 3b95c2c54df8a7bc3641871197262841b803f8cd Mon Sep 17 00:00:00 2001 From: "902449@58880@bigcat_chen@ASIC" Date: Thu, 11 Jun 2020 13:54:16 +0800 Subject: [PATCH 0130/1390] Correcting for PR comments --- .../himax_we1_evb/detection_responder.cc | 1 - .../himax_we1_evb/image_provider.cc | 10 +- .../himax_we1_evb/main_functions.cc | 126 ------------------ .../lite/micro/himax_we1_evb/debug_log.cc | 5 +- 4 files changed, 6 insertions(+), 136 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc index a353dc8a9b8..ae5de962fd3 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc @@ -22,7 +22,6 @@ limitations under the 
License. // should implement their own versions of this function. void RespondToDetection(tflite::ErrorReporter* error_reporter, int8_t person_score, int8_t no_person_score) { - if (person_score > no_person_score) { hx_drv_led_on(HX_DRV_LED_GREEN); } else { diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc index 727d93c61d1..4a3ab5775be 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc @@ -21,14 +21,12 @@ limitations under the License. hx_drv_sensor_image_config_t g_pimg_config; - TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int image_width, int image_height, int channels, int8_t* image_data) { static bool is_initialized = false; if (!is_initialized) { - if(hx_drv_sensor_initial(&g_pimg_config)!= HX_DRV_LIB_PASS) - { + if (hx_drv_sensor_initial(&g_pimg_config) != HX_DRV_LIB_PASS) { return kTfLiteError; } is_initialized = true; @@ -36,9 +34,9 @@ TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int image_width, hx_drv_sensor_capture(&g_pimg_config); - hx_drv_image_rescale((uint8_t*)g_pimg_config.raw_address, g_pimg_config.img_width, g_pimg_config.img_height, - image_data, image_width, image_height); - + hx_drv_image_rescale((uint8_t*)g_pimg_config.raw_address, + g_pimg_config.img_width, g_pimg_config.img_height, + image_data, image_width, image_height); return kTfLiteOk; } diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc deleted file mode 100644 index f0c7a405974..00000000000 --- a/tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/main_functions.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/examples/person_detection_experimental/main_functions.h" - -#include "tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.h" -#include "tensorflow/lite/micro/examples/person_detection_experimental/image_provider.h" -#include "tensorflow/lite/micro/examples/person_detection_experimental/model_settings.h" -#include "tensorflow/lite/micro/examples/person_detection_experimental/person_detect_model_data.h" -#include "tensorflow/lite/micro/kernels/micro_ops.h" -#include "tensorflow/lite/micro/micro_error_reporter.h" -#include "tensorflow/lite/micro/micro_interpreter.h" -#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" -#include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" - -// Globals, used for compatibility with Arduino-style sketches. 
-namespace { -tflite::ErrorReporter* error_reporter = nullptr; -const tflite::Model* model = nullptr; -tflite::MicroInterpreter* interpreter = nullptr; -TfLiteTensor* input = nullptr; - -// In order to use optimized tensorflow lite kernels, a signed int8 quantized -// model is preferred over the legacy unsigned model format. This means that -// throughout this project, input images must be converted from unisgned to -// signed format. The easiest and quickest way to convert from unsigned to -// signed 8-bit integers is to subtract 128 from the unsigned value to get a -// signed value. - -// An area of memory to use for input, output, and intermediate arrays. -constexpr int kTensorArenaSize = 125 * 1024; -#pragma Bss(".tensor_arena") -static uint8_t tensor_arena[kTensorArenaSize]; -#pragma Bss() -} // namespace - -// The name of this function is important for Arduino compatibility. -void setup() { - // Set up logging. Google style is to avoid globals or statics because of - // lifetime uncertainty, but since this has a trivial destructor it's okay. - // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroErrorReporter micro_error_reporter; - error_reporter = µ_error_reporter; - - // Map the model into a usable data structure. This doesn't involve any - // copying or parsing, it's a very lightweight operation. - model = tflite::GetModel(g_person_detect_model_data); - if (model->version() != TFLITE_SCHEMA_VERSION) { - TF_LITE_REPORT_ERROR(error_reporter, - "Model provided is schema version %d not equal " - "to supported version %d.", - model->version(), TFLITE_SCHEMA_VERSION); - return; - } - - // Pull in only the operation implementations we need. - // This relies on a complete list of all the ops needed by this graph. - // An easier approach is to just use the AllOpsResolver, but this will - // incur some penalty in code space for op implementations that are not - // needed by this graph. - // - // tflite::AllOpsResolver resolver; - // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroMutableOpResolver<5> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, - tflite::ops::micro::Register_RESHAPE()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); - - // Build an interpreter to run the model with. - // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroInterpreter static_interpreter( - model, micro_op_resolver, tensor_arena, kTensorArenaSize, error_reporter); - interpreter = &static_interpreter; - - // Allocate memory from the tensor_arena for the model's tensors. - TfLiteStatus allocate_status = interpreter->AllocateTensors(); - if (allocate_status != kTfLiteOk) { - TF_LITE_REPORT_ERROR(error_reporter, "AllocateTensors() failed"); - return; - } - - // Get information about the memory area to use for the model's input. - input = interpreter->input(0); -} - -// The name of this function is important for Arduino compatibility. -void loop() { - // Get image from provider. 
- if (kTfLiteOk != GetImage(error_reporter, kNumCols, kNumRows, kNumChannels, - input->data.int8)) { - TF_LITE_REPORT_ERROR(error_reporter, "Image capture failed."); - } - - // Run the model on this input and make sure it succeeds. - if (kTfLiteOk != interpreter->Invoke()) { - TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed."); - } - - TfLiteTensor* output = interpreter->output(0); - - // Process the inference results. - int8_t person_score = output->data.uint8[kPersonIndex]; - int8_t no_person_score = output->data.uint8[kNotAPersonIndex]; - RespondToDetection(error_reporter, person_score, no_person_score); -} diff --git a/tensorflow/lite/micro/himax_we1_evb/debug_log.cc b/tensorflow/lite/micro/himax_we1_evb/debug_log.cc index 32af2625630..36ac3f3fa03 100644 --- a/tensorflow/lite/micro/himax_we1_evb/debug_log.cc +++ b/tensorflow/lite/micro/himax_we1_evb/debug_log.cc @@ -20,12 +20,11 @@ limitations under the License. #include "tensorflow/lite/micro/debug_log.h" #include "hx_drv_tflm.h" - extern "C" void DebugLog(const char* s) { static bool is_initialized = false; if (!is_initialized) { - hx_drv_uart_initial(); - is_initialized = true; + hx_drv_uart_initial(); + is_initialized = true; } hx_drv_uart_print("%s", s); From b34536e5784f6d87a8905e73021f75a5cf98077a Mon Sep 17 00:00:00 2001 From: stjohnso98 <44154075+stjohnso98@users.noreply.github.com> Date: Thu, 11 Jun 2020 14:44:04 +0530 Subject: [PATCH 0131/1390] Update tensorflow/core/kernels/data/experimental/csv_dataset_op.cc Co-authored-by: Rachel Lim --- .../kernels/data/experimental/csv_dataset_op.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc index d2023ecec6e..906e57066a2 100644 --- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc @@ -64,10 +64,15 @@ class CSVDatasetOp : public DatasetOpKernel { OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, errors::InvalidArgument("`select_cols` must be a vector.")); - const Tensor* exclude_cols_tensor = new const Tensor(); - if (op_version_ > 1) { - OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); - } +std::vector exclude_cols; +if (op_version_ > 1) { + const Tensor* exclude_cols_tensor; + OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); + exclude_cols.reserve(exclude_cols_tensor->NumElements()); + for (int i = 0; i < exclude_cols_tensor->NumElements(); ++i) { + exclude_cols.push_back(exclude_cols_tensor->flat()(i)); + } +} OP_REQUIRES(ctx, exclude_cols_tensor->dims() == 1, errors::InvalidArgument("`exclude_cols` must be a vector")); From 5cbd0bcf412c56c6610c24ae12c83840dc9724a6 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Thu, 11 Jun 2020 01:03:25 +0000 Subject: [PATCH 0132/1390] [-Wsign-compare] batch resolution 1 --- .../quantization/import_quant_stats_pass.cc | 4 +-- .../lite/quantization/quantization_config.cc | 4 +-- .../lite/quantization/quantization_driver.cc | 4 +-- .../lite/quantization/quantization_utils.cc | 10 +++---- .../mlir/tensorflow/utils/dump_mlir_util.cc | 2 +- tensorflow/compiler/mlir/xla/ir/chlo_ops.cc | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 6 ++-- tensorflow/compiler/xla/window_util.cc | 2 +- tensorflow/core/kernels/batch_kernels.cc | 6 ++-- .../core/kernels/data/prefetch_autotuner.cc | 4 +-- tensorflow/core/kernels/quantization_utils.h | 2 +- 
tensorflow/core/platform/s3/s3_file_system.cc | 2 +- .../core/profiler/utils/derived_timeline.cc | 2 +- .../core/profiler/utils/derived_timeline.h | 2 +- .../core/profiler/utils/xplane_utils.cc | 2 +- tensorflow/core/util/bcast.h | 4 +-- .../convert_trivial_tile_to_concat.cc | 2 +- .../convert_trivial_transpose_to_reshape.cc | 2 +- .../toco/graph_transformations/dequantize.cc | 2 +- .../graph_transformations/drop_fake_quant.cc | 2 +- ...int8_weights_safe_for_fast_int8_kernels.cc | 2 +- .../fuse_broadcast_into_following_binary.cc | 2 +- .../group_bidirectional_sequence_ops.cc | 4 +-- .../graph_transformations/hardcode_min_max.cc | 2 +- .../identify_nearest_upsample.cc | 2 +- .../merge_reshape_into_preceding_transpose.cc | 4 +-- .../propagate_array_data_types.cc | 2 +- .../propagate_fake_quant_num_bits.cc | 2 +- .../propagate_fixed_sizes.cc | 28 +++++++++---------- .../remove_successive_transpose.cc | 10 +++---- .../remove_trivial_passthrough.cc | 2 +- .../reorder_elementwise_unary.cc | 4 +-- .../reorder_reshape_transpose.cc | 12 ++++---- .../resolve_batch_normalization.cc | 10 +++---- .../resolve_constant_concatenation.cc | 2 +- .../resolve_constant_pack.cc | 2 +- .../resolve_constant_slice.cc | 2 +- .../resolve_constant_transpose.cc | 2 +- .../resolve_constant_unary.cc | 4 +-- .../unpartition_embedding_lookup.cc | 4 +-- tensorflow/lite/toco/model_cmdline_flags.cc | 8 +++--- tensorflow/lite/toco/toco_cmdline_flags.cc | 2 +- 42 files changed, 89 insertions(+), 89 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index d924a3e82ac..5419a0d5e1b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -76,7 +76,7 @@ class ImportQuantStatsPass // If the index is out of range, this method returns false. Otherwise it // returns true if the value is a float tensor. 
bool IsQuantizableResult(Operation *op, int index) { - if (index < 0 || index >= op->getNumResults()) return false; + if (index < 0 || index >= static_cast(op->getNumResults())) return false; Value res = op->getResult(index); return res.getType().isa() && res.getType().cast().getElementType().isa(); @@ -158,7 +158,7 @@ void ImportQuantStatsPass::ImportAsStatsOps(OpBuilder b, Operation *op, InsertStatsOpAtResult(b, op->getResult(index), layer_stats, axis_stats, axis); } else { - for (int i = 0; i < op->getNumResults(); ++i) { + for (int i = 0; i < static_cast(op->getNumResults()); ++i) { if (IsQuantizableResult(op, i)) { InsertStatsOpAtResult(b, op->getResult(i), layer_stats, axis_stats, axis); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 6b897bd5608..c4cf6e71cf3 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,7 +48,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (int i = 0; i < node_mins_str.size(); i++) { + for (size_t i = 0; i < node_mins_str.size(); i++) { double value; if (!absl::SimpleAtod(node_mins_str[i], &value)) { return true; @@ -60,7 +60,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (int i = 0; i < node_maxs_str.size(); i++) { + for (size_t i = 0; i < node_maxs_str.size(); i++) { double value; if (!absl::SimpleAtod(node_maxs_str[i], &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 2964a3e79f8..fc11604ef8a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -294,7 +294,7 @@ class QuantizationDriver { return; if (current_op == op) llvm::errs() << "===>>>"; llvm::errs() << op->getName() << " : ("; - for (auto i = 0; i < op->getNumOperands(); ++i) { + for (size_t i = 0; i < op->getNumOperands(); ++i) { if (auto params = GetOperandQuantState(op, i).params) params.print(llvm::errs()); else @@ -303,7 +303,7 @@ class QuantizationDriver { llvm::errs() << ","; } llvm::errs() << ") -> ("; - for (auto i = 0; i < op->getNumResults(); ++i) { + for (size_t i = 0; i < op->getNumResults(); ++i) { if (auto params = GetResultQuantState(op, i).params) params.print(llvm::errs()); else diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 3d50f280d0f..b9ca5329519 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -54,7 +54,7 @@ static Type GetQuantizedType(Builder builder, Type input_type, } else if (min.size() == max.size()) { auto shape = input_type.dyn_cast(); if (!shape || shape.getRank() <= quant_dim || - min.size() != shape.getDimSize(quant_dim)) { + static_cast(min.size()) != shape.getDimSize(quant_dim)) { return {}; } // TODO(b/141508873): the quantization dim is set to the last dimension. 
@@ -75,7 +75,7 @@ TypeAttr RescaleQuantizedType(Type input, Attribute factor) { if (auto qtype = ele_type.dyn_cast()) { ArrayRef scales = qtype.getScales(); // Broadcasting hasn't been implemented yet. - if (scales.size() != factor_values.getNumElements()) return {}; + if (static_cast(scales.size()) != factor_values.getNumElements()) return {}; SmallVector new_scales; new_scales.reserve(scales.size()); auto scales_iter = scales.begin(); @@ -269,7 +269,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight(ElementsAttr attr, int quant_dim, bool narrow_range) { Builder builder(attr.getContext()); auto shape = attr.getType().cast().getShape(); - if (shape.size() <= quant_dim) return {}; + if (static_cast(shape.size()) <= quant_dim) return {}; // `symmetric` can only be used when it is `signed` and `narrow_range`. if (symmetric && (!is_signed || !narrow_range)) return {}; @@ -334,7 +334,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( const std::vector& op_types) { if (op_types.empty()) return {}; - int axis_size = 1; + size_t axis_size = 1; int32_t quant_dim = -1; Type expressed_type; // Requires all the op types are valid UniformQuantizedTypes or @@ -368,7 +368,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( scales[index_scale.index()] *= index_scale.value(); } } else if (auto type = op_type.dyn_cast()) { - for (int index = 0; index != axis_size; ++index) { + for (size_t index = 0; index != axis_size; ++index) { scales[index] *= type.getScale(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index 797687ea658..b5a6c922707 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -41,7 +41,7 @@ std::string MakeUniqueFilename(string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. - for (int i = 0; i < name.size(); ++i) { + for (size_t i = 0; i < name.size(); ++i) { char ch = name[i]; if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' 
|| ch == '\\') { diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index 26db4549a2a..f5b895f0c76 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -49,7 +49,7 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, if (shape_x.size() == shape_y.size()) { llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0; i < shape_x.size(); i++) { + for (size_t i = 0; i < shape_x.size(); i++) { auto x_val = shape_x[i]; auto y_val = shape_y[i]; if (x_val == -1 || y_val == -1) { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index d20f1713eba..569e45912a2 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -143,7 +143,7 @@ DenseIntElementsAttr BuildConvPaddingAttrs( int rank = padding_low.size(); SmallVector padding; - for (unsigned i = 0; i < rank; ++i) { + for (unsigned i = 0; i < static_cast(rank); ++i) { padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]); padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]); } @@ -853,7 +853,7 @@ static Attribute foldConcatenateHelper(ConcatenateOp* op, auto shape = type.getShape(); size_t top_size = 1; - for (int i = 0; i < axis; i++) { + for (size_t i = 0; i < axis; i++) { top_size = top_size * shape[i]; } @@ -1118,7 +1118,7 @@ static LogicalResult Verify(MapOp op) { // increasing. auto values = op.dimensions().getValues(); auto dimensions = std::vector{values.begin(), values.end()}; - for (int i = 0; i < dimensions.size(); ++i) { + for (int i = 0; static_cast(i) < dimensions.size(); ++i) { if (dimensions[i] != i) return op.emitOpError() << "requires monotonically increasing dimension " "numbers, but got: " diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index a58179c3ee0..e33d0b6d1dc 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -42,7 +42,7 @@ Window MakeWindow(absl::Span sizes, absl::Span strides) { Window window; CHECK_EQ(sizes.size(), strides.size()); - for (auto nb = 0; nb < sizes.size(); ++nb) { + for (auto nb = 0; static_cast(nb) < sizes.size(); ++nb) { auto* dimension = window.add_dimensions(); dimension->set_size(sizes[nb]); dimension->set_stride(strides[nb]); diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 151f2367c95..ee271f1a123 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -486,18 +486,18 @@ class BatchResource : public ResourceBase { std::map> split_tensors; DCHECK_EQ(batch->task(0).context->num_outputs(), combined_outputs.size()); - if (combined_outputs.size() != batch->task(0).context->num_outputs()) { + if (static_cast(combined_outputs.size()) != batch->task(0).context->num_outputs()) { return errors::Internal("Wrong number of batched output tensors"); } // Generate 'split_tensors' and populate the context outputs. 
- for (int i = 0; i < combined_outputs.size(); ++i) { + for (size_t i = 0; i < combined_outputs.size(); ++i) { const Tensor& output_tensor = combined_outputs[i]; if (output_tensor.shape().dims() == 0) { return errors::FailedPrecondition( "Batched output tensor has 0 dimensions"); } - if (output_tensor.shape().dim_size(0) != batch->size() + padding_size) { + if (output_tensor.shape().dim_size(0) != static_cast(batch->size() + padding_size)) { return errors::FailedPrecondition( "Batched output tensor's 0th dimension does not equal the sum of " "the 0th dimension sizes of the input tensors"); diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc index a3bb1acc352..a3fd9919d6b 100644 --- a/tensorflow/core/kernels/data/prefetch_autotuner.cc +++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc @@ -40,13 +40,13 @@ void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) { case Mode::kDisabled: return; case Mode::kUpswing: - if (current_buffer_size == buffer_limit_) { + if (static_cast(current_buffer_size) == buffer_limit_) { mode_ = Mode::kDownswing; } return; case Mode::kDownswing: if (current_buffer_size == 0) { - if (buffer_limit_ >= kBufferLimitThreshold) { + if (buffer_limit_ >= static_cast(kBufferLimitThreshold)) { buffer_limit_ += kBufferLimitThreshold; } else { buffer_limit_ *= 2; diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h index 315616f3fb3..06c901967b0 100644 --- a/tensorflow/core/kernels/quantization_utils.h +++ b/tensorflow/core/kernels/quantization_utils.h @@ -268,7 +268,7 @@ inline void RequantizeManyInNewRangeReference(const qint32* input, int64 count, // that could be easily adapted for a SIMD implementation. It should also be // possible to perform all the calculations in 32-bit rather than 64, but // that's not been implemented yet. 
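// Aside: in the requantization loop below, range_scale_fp appears to be the
// rescaling factor stored as 32.32 fixed point: the product is computed in
// 64-bit and shifted right by 32 to keep the integer part. For example, a
// scale of 0.5 would be stored as 0x80000000, and (10 * 0x80000000) >> 32 == 5.
// This reading is inferred from the shift in the surrounding code, not from
// the function's documentation.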
- for (size_t index = 0; index < count; ++index) { + for (size_t index = 0; static_cast(index) < count; ++index) { const int64 input_value = static_cast(input[index]); const int64 fp_value = ((input_value * range_scale_fp) >> 32) + input_offset_fp; diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 1726c9fbc6c..45d648abcc0 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -906,7 +906,7 @@ Status S3FileSystem::MultiPartCopy(const Aws::String& source, // wait on the mutex until notify is called // then check the finished parts as there could be false notifications multi_part_copy_cv.wait(lock, [&finishedPartStates, num_parts] { - return finishedPartStates.size() == num_parts; + return static_cast(finishedPartStates.size()) == num_parts; }); } // check if there was any error for any part diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index 112c0977763..3d03fc22c16 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -130,7 +130,7 @@ void DerivedXLineBuilder::ExpandOrAddLevelEvent(const XEvent& event, } void DerivedXLineBuilder::ResetLastEvents(int level) { - for (int i = level; i < last_event_by_level_.size(); ++i) { + for (int i = level; i < static_cast(last_event_by_level_.size()); ++i) { last_event_by_level_[i] = absl::nullopt; } if (level == 0) ResetDependentLines(); diff --git a/tensorflow/core/profiler/utils/derived_timeline.h b/tensorflow/core/profiler/utils/derived_timeline.h index cd4da7996c5..92489399b8f 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.h +++ b/tensorflow/core/profiler/utils/derived_timeline.h @@ -37,7 +37,7 @@ class DerivedXLineBuilder { std::vector dependent_lines); void ExpandOrAddEvents(const std::vector& event_per_level) { - for (int level = 0; level < event_per_level.size(); ++level) { + for (size_t level = 0; level < event_per_level.size(); ++level) { ExpandOrAddLevelEvent(event_per_level[level], level); } } diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 7f5221c5391..1fe476ce79c 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -266,7 +266,7 @@ void SortXSpace(XSpace* space) { // smaller than these value. void NormalizeTimestamps(XPlane* plane, uint64 start_time_ns) { for (XLine& line : *plane->mutable_lines()) { - if (line.timestamp_ns() >= start_time_ns) { + if (line.timestamp_ns() >= static_cast(start_time_ns)) { line.set_timestamp_ns(line.timestamp_ns() - start_time_ns); } } diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h index 7bb8ea18ad3..075de84964e 100644 --- a/tensorflow/core/util/bcast.h +++ b/tensorflow/core/util/bcast.h @@ -139,7 +139,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], if (x[i] != x[0]) { all_equal = false; } - if (x[i].size() > largest_rank) { + if (static_cast(x[i].size()) > largest_rank) { largest_rank = x[i].size(); } } @@ -176,7 +176,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], // 1-extend and align all vectors. 
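// Aside: the 1-extension below is the usual broadcasting alignment -- a rank-2
// shape such as [3, 4] takes part in a rank-3 broadcast as if it were
// [1, 3, 4]. Because BCastList works on copies of the shapes that have been
// reversed to innermost-dimension-first order earlier in the constructor (not
// visible in this hunk), resize(largest_rank, 1) appends the implicit 1s in
// the right position.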
for (int i = 0; i < N; ++i) { - if (copy[i].size() < largest_rank) { + if (static_cast(copy[i].size()) < largest_rank) { copy[i].resize(largest_rank, 1); } } diff --git a/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc index 46288d2a1ed..c19ccf676c9 100644 --- a/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc +++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc @@ -52,7 +52,7 @@ namespace toco { // It then just becomes a concat along that dimension. int non_one_dims = 0; int concat_axis = 0; - for (int i = 0; i < multiples.size(); ++i) { + for (size_t i = 0; i < multiples.size(); ++i) { if (multiples[i] != 1) { ++non_one_dims; concat_axis = i; diff --git a/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc index 2b5aaea2b23..fa8a69a1e7a 100644 --- a/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc +++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc @@ -31,7 +31,7 @@ bool TransposeAffectsMemoryOrder(std::vector perm, // just the shape) then the flat buffer representation shouldn't change. std::vector old_major_index_ordering; std::vector new_major_index_ordering; - for (int i = 0; i < in_shape.size(); i++) { + for (int i = 0; static_cast(i) < in_shape.size(); i++) { if (in_shape[i] != 1) { old_major_index_ordering.push_back(i); } diff --git a/tensorflow/lite/toco/graph_transformations/dequantize.cc b/tensorflow/lite/toco/graph_transformations/dequantize.cc index cc5dddbb40e..c87c305a70d 100644 --- a/tensorflow/lite/toco/graph_transformations/dequantize.cc +++ b/tensorflow/lite/toco/graph_transformations/dequantize.cc @@ -35,7 +35,7 @@ void DequantizeBuffer(Array* array) { auto& new_data = array->GetMutableBuffer().data; new_data.resize(old_data.size()); const auto& qparams = array->GetQuantizationParams(); - for (int i = 0; i < old_data.size(); i++) { + for (size_t i = 0; i < old_data.size(); i++) { new_data[i] = qparams.scale * (old_data[i] - qparams.zero_point); } } diff --git a/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc index bb8679bced8..3a0b4d0103f 100644 --- a/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc +++ b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc @@ -45,7 +45,7 @@ namespace toco { } // Drop min/max inputs - for (int i = 1; i < fakequant_op->inputs.size(); i++) { + for (size_t i = 1; i < fakequant_op->inputs.size(); i++) { if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) { model->EraseArray(fakequant_op->inputs[i]); } diff --git a/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc index 918bb489995..ce4574cdfbf 100644 --- a/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc +++ b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc @@ -166,7 +166,7 @@ namespace toco { int index_of_previous_bad_value = 0; bool changed = false; - for (int i = 0; i < buffer_data.size(); i++) { + for (size_t i = 0; i < buffer_data.size(); i++) { if (buffer_data[i] == 0) { count_bad++; if (count_bad > 1) { 
diff --git a/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc index ba3e277f676..2c5c2cbb5f1 100644 --- a/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc +++ b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc @@ -34,7 +34,7 @@ bool IsBroadcastingOp(const Model& model, Operator* op) { // Concatenation of identical inputs is usually a broadcast. if (op->type == OperatorType::kConcatenation) { // Verify that all inputs are the same. - for (int i = 1; i < op->inputs.size(); ++i) { + for (size_t i = 1; i < op->inputs.size(); ++i) { if (op->inputs[i] != op->inputs[0]) { return false; } diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc index fa252b1a61b..a6d95ec43b1 100644 --- a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc +++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc @@ -125,7 +125,7 @@ bool CheckTwoUnidirectionalSequenceOpsAreValid( return false; // Make sure the inputs datatype matches. - for (int i = 0; i < fw_sequence_op->inputs.size(); ++i) { + for (size_t i = 0; i < fw_sequence_op->inputs.size(); ++i) { const auto& fw_input_array_name = fw_sequence_op->inputs[i]; const auto& bw_input_array_name = bw_sequence_op->inputs[i]; if (model.HasArray(fw_input_array_name) && @@ -137,7 +137,7 @@ bool CheckTwoUnidirectionalSequenceOpsAreValid( } // Make sure the outputs datatype matches. - for (int i = 0; i < fw_sequence_op->outputs.size(); ++i) { + for (size_t i = 0; i < fw_sequence_op->outputs.size(); ++i) { const auto& fw_output_array_name = fw_sequence_op->outputs[i]; const auto& bw_output_array_name = bw_sequence_op->outputs[i]; if (model.HasArray(fw_output_array_name) && diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc index 171d522daa7..4250668bcf5 100644 --- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc +++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc @@ -405,7 +405,7 @@ bool HardcodeMinMaxForPack(Model* model, Operator* op) { } const auto& first_input_minmax = first_input_array.GetMinMax(); - for (int i = 1; i < op->inputs.size(); i++) { + for (size_t i = 1; i < op->inputs.size(); i++) { const auto& input_array = model->GetArray(op->inputs[i]); if (!input_array.minmax) { return false; diff --git a/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc b/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc index 2ab6692a3a8..08894c93a5b 100644 --- a/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc +++ b/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc @@ -199,7 +199,7 @@ std::vector>::iterator FindOperator( shape_array.data_type = ArrayDataType::kInt32; auto& shape_buffer = shape_array.GetMutableBuffer(); // This is what imagined as the original shape. 
- for (int i = 0; i < imagined_original_shape.size(); ++i) { + for (size_t i = 0; i < imagined_original_shape.size(); ++i) { shape_buffer.data.push_back(imagined_original_shape.at(i)); } diff --git a/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc index 80170fe8bcb..a76ae1a0635 100644 --- a/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc @@ -70,7 +70,7 @@ std::vector ReshapeToTranspose(const Model& model, std::vector not_one_indices; // Separate into one indices and not one indices. - for (int i = 0; i < in_shape.size(); i++) { + for (size_t i = 0; i < in_shape.size(); i++) { if (in_shape[i] == 1) { one_indices.push_back(i); } else { @@ -167,7 +167,7 @@ std::vector ReshapeToTranspose(const Model& model, // Combine the permutations. const auto& transpose_perm = transpose_op->perm; - for (int i = 0; i < merged_perm.size(); i++) { + for (size_t i = 0; i < merged_perm.size(); i++) { merged_perm[i] = transpose_perm[merged_perm[i]]; } diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc index 49d59de860b..2f316934311 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc @@ -170,7 +170,7 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op, if (unsupported_op->output_data_types.size() < op->outputs.size()) { return ::tensorflow::Status::OK(); } - for (int i = 0; i < op->outputs.size(); ++i) { + for (size_t i = 0; i < op->outputs.size(); ++i) { const string& output = op->outputs[i]; const ArrayDataType data_type = unsupported_op->output_data_types[i]; model->GetArray(output).data_type = data_type; diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc index 1ed618879c1..94779f54af2 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc @@ -149,7 +149,7 @@ bool RecursivelyBackwardPropagateDataType(GraphTransformation* transformation, ArrayDataType new_data_type, const MinMax& new_minmax) { bool did_change = false; - for (int input_index = 0; input_index < op->inputs.size(); ++input_index) { + for (size_t input_index = 0; input_index < op->inputs.size(); ++input_index) { const auto& input = op->inputs[input_index]; auto& input_array = model->GetArray(input); diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 006e624eb7a..520cd8b495a 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -431,7 +431,7 @@ void ProcessTensorFlowReshapeOperator(Model* model, bool has_wildcard = false; int wildcard_index = 0; int product_non_wildcard_dims = 1; - for (int i = 0; i < shape_data.size(); i++) { + for (size_t i = 0; i < shape_data.size(); i++) { if (shape_data[i] == -1) { CHECK(!has_wildcard); has_wildcard = true; @@ -574,7 +574,7 @@ void ProcessTensorFlowReductionOperator(Model* model, Operator* op) { std::set 
true_indices; const auto& reduction_indices = reduction_indices_array.GetBuffer().data; - for (int i = 0; i < reduction_indices.size(); ++i) { + for (size_t i = 0; i < reduction_indices.size(); ++i) { const int32 reduction_index = reduction_indices[i]; if (reduction_index < -input_rank || reduction_index >= input_rank) { CHECK(false) << "Invalid reduction dimension " << reduction_index @@ -627,7 +627,7 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) { CHECK_EQ(op->begin.size(), op->size.size()); std::vector output_dims; - for (int i = 0; i < op->begin.size(); ++i) { + for (size_t i = 0; i < op->begin.size(); ++i) { int size = op->size[i]; if (size == -1) { size = input_array.shape().dims(i) - op->begin[i]; @@ -883,7 +883,7 @@ void ProcessTensorFlowSplitVOperator(Model* model, CHECK_EQ(op->outputs.size(), op->num_split); - for (int i = 0; i < op->outputs.size(); ++i) { + for (size_t i = 0; i < op->outputs.size(); ++i) { const auto& output = op->outputs[i]; Shape output_shape = input_shape; (*output_shape.mutable_dims())[axis] = size_splits_vector.at(i); @@ -1514,7 +1514,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) { std::vector& dims = *output_shape.mutable_dims(); CHECK_EQ(op->left_padding.size(), dims.size()); - for (int i = 0; i < op->left_padding.size(); ++i) { + for (size_t i = 0; i < op->left_padding.size(); ++i) { dims[i] += op->left_padding[i] + op->right_padding[i]; } @@ -1540,7 +1540,7 @@ void ProcessPadV2Operator(Model* model, PadV2Operator* op) { std::vector& dims = *output_shape.mutable_dims(); CHECK_EQ(op->left_padding.size(), dims.size()); - for (int i = 0; i < op->left_padding.size(); ++i) { + for (size_t i = 0; i < op->left_padding.size(); ++i) { dims[i] += op->left_padding[i] + op->right_padding[i]; } @@ -1683,7 +1683,7 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { CHECK_LE(op->strides.size(), num_input_axes) << "StridedSlice op with output \"" << op->outputs[0] << "\", requires no more than " << num_input_axes << " strides"; - for (int i = 0; i < op->strides.size(); i++) { + for (size_t i = 0; i < op->strides.size(); i++) { CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. 
Axis " << i << " has stride=" << op->strides[i] << "."; } @@ -1814,7 +1814,7 @@ void ProcessTransposeOperator(Model* model, TransposeOperator* op) { << "Transpose permutation input " << op->inputs[1] << " must be same length as input dimensions"; std::vector* output_dims = output_array.mutable_shape()->mutable_dims(); - for (int i = 0; i < perm.size(); i++) { + for (size_t i = 0; i < perm.size(); i++) { int axis = perm[i]; CHECK_GE(axis, 0); CHECK_LT(axis, input_shape.dimensions_count()); @@ -1856,8 +1856,8 @@ void ProcessArgMinMaxOperator(Model* model, Op* op) { std::vector output_dims; output_dims.reserve(input_dims.size() - 1); - for (int i = 0; i < input_dims.size(); ++i) { - if (i != axis) { + for (size_t i = 0; i < input_dims.size(); ++i) { + if ( static_cast(i) != axis) { output_dims.push_back(input_dims[i]); } } @@ -1938,7 +1938,7 @@ void ProcessTileOperator(Model* model, TensorFlowTileOperator* op) { auto* mutable_dims = output_array.mutable_shape()->mutable_dims(); mutable_dims->resize(multiples.size()); - for (int i = 0; i < mutable_dims->size(); ++i) { + for (size_t i = 0; i < mutable_dims->size(); ++i) { (*mutable_dims)[i] = input_shape.dims(i) * multiples[i]; } } @@ -2010,8 +2010,8 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) { std::vector output_dims; output_dims.reserve(input_dims.size() - 1); - for (int i = 0; i < input_dims.size(); ++i) { - if (i != op->axis) { + for (size_t i = 0; i < input_dims.size(); ++i) { + if ( static_cast(i) != op->axis) { output_dims.push_back(input_dims[i]); } } @@ -2399,7 +2399,7 @@ void ProcessScatterNdOperator(Model* model, ScatterNdOperator* op) { if (unsupported_op->output_shapes.size() < op->outputs.size()) { return ::tensorflow::Status::OK(); } - for (int i = 0; i < op->outputs.size(); ++i) { + for (size_t i = 0; i < op->outputs.size(); ++i) { const string& output = op->outputs[i]; model->GetArray(output).copy_shape(unsupported_op->output_shapes.at(i)); } diff --git a/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc b/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc index 6eccda04c18..1cb3a300127 100644 --- a/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc @@ -31,12 +31,12 @@ bool TransformsToIdentity(std::vector const& perm1, // perm1 is the order of the indices after first transpose. When perm1 is // reordered according to perm2, if the result is simple increasing sequence // i.e., range(0, perm1.size()), then the two transposes cancel each other. 
- for (int i = 0; i < perm1.size(); ++i) { - if (perm1[i] < 0 || perm1[i] >= perm1.size() || perm2[i] < 0 || - perm2[i] >= perm1.size()) { + for (size_t i = 0; i < perm1.size(); ++i) { + if (perm1[i] < 0 || perm1[i] >= static_cast(perm1.size()) || perm2[i] < 0 || + perm2[i] >= static_cast(perm1.size())) { return false; } - if (perm1[perm2[i]] != i) { + if (perm1[perm2[i]] != static_cast(i)) { return false; } } @@ -46,7 +46,7 @@ bool TransformsToIdentity(std::vector const& perm1, void ReplaceOpInputsWith(Model* model, const string& lookfor, const string& replacewith) { for (const auto& op : model->operators) { - for (int i = 0; i < op->inputs.size(); ++i) { + for (size_t i = 0; i < op->inputs.size(); ++i) { if (op->inputs[i] == lookfor) { op->inputs[i] = replacewith; } diff --git a/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc index bd529bd9ecd..eeb8751bf86 100644 --- a/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc +++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc @@ -82,7 +82,7 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation, // We call 'main input' the unique nonconstant input array if there is one, // or else the 0-th input. int count_nonconstant_input_arrays = 0; - for (int i = 0; i < passthru_op->inputs.size(); i++) { + for (size_t i = 0; i < passthru_op->inputs.size(); i++) { if (!model->GetArray(passthru_op->inputs[i]).buffer) { count_nonconstant_input_arrays++; if (count_nonconstant_input_arrays == 1) { diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc index 17a5e9a1d6a..38edff76d55 100644 --- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc +++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc @@ -127,9 +127,9 @@ bool IsMoveOperator(OperatorType optype) { move_op->outputs[0] = output_name; } else { // The intermediate array is now the output array. - for (int i = 0; i < model->operators.size(); i++) { + for (size_t i = 0; i < model->operators.size(); i++) { Operator* consumer = model->operators[i].get(); - for (int j = 0; j < consumer->inputs.size(); j++) { + for (size_t j = 0; j < consumer->inputs.size(); j++) { if (consumer->inputs[j] == output_name) { consumer->inputs[j] = intermediate_name; } diff --git a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc index 0fbcf9f73b1..b2d184cdc31 100644 --- a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc @@ -60,7 +60,7 @@ std::vector ComputeNewPerm(std::vector input_dims, std::vector perm) { // These are the major axis of the input. std::vector input_indices; - for (int i = 0; i < input_dims.size(); i++) { + for (size_t i = 0; i < input_dims.size(); i++) { if (input_dims[i] != 1) { input_indices.push_back(i); } @@ -69,7 +69,7 @@ std::vector ComputeNewPerm(std::vector input_dims, // This maps which indices of the input produced the intermediate indices for // non-unary dimensions. 
std::unordered_map intermediate_to_input_indices_map; - for (int i = 0; i < intermediate_dims.size(); i++) { + for (size_t i = 0; i < intermediate_dims.size(); i++) { if (intermediate_dims[i] != 1) { intermediate_to_input_indices_map[i] = input_indices[intermediate_to_input_indices_map.size()]; @@ -80,14 +80,14 @@ std::vector ComputeNewPerm(std::vector input_dims, // major indices. std::vector new_perm; new_perm.reserve(input_dims.size()); - for (int i = 0; i < perm.size(); i++) { + for (size_t i = 0; i < perm.size(); i++) { if (intermediate_dims[perm[i]] == 1) continue; new_perm.push_back(intermediate_to_input_indices_map[perm[i]]); } // Fill the rest of the transpose in with the ones. - for (int index = 0; index < input_dims.size(); index++) { + for (size_t index = 0; index < input_dims.size(); index++) { if (input_dims[index] == 1) { new_perm.push_back(index); } @@ -193,9 +193,9 @@ std::vector ComputeNewPerm(std::vector input_dims, DeleteArrayIfUnused(intermediate_name, model); } else { // The intermediate array is now the output array. - for (int i = 0; i < model->operators.size(); i++) { + for (size_t i = 0; i < model->operators.size(); i++) { Operator* consumer = model->operators[i].get(); - for (int j = 0; j < consumer->inputs.size(); j++) { + for (size_t j = 0; j < consumer->inputs.size(); j++) { if (consumer->inputs[j] == output_name) { consumer->inputs[j] = intermediate_name; } diff --git a/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc index 6e5815ee94d..545c53fb31a 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc @@ -124,11 +124,11 @@ namespace toco { const auto& offset_float_data = offset_array.GetBuffer().data; - CHECK(mul_float_data.size() == buffer_size); - CHECK(add_float_data.size() == buffer_size); - CHECK(mean_float_data.size() == buffer_size); - CHECK(multiplier_float_data.size() == buffer_size); - CHECK(offset_float_data.size() == buffer_size); + CHECK(static_cast(mul_float_data.size()) == buffer_size); + CHECK(static_cast(add_float_data.size()) == buffer_size); + CHECK(static_cast(mean_float_data.size()) == buffer_size); + CHECK(static_cast(multiplier_float_data.size()) == buffer_size); + CHECK(static_cast(offset_float_data.size()) == buffer_size); for (int i = 0; i < buffer_size; i++) { mul_float_data[i] = multiplier_float_data[i]; diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc index 7c9aa025f64..20e805a29e0 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc @@ -64,7 +64,7 @@ void CopyTensorSegments(const std::vector& input_arrays, // Copy the data from input_arrays to concatenated_array_buffer. 
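// Aside: the interleaved copy below is what concatenation along a
// non-outermost axis looks like in flat memory. For two row-major 2x2 inputs
// A and B concatenated on axis 1, copy step 0 appends A's row 0 then B's
// row 0, and copy step 1 appends A's row 1 then B's row 1, producing
// {a00, a01, b00, b01, a10, a11, b10, b11} for the 2x4 result.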
T* dest_ptr = concatenated_array_buffer.data(); for (int s = 0; s < total_copy_steps; s++) { - for (int i = 0; i < input_arrays.size(); i++) { + for (size_t i = 0; i < input_arrays.size(); i++) { std::copy(src_ptr[i], src_ptr[i] + array_copy_size[i], dest_ptr); src_ptr[i] += array_copy_size[i]; dest_ptr += array_copy_size[i]; diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc index 0df35509d3d..c6dc093ba00 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc @@ -36,7 +36,7 @@ void Pack(Model* model, PackOperator const& op) { // Pack inputs into buffer CHECK_EQ(op.axis, 0) << "Packing only supported along first axis"; int dst_offset = 0; - for (int i = 0; i < op.inputs.size(); i++) { + for (size_t i = 0; i < op.inputs.size(); i++) { // Append array data to output for each input array const auto& input_array = model->GetArray(op.inputs[i]); int input_size = RequiredBufferSizeForShape(input_array.shape()); diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc index fd71fb1873a..34a1a1ce899 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc @@ -50,7 +50,7 @@ bool Slice(SliceOperator const& op, Array const& input_array, CHECK_LE(size.size(), 4); std::vector begin = op.begin; std::vector end; - for (int i = 0; i < begin.size(); ++i) { + for (size_t i = 0; i < begin.size(); ++i) { int dim_size = size[i]; if (dim_size == -1) { // -1 means the rest of the dimension. diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc index 7ceffe6307e..a822f7b79e3 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc @@ -40,7 +40,7 @@ void Transpose(Model* model, const Array& input_array, CHECK(input_shape.dimensions_count() == output_shape.dimensions_count()); const int dim = input_shape.dimensions_count(); CHECK_LE(dim, 4); - CHECK(perm.size() >= dim); + CHECK(static_cast(perm.size()) >= dim); for (int i = 0; i < dim; i++) { CHECK(perm[i] >= 0 && perm[i] < dim); CHECK(input_shape.dims(perm[i]) == output_shape.dims(i)); diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc index 197e17eee16..4d6cd188729 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc @@ -62,7 +62,7 @@ void ReduceGeneric(bool keep_dims, const std::vector& axes, } std::vector output_indices(input_shape.dimensions_count()); - for (int input_offset = 0; input_offset < input.size(); ++input_offset) { + for (size_t input_offset = 0; input_offset < input.size(); ++input_offset) { std::vector input_indices = ReverseOffset(input_shape, input_offset); // Calculate the output location by squashing input indices to 0 // in reduced axes. 
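The "squashing" mentioned in the comment above maps each input coordinate to the output coordinate it accumulates into by forcing every reduced axis to 0 (reduced axes keep size 1 in the intermediate, keep_dims-style output, so index 0 is the only valid position there). A minimal sketch with illustrative names, not the toco implementation:

    #include <set>
    #include <vector>

    // Zero out the reduced axes of an input coordinate to get the output slot
    // it accumulates into.
    std::vector<int> SquashReducedAxes(std::vector<int> indices,
                                       const std::set<int>& reduced_axes) {
      for (int axis : reduced_axes) indices[axis] = 0;
      return indices;
    }
    // With reduced_axes = {1}: {2, 5, 3} -> {2, 0, 3}, so every element along
    // axis 1 at that position is added into the same output element.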
@@ -319,7 +319,7 @@ bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) { } else if (unary_op->type == OperatorType::kRelu6 || unary_op->type == OperatorType::kRelu1 || unary_op->type == OperatorType::kRelu) { - for (size_t i = 0; i < output_buffer_size; ++i) { + for (int i = 0; i < output_buffer_size; ++i) { const float value = (*input_float_data)[i]; float new_value = 0.0f; switch (unary_op->type) { diff --git a/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc index 1f7035c21e2..84d5922aae8 100644 --- a/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc +++ b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc @@ -57,10 +57,10 @@ namespace toco { // Split up the DynamicStitch inputs into the indices and data. std::vector stitch_indices_inputs; std::vector stitch_data_inputs; - for (size_t i = 0; i < stitch_op->num_partitions; ++i) { + for (int i = 0; i < stitch_op->num_partitions; ++i) { stitch_indices_inputs.push_back(stitch_op->inputs[i]); } - for (size_t i = stitch_op->num_partitions; i < stitch_op->num_partitions * 2; + for (int i = stitch_op->num_partitions; i < stitch_op->num_partitions * 2; ++i) { stitch_data_inputs.push_back(stitch_op->inputs[i]); } diff --git a/tensorflow/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc index 2434481272f..351884fbf1e 100644 --- a/tensorflow/lite/toco/model_cmdline_flags.cc +++ b/tensorflow/lite/toco/model_cmdline_flags.cc @@ -263,7 +263,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector mean_values = absl::StrSplit(parsed_model_flags.mean_values.value(), ','); - QCHECK(mean_values.size() == model_flags->input_arrays_size()); + QCHECK(static_cast(mean_values.size()) == model_flags->input_arrays_size()); for (size_t i = 0; i < mean_values.size(); ++i) { char* last = nullptr; model_flags->mutable_input_arrays(i)->set_mean_value( @@ -280,7 +280,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector std_values = absl::StrSplit(parsed_model_flags.std_values.value(), ','); - QCHECK(std_values.size() == model_flags->input_arrays_size()); + QCHECK( static_cast(std_values.size()) == model_flags->input_arrays_size()); for (size_t i = 0; i < std_values.size(); ++i) { char* last = nullptr; model_flags->mutable_input_arrays(i)->set_std_value( @@ -298,7 +298,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector input_data_types = absl::StrSplit(parsed_model_flags.input_data_types.value(), ','); - QCHECK(input_data_types.size() == model_flags->input_arrays_size()); + QCHECK(static_cast(input_data_types.size()) == model_flags->input_arrays_size()); for (size_t i = 0; i < input_data_types.size(); ++i) { IODataType type; QCHECK(IODataType_Parse(input_data_types[i], &type)); @@ -321,7 +321,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector input_shapes = absl::StrSplit(parsed_model_flags.input_shapes.value(), ':'); - QCHECK(input_shapes.size() == model_flags->input_arrays_size()); + QCHECK(static_cast(input_shapes.size()) == model_flags->input_arrays_size()); for (size_t i = 0; i < input_shapes.size(); ++i) { auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape(); shape->clear_dims(); diff --git a/tensorflow/lite/toco/toco_cmdline_flags.cc b/tensorflow/lite/toco/toco_cmdline_flags.cc index c133db8f2a4..9697a1ecbbd 
100644 --- a/tensorflow/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/lite/toco/toco_cmdline_flags.cc @@ -320,7 +320,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, std::vector input_types = absl::StrSplit(parsed_toco_flags.input_types.value(), ','); QCHECK(!input_types.empty()); - for (int i = 1; i < input_types.size(); i++) { + for (size_t i = 1; i < input_types.size(); i++) { QCHECK_EQ(input_types[i], input_types[0]); } toco::IODataType input_type; From ceed71957998d0d69c1a47e21647461282b60b48 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Thu, 11 Jun 2020 11:13:15 -0700 Subject: [PATCH 0133/1390] Change variable names --- tensorflow/core/kernels/relu_op_gpu.cu.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index ca1a4235f3a..b17303238f4 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,7 +35,7 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -static constexpr int VectorSize = 8; +static constexpr int VectorSizeElements = 8; namespace functor { // This kernel computes ReluGrad by processing one half2, two fp16, at a time. @@ -98,10 +98,11 @@ __global__ void ReluGradHalfKernelVector( const Eigen::half* __restrict__ gradient, const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop, int32 count) { - int32 half8_count = count / VectorSize; + int32 half8_count = count / VectorSizeElements; int32 index = blockIdx.x * blockDim.x + threadIdx.x; if (index < half8_count) { + // Cast to xx_h8 for vector load and store. float4 gradient_h8 = reinterpret_cast<const float4*>(gradient)[index]; float4 feature_h8 = reinterpret_cast<const float4*>(feature)[index]; float4* p_backprop_h8 = reinterpret_cast<float4*>(backprop) + index; @@ -115,7 +116,7 @@ __global__ void ReluGradHalfKernelVector( #if __CUDA_ARCH__ >= 530 const half2 kZeroH2 = __float2half2_rn(0.f); #endif - for (int i = 0; i < VectorSize / 2; i++) { + for (int i = 0; i < VectorSizeElements / 2; i++) { #if __CUDA_ARCH__ >= 530 // mask = (feature > 0) half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2); @@ -136,19 +137,19 @@ __global__ void ReluGradHalfKernelVector( *p_backprop_h8 = backprop_h8; } - int remaining_count = (count % VectorSize); + int remaining_count = (count % VectorSizeElements); if (index < remaining_count) { // Use first threads to process the remaining elements. - Eigen::half grad_h = gradient[half8_count * VectorSize + index]; - Eigen::half feature_h = feature[half8_count * VectorSize + index]; + Eigen::half grad_h = gradient[half8_count * VectorSizeElements + index]; + Eigen::half feature_h = feature[half8_count * VectorSizeElements + index]; float grad_f = static_cast<float>(grad_h); float feature_f = static_cast<float>(feature_h); float backprop_f = (feature_f > 0) ?
grad_f : 0; Eigen::half backprop_h(backprop_f); - backprop[half8_count * VectorSize + index] = backprop_h; + backprop[half8_count * VectorSizeElements + index] = backprop_h; } } @@ -176,7 +177,7 @@ struct ReluGrad { constexpr int32 kThreadInBlock = 512; if (count == 0) return; if (aligned) { - int32 half8_count = Eigen::divup(count, VectorSize); + int32 half8_count = Eigen::divup(count, VectorSizeElements); int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); TF_CHECK_OK(GpuLaunchKernel( ReluGradHalfKernelVector, kBlock, kThreadInBlock, From 013ddd96d0ce111ca5ec1422b2899b66ec41a036 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 9 Jun 2020 15:35:47 -0500 Subject: [PATCH 0134/1390] Use CUDNN_TENSOR_OP_MATH to enable tensor cores. --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index fa06d410323..28ec6a842bb 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -723,7 +723,7 @@ class CudnnConvolutionDescriptor { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = #if CUDNN_VERSION >= 8000 - (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH); + (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH); #else (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); #endif @@ -1179,8 +1179,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { cudnnMathType_t math_type; if (use_tensor_ops) { - math_type = - CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH; + math_type = CUDNN_TENSOR_OP_MATH; } else { math_type = CUDNN_VERSION >= 8000 ? CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH; } From da0d85808e77aa62287ba22e822a9e83866a43a4 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Thu, 11 Jun 2020 15:35:04 -0500 Subject: [PATCH 0135/1390] Make python names consistent --- tensorflow/python/framework/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index a356e6d9a16..1ff2fa613da 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -34,7 +34,7 @@ def tensor_float32_execution_allowed(): return _pywrap_tf32_execution.is_allowed() # No tf_export until TF is built against CUDA11 which is required for TF32. -def allow_tensor_float_32_execution(allowed): +def allow_tensor_float32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. From d03b86fe4462ebad1f73d460c2aceab47372b239 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Thu, 11 Jun 2020 23:05:40 +0100 Subject: [PATCH 0136/1390] Extended test to the case when new converter is enabled. 
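Stepping back to the cuda_dnn.cc change in [PATCH 0134/1390] above: after that patch, the math mode requested from cuDNN no longer depends on the cuDNN version when tensor-op math is wanted. Distilled into a standalone helper below; the function name is illustrative, the enum values come from cudnn.h, and a CUDA/cuDNN build environment is assumed:

    #include <cudnn.h>

    // Math mode selection for convolutions after the patch above: tensor-op
    // math is requested explicitly whenever the caller asked for it; the
    // non-tensor-op fallback is CUDNN_FMA_MATH on cuDNN 8+ (where
    // CUDNN_DEFAULT_MATH may itself enable TF32 on Ampere-class GPUs) and
    // CUDNN_DEFAULT_MATH on older cuDNN versions.
    cudnnMathType_t SelectConvMathType(bool use_tensor_op_math) {
    #if CUDNN_VERSION >= 8000
      return use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH;
    #else
      return use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
    #endif
    }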
Change-Id: I83f20d025027ad1266f99f9d79932cab4f1a9ed5 --- tensorflow/lite/python/lite_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index e6661c82894..478840c5549 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -891,7 +891,11 @@ class FromSessionTest(TestModels, parameterized.TestCase): ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.\ EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], - False)) + False), + ('UseTfliteBuiltinsInt16EnableMLIR', + [lite.OpsSet.\ + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + True)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From 4bd42cdd0e68cebe8b280b323cde4d01a9d2bf3a Mon Sep 17 00:00:00 2001 From: Reed Date: Thu, 11 Jun 2020 17:16:28 -0700 Subject: [PATCH 0137/1390] Use float_32 instead of float32 in function names --- tensorflow/python/framework/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 1ff2fa613da..bbaa2ca8248 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -25,7 +25,7 @@ from tensorflow.python.util.tf_export import tf_export # No tf_export until TF is built against CUDA11 which is required for TF32. -def tensor_float32_execution_allowed(): +def tensor_float_32_execution_allowed(): """Get if TensorFloat-32 operations are enabled on supported hardware. Returns: @@ -34,7 +34,7 @@ def tensor_float32_execution_allowed(): return _pywrap_tf32_execution.is_allowed() # No tf_export until TF is built against CUDA11 which is required for TF32. -def allow_tensor_float32_execution(allowed): +def allow_tensor_float_32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. 
From 77e46ebf9745d798863b5c7ed26d6bf077700008 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 30 Mar 2020 17:25:38 -0700 Subject: [PATCH 0138/1390] conv3d ndhwc plumbing --- tensorflow/core/kernels/conv_grad_ops_3d.cc | 148 ++++++++++++++++---- tensorflow/core/kernels/conv_ops_3d.cc | 88 +++++++++--- 2 files changed, 193 insertions(+), 43 deletions(-) diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index 2183d0e0885..fc2d58ec94f 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -50,6 +50,7 @@ using stream_executor::dnn::DimIndex; #include "tensorflow/stream_executor/gpu/gpu_asm_opts.h" #include "tensorflow/stream_executor/gpu/redzone_allocator.h" #include "tensorflow/stream_executor/tf_allocator_adapter.h" +#include "third_party/gpus/cudnn/cudnn.h" #endif // GOOGLE_CUDA namespace { @@ -1264,26 +1265,56 @@ class Conv3DBackpropInputOp : public OpKernel { CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0) << "Negative paddings: (" << padding_rows << ", " << padding_cols << ", " << padding_planes << ")"; + +#if GOOGLE_CUDA + const bool compute_in_nhwc = CUDNN_VERSION >= 8000 && + DataTypeToEnum::value == DT_HALF; +#else + // fast NDHWC implementation is a CUDA only feature + const bool compute_in_nhwc = false; +#endif + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format_ == FORMAT_NHWC) ? FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv3DBackpropInput with cuDNN:" + << " data_format=" << ToString(data_format_) + << " compute_data_format=" << ToString(compute_data_format); + + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? 
kComputeInNHWC : kComputeInNCHW; + se::dnn::BatchDescriptor input_desc(3); input_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, compatible_input_shape.dim_size(4)) .set_spatial_dim(DimIndex::Y, compatible_input_shape.dim_size(3)) .set_spatial_dim(DimIndex::Z, compatible_input_shape.dim_size(2)) .set_feature_map_count(dims.in_depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::BatchDescriptor output_desc(3); output_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, dims.output_size(2)) .set_spatial_dim(DimIndex::Y, dims.output_size(1)) .set_spatial_dim(DimIndex::Z, dims.output_size(0)) .set_feature_map_count(dims.out_depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::FilterDescriptor filter_desc(3); filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) .set_input_feature_map_count(filter_shape.dim_size(3)) - .set_output_feature_map_count(filter_shape.dim_size(4)); + .set_output_feature_map_count(filter_shape.dim_size(4)) + .set_layout(filter_layout); se::dnn::ConvolutionDescriptor conv_desc(3); conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) .set_dilation_rate(DimIndex::Y, dims.dilation(1)) @@ -1298,21 +1329,33 @@ class Conv3DBackpropInputOp : public OpKernel { // Shape: out, in, z, y, x. Tensor transformed_filter; + auto dst_format = + compute_data_format == FORMAT_NCHW ? FORMAT_OIHW: FORMAT_OHWI; + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? TensorShape({filter_shape.dim_size(4), filter_shape.dim_size(3), + dims.filter_size(0), + dims.filter_size(1), + dims.filter_size(2)}) + : TensorShape({filter_shape.dim_size(4), + dims.filter_size(0), + dims.filter_size(1), + dims.filter_size(2), + filter_shape.dim_size(3)}); OP_REQUIRES_OK( context, context->allocate_temp( DataTypeToEnum::value, - TensorShape({filter_shape.dim_size(4), - filter_shape.dim_size(3), dims.filter_size(0), - dims.filter_size(1), dims.filter_size(2)}), + dst_shape, &transformed_filter)); + functor::TransformFilter()( - context->eigen_device(), FORMAT_OIHW, + context->eigen_device(), dst_format, To32Bit(filter.tensor()), To32Bit(transformed_filter.tensor())); // Shape: batch, filters, z, y, x. 
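// Aside: the dst_shape computation above is dimension bookkeeping. TensorFlow
// stores 3-D convolution filters as [kz, ky, kx, in, out]; the cuDNN call wants
// [out, in, kz, ky, kx] when computing in NCDHW and [out, kz, ky, kx, in] when
// computing in NDHWC (TransformFilter also permutes the data accordingly).
// A self-contained sketch of the shape mapping, with illustrative names only:
#include <array>
#include <cstdint>
std::array<int64_t, 5> DstFilterShape(const std::array<int64_t, 5>& f,
                                      bool compute_in_ncdhw) {
  const int64_t kz = f[0], ky = f[1], kx = f[2], ci = f[3], co = f[4];
  return compute_in_ncdhw ? std::array<int64_t, 5>{co, ci, kz, ky, kx}
                          : std::array<int64_t, 5>{co, kz, ky, kx, ci};
}
// Example: a 3x3x3 filter with 16 input and 32 output channels,
// {3, 3, 3, 16, 32}, becomes {32, 16, 3, 3, 3} for NCDHW compute.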
Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { + if (data_format_ == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { TensorShape nchw_shape = {dims.batch_size, dims.out_depth, dims.output_size(0), dims.output_size(1), dims.output_size(2)}; @@ -1333,8 +1376,15 @@ class Conv3DBackpropInputOp : public OpKernel { Tensor pre_transformed_in_backprop; OP_REQUIRES_OK( context, - context->allocate_temp(DataTypeToEnum::value, compatible_input_shape, - &pre_transformed_in_backprop)); + context->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat( + compute_data_format, compatible_input_shape.dim_size(0), + {{compatible_input_shape.dim_size(2), + compatible_input_shape.dim_size(3), + compatible_input_shape.dim_size(4)}}, + compatible_input_shape.dim_size(1)), + &pre_transformed_in_backprop)); auto out_backprop_ptr = AsDeviceMemory(transformed_out_backprop.template flat().data(), @@ -1355,7 +1405,7 @@ class Conv3DBackpropInputOp : public OpKernel { dims.batch_size, dims.in_depth, {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, - FORMAT_NCHW, + compute_data_format, dims.out_depth, {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, @@ -1500,8 +1550,11 @@ class Conv3DBackpropInputOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp( DataTypeToEnum::value, - {dims.batch_size, dims.in_depth, dims.input_size(0), - dims.input_size(1), dims.input_size(2)}, + ShapeFromFormat( + compute_data_format, dims.batch_size, + {{dims.input_size(0), dims.input_size(1), + dims.input_size(2)}}, + dims.in_depth), &in_backprop_remove_padding)); // Remove the padding for odd spatial dimensions. @@ -1510,12 +1563,13 @@ class Conv3DBackpropInputOp : public OpKernel { To32Bit(const_cast(pre_transformed_in_backprop) .tensor()), {{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}}, - To32Bit(in_backprop_remove_padding.tensor()), FORMAT_NCHW); + To32Bit(in_backprop_remove_padding.tensor()), + compute_data_format); pre_transformed_in_backprop = in_backprop_remove_padding; } - if (data_format_ == FORMAT_NHWC) { + if (data_format_ == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; functor::NCHWToNHWC()( context->eigen_device(), @@ -1723,6 +1777,35 @@ class Conv3DBackpropFilterOp : public OpKernel { CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0) << "Negative paddings: (" << padding_rows << ", " << padding_cols << ", " << padding_planes << ")"; + +#if GOOGLE_CUDA + const bool compute_in_nhwc = CUDNN_VERSION >= 8000 && + DataTypeToEnum::value == DT_HALF; +#else + // fast NDHWC implementation is a CUDA only feature + const bool compute_in_nhwc = false; +#endif + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format_ == FORMAT_NHWC) ? FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv3DBackpropFilter with cuDNN:" + << " data_format=" << ToString(data_format_) + << " compute_data_format=" << ToString(compute_data_format); + + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? 
kComputeInNHWC : kComputeInNCHW; + se::dnn::BatchDescriptor input_desc(3); input_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, @@ -1732,20 +1815,21 @@ class Conv3DBackpropFilterOp : public OpKernel { .set_spatial_dim(DimIndex::Z, GetTensorDim(compatible_input, data_format_, '0')) .set_feature_map_count(dims.in_depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::BatchDescriptor output_desc(3); output_desc.set_count(dims.batch_size) .set_spatial_dim(DimIndex::X, dims.output_size(2)) .set_spatial_dim(DimIndex::Y, dims.output_size(1)) .set_spatial_dim(DimIndex::Z, dims.output_size(0)) .set_feature_map_count(dims.out_depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::FilterDescriptor filter_desc(3); filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) .set_input_feature_map_count(filter_shape.dim_size(3)) - .set_output_feature_map_count(filter_shape.dim_size(4)); + .set_output_feature_map_count(filter_shape.dim_size(4)) + .set_layout(filter_layout); se::dnn::ConvolutionDescriptor conv_desc(3); conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) .set_dilation_rate(DimIndex::Y, dims.dilation(1)) @@ -1757,17 +1841,30 @@ class Conv3DBackpropFilterOp : public OpKernel { .set_zero_padding(DimIndex::Y, padding_rows / 2) .set_zero_padding(DimIndex::Z, padding_planes / 2) .set_group_count(dims.in_depth / filter_shape.dim_size(3)); + Tensor pre_transformed_filter_backprop; + auto dst_format = + compute_data_format == FORMAT_NCHW ? FORMAT_OIHW: FORMAT_OHWI; + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? TensorShape({filter_shape.dim_size(4), filter_shape.dim_size(3), + dims.filter_size(0), + dims.filter_size(1), + dims.filter_size(2)}) + : TensorShape({filter_shape.dim_size(4), + dims.filter_size(0), + dims.filter_size(1), + dims.filter_size(2), + filter_shape.dim_size(3)}); OP_REQUIRES_OK( context, context->allocate_temp( DataTypeToEnum::value, - TensorShape({filter_shape.dim_size(4), - filter_shape.dim_size(3), dims.filter_size(0), - dims.filter_size(1), dims.filter_size(2)}), + dst_shape, &pre_transformed_filter_backprop)); Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { + if (data_format_ == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the `out_backprop` tensor from NDHWC to NCDHW."; TensorShape nchw_shape = {dims.batch_size, dims.out_depth, dims.output_size(0), dims.output_size(1), dims.output_size(2)}; @@ -1785,7 +1882,8 @@ class Conv3DBackpropFilterOp : public OpKernel { transformed_out_backprop = out_backprop; } Tensor transformed_input; - if (data_format_ == FORMAT_NHWC) { + if (data_format_ == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the `input` tensor from NDHWC to NCDHW."; TensorShape nchw_shape = { dims.batch_size, dims.in_depth, compatible_input.dim_size(1), compatible_input.dim_size(2), compatible_input.dim_size(3)}; @@ -1823,7 +1921,7 @@ class Conv3DBackpropFilterOp : public OpKernel { dims.batch_size, dims.in_depth, {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, - FORMAT_NCHW, + compute_data_format, dims.out_depth, {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, @@ -1947,7 +2045,7 @@ class Conv3DBackpropFilterOp : public OpKernel { auto toConstTensor = [](const Tensor& 
x) -> const Tensor { return x; }; functor::ReverseTransformFilter()( - context->eigen_device(), /*src_filter_format=*/FORMAT_OIHW, + context->eigen_device(), /*src_filter_format=*/dst_format, toConstTensor(pre_transformed_filter_backprop).template tensor(), filter_backprop->tensor()); } diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 69e6fba4192..c1fe6c690cd 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -43,6 +43,7 @@ using stream_executor::dnn::DimIndex; #include "tensorflow/stream_executor/gpu/asm_compiler.h" #include "tensorflow/stream_executor/gpu/redzone_allocator.h" #include "tensorflow/stream_executor/tf_allocator_adapter.h" +#include "third_party/gpus/cudnn/cudnn.h" #endif // GOOGLE_CUDA namespace tensorflow { @@ -201,7 +202,23 @@ struct LaunchConvOp { } } - if (data_format == FORMAT_NHWC) { +#if GOOGLE_CUDA + const bool compute_in_nhwc = CUDNN_VERSION >= 8000 && + DataTypeToEnum::value == DT_HALF; +#else + // fast NHWC implementation is a CUDA only feature + const bool compute_in_nhwc = false; +#endif + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv3D with cuDNN:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); + + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the input tensor from NDHWC to NCDHW."; const TensorShape nchw_shape = ShapeFromFormat( FORMAT_NCHW, in_batch, {{in_planes, in_rows, in_cols}}, in_depth); if (in_depth > 1) { @@ -219,8 +236,26 @@ struct LaunchConvOp { } else { CHECK(input.CopyFrom(input, nchw_shape)); } + } else { + CHECK(data_format == compute_data_format) // Crash OK + << "Illegal data and compute format pair:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); } + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? 
kComputeInNHWC : kComputeInNCHW; + CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0) << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", " << pad_planes << ")"; @@ -230,20 +265,21 @@ struct LaunchConvOp { .set_spatial_dim(DimIndex::X, in_cols) .set_spatial_dim(DimIndex::Y, in_rows) .set_spatial_dim(DimIndex::Z, in_planes) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::BatchDescriptor output_desc(3); output_desc.set_count(in_batch) .set_spatial_dim(DimIndex::X, out_cols) .set_spatial_dim(DimIndex::Y, out_rows) .set_spatial_dim(DimIndex::Z, out_planes) .set_feature_map_count(out_depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::FilterDescriptor filter_desc(3); filter_desc.set_spatial_dim(DimIndex::X, filter_cols) .set_spatial_dim(DimIndex::Y, filter_rows) .set_spatial_dim(DimIndex::Z, filter_planes) .set_input_feature_map_count(filter_depth) - .set_output_feature_map_count(out_depth); + .set_output_feature_map_count(out_depth) + .set_layout(filter_layout); se::dnn::ConvolutionDescriptor conv_desc(3); conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) .set_dilation_rate(DimIndex::Y, dilations[1]) @@ -257,25 +293,42 @@ struct LaunchConvOp { .set_group_count(in_depth / filter_depth); Tensor transformed_filter; + auto dst_format = + compute_data_format == FORMAT_NCHW ? FORMAT_OIHW: FORMAT_OHWI; + VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) + << " to " << ToString(dst_format); + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? TensorShape({filter.dim_size(4), filter.dim_size(3), + filter.dim_size(0), filter.dim_size(1), + filter.dim_size(2)}) + : TensorShape({filter.dim_size(4), filter.dim_size(0), + filter.dim_size(1), filter.dim_size(2), + filter.dim_size(3)}); OP_REQUIRES_OK( ctx, ctx->allocate_temp(DataTypeToEnum::value, - TensorShape({out_depth, in_depth, filter_planes, - filter_rows, filter_cols}), + dst_shape, &transformed_filter)); // filter: [x, y, z, in, out] - // t_filter: [out, in, x, y, z] + // t_filter: [out, in, x, y, z] (NCDHW) or + // t_filter: [out, x, y, z, in] (NDHWC) functor::TransformFilter()( - ctx->eigen_device(), FORMAT_OIHW, + ctx->eigen_device(), dst_format, To32Bit(filter.tensor()), To32Bit(transformed_filter.tensor())); Tensor transformed_output; - OP_REQUIRES_OK( - ctx, ctx->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat(FORMAT_NCHW, in_batch, - {{out_planes, out_rows, out_cols}}, out_depth), - &transformed_output)); + if (data_format != compute_data_format) { + VLOG(4) << "Allocate temporary memory for output in compute data format"; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(FORMAT_NCHW, in_batch, + {{out_planes, out_rows, out_cols}}, out_depth), + &transformed_output)); + } else { + transformed_output = *output; + } auto input_ptr = AsDeviceMemory(input.template flat().data(), input.template flat().size()); @@ -295,7 +348,7 @@ struct LaunchConvOp { in_batch, in_depth, {{in_planes, in_rows, in_cols}}, - FORMAT_NCHW, + compute_data_format, out_depth, {{filter_planes, filter_rows, filter_cols}}, {{dilations[0], dilations[1], dilations[2]}}, @@ -455,15 +508,14 @@ struct LaunchConvOp { ") filter shape(", filter.shape().DebugString(), ")")); } - if (data_format == FORMAT_NHWC) { + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the output tensor back from NCDHW to NDHWC."; // t_output: [b, out, x, y, z] // output: 
[b, x, y, z, out] functor::NCHWToNHWC()( ctx->eigen_device(), const_cast(transformed_output).tensor(), output->tensor()); - } else { - *output = transformed_output; } } }; From 744010d48dbe7c3877fec417797fb128882b6ec5 Mon Sep 17 00:00:00 2001 From: Steenu Johnson Date: Fri, 12 Jun 2020 11:39:54 +0530 Subject: [PATCH 0139/1390] Forward compatiblity and backwards compatibility changes. Changing forward compatibility date to be 3 weeks from the current commit. Using V2 version when user specifies exclude_cols. Making op_version in kernel more specific to CSVDatasetV2. Serialization done only for version 2 op. Removing clashes from the previous commit and fixing indentation errors. Signed-off-by: Steenu Johnson --- .../data/experimental/csv_dataset_op.cc | 81 +++++++++++-------- .../python/data/experimental/ops/readers.py | 2 +- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc index 906e57066a2..b1c5937747d 100644 --- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc @@ -30,7 +30,7 @@ class CSVDatasetOp : public DatasetOpKernel { public: explicit CSVDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx), - op_version_(ctx->def().op() == "CSVDataset" ? 1 : 2) { + op_version_(ctx->def().op() == "CSVDatasetV2" ? 2 : 1) { OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); } @@ -64,18 +64,17 @@ class CSVDatasetOp : public DatasetOpKernel { OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, errors::InvalidArgument("`select_cols` must be a vector.")); -std::vector exclude_cols; -if (op_version_ > 1) { - const Tensor* exclude_cols_tensor; - OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); - exclude_cols.reserve(exclude_cols_tensor->NumElements()); - for (int i = 0; i < exclude_cols_tensor->NumElements(); ++i) { - exclude_cols.push_back(exclude_cols_tensor->flat()(i)); - } -} - - OP_REQUIRES(ctx, exclude_cols_tensor->dims() == 1, - errors::InvalidArgument("`exclude_cols` must be a vector")); + std::vector exclude_cols; + if (op_version_ > 1) { + const Tensor* exclude_cols_tensor; + OP_REQUIRES_OK(ctx, ctx->input("exclude_cols", &exclude_cols_tensor)); + OP_REQUIRES(ctx, exclude_cols_tensor->dims() == 1, + errors::InvalidArgument("`exclude_cols` must be a vector")); + exclude_cols.reserve(exclude_cols_tensor->NumElements()); + for (int i = 0; i < exclude_cols_tensor->NumElements(); ++i) { + exclude_cols.push_back(exclude_cols_tensor->flat()(i)); + } + } int64 buffer_size = 0; OP_REQUIRES_OK( @@ -141,11 +140,6 @@ if (op_version_ > 1) { ctx, select_cols.empty() || select_cols.front() >= 0, errors::InvalidArgument("select_cols should be non-negative indices")); - std::vector exclude_cols; - exclude_cols.reserve(exclude_cols_tensor->NumElements()); - for (int i = 0; i < exclude_cols_tensor->NumElements(); ++i) { - exclude_cols.push_back(exclude_cols_tensor->flat()(i)); - } OP_REQUIRES(ctx, select_cols.empty() || exclude_cols.empty(), errors::InvalidArgument( "Either select_cols or exclude_cols should be empty")); @@ -158,12 +152,12 @@ if (op_version_ > 1) { ctx, exclude_cols.empty() || exclude_cols.front() >= 0, errors::InvalidArgument("exclude_cols should be non-negative indices")); - *output = - new Dataset(ctx, std::move(filenames), header, - std::move(compression_type), 
zlib_compression_options, - output_types_, output_shapes_, std::move(record_defaults), - std::move(select_cols), std::move(exclude_cols), - use_quote_delim, delim[0], std::move(na_value)); + *output = new Dataset(ctx, std::move(filenames), header, + std::move(compression_type), zlib_compression_options, + output_types_, output_shapes_, + std::move(record_defaults), std::move(select_cols), + std::move(exclude_cols), use_quote_delim, delim[0], + std::move(na_value), op_version_); } private: @@ -175,7 +169,7 @@ if (op_version_ > 1) { const std::vector& output_shapes, std::vector record_defaults, std::vector select_cols, std::vector exclude_cols, bool use_quote_delim, char delim, - string na_value) + string na_value, int op_version) : DatasetBase(DatasetContext(ctx)), filenames_(std::move(filenames)), header_(header), @@ -187,6 +181,7 @@ if (op_version_ > 1) { use_quote_delim_(use_quote_delim), delim_(delim), na_value_(std::move(na_value)), + op_version_(op_version), use_compression_(!compression_type.empty()), compression_type_(std::move(compression_type)), options_(options) {} @@ -242,16 +237,31 @@ if (op_version_ > 1) { TF_RETURN_IF_ERROR(b->AddVector(select_cols_, &select_cols)); TF_RETURN_IF_ERROR(b->AddVector(exclude_cols_, &exclude_cols)); - TF_RETURN_IF_ERROR(b->AddDataset( - this, - {std::make_pair(0, filenames), std::make_pair(1, compression_type), - std::make_pair(2, buffer_size), std::make_pair(3, header), - std::make_pair(4, delim), std::make_pair(5, use_quote_delim), - std::make_pair(6, na_value), std::make_pair(7, select_cols), - std::make_pair(9, exclude_cols)}, // Single tensor inputs - {std::make_pair(8, record_defaults)}, // Tensor list inputs - {}, - output)); + if (op_version_ > 1) { + TF_RETURN_IF_ERROR(b->AddDataset( + this, + {std::make_pair(0, filenames), std::make_pair(1, compression_type), + std::make_pair(2, buffer_size), std::make_pair(3, header), + std::make_pair(4, delim), std::make_pair(5, use_quote_delim), + std::make_pair(6, na_value), std::make_pair(7, select_cols), + std::make_pair(9, exclude_cols)}, // Single tensor inputs + {std::make_pair(8, record_defaults)}, // Tensor list inputs + {}, + output)); + } else { + TF_RETURN_IF_ERROR(b->AddDataset( + this, + { + std::make_pair(0, filenames), + std::make_pair(1, compression_type), + std::make_pair(2, buffer_size), std::make_pair(3, header), + std::make_pair(4, delim), std::make_pair(5, use_quote_delim), + std::make_pair(6, na_value), std::make_pair(7, select_cols), + }, // Single tensor inputs + {std::make_pair(8, record_defaults)}, // Tensor list inputs + {}, + output)); + } return Status::OK(); } @@ -899,6 +909,7 @@ if (op_version_ > 1) { const bool use_quote_delim_; const char delim_; const tstring na_value_; + const int op_version_; const bool use_compression_; const tstring compression_type_; const io::ZlibCompressionOptions options_; diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 14a507580ad..eed955e51b0 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -732,7 +732,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource): ) self._element_spec = tuple( tensor_spec.TensorSpec([], d.dtype) for d in self._record_defaults) - if compat.forward_compatible(2020, 6, 25): + if compat.forward_compatible(2020, 7, 3) or exclude_cols is not None: variant_tensor = gen_experimental_dataset_ops.csv_dataset_v2( filenames=self._filenames, record_defaults=self._record_defaults, 
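
The CSV dataset change above keys everything off the registered op name: the kernel treats any name other than `CSVDatasetV2` as version 1, reads the `exclude_cols` input only for version 2, serializes that input only for version 2, and the Python wrapper opts into the V2 op once the forward-compatibility window has passed or the caller actually supplies `exclude_cols`. Below is a minimal, standalone C++ sketch of that version-dispatch pattern; it uses a hypothetical stand-in type instead of the real `OpKernelConstruction`, so it illustrates the idea rather than the TensorFlow API itself.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for OpKernelConstruction: only the op name matters here.
struct KernelConstructionStub {
  std::string op_name;
};

class CsvDatasetKernelSketch {
 public:
  explicit CsvDatasetKernelSketch(const KernelConstructionStub& ctx)
      // Anything that is not the V2 op name is treated as version 1, mirroring
      // `op_version_(ctx->def().op() == "CSVDatasetV2" ? 2 : 1)` in the patch.
      : op_version_(ctx.op_name == "CSVDatasetV2" ? 2 : 1) {}

  void MakeDataset(const std::vector<std::int64_t>& exclude_cols_input) {
    std::vector<std::int64_t> exclude_cols;
    // The `exclude_cols` input only exists on the V2 op, so it is read
    // (and later serialized) only when op_version_ > 1.
    if (op_version_ > 1) {
      exclude_cols.assign(exclude_cols_input.begin(), exclude_cols_input.end());
    }
    std::cout << "op_version=" << op_version_
              << ", exclude_cols read: " << exclude_cols.size() << "\n";
  }

 private:
  const int op_version_;
};

int main() {
  CsvDatasetKernelSketch v1({"CSVDataset"});
  CsvDatasetKernelSketch v2({"CSVDatasetV2"});
  v1.MakeDataset({1, 2});  // ignored: the version-1 op has no exclude_cols input
  v2.MakeDataset({1, 2});  // read: the version-2 op carries the extra input
  return 0;
}
```

Keeping the version decision in one place (the constructor) means both `MakeDataset` and the graph-serialization path can branch on the same `op_version_` value, which is exactly how the kernel above stays compatible with graphs that were built against the original `CSVDataset` op.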
From bb5e2e63d9bfc197f632c0e28446d47ffdba7661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Mon, 18 May 2020 16:15:38 +0200 Subject: [PATCH 0140/1390] TFLu: Add support for unknown output dimensions Some models (e.g. Convnet) with broadcasting have unspecified output dimensions for some operators, where the dimensions can be deduced from the input dimensions. Change-Id: Ica7d3715c4f4cfa0ce05580afe5f25bb2cfce981 --- tensorflow/lite/micro/kernels/add.cc | 15 ++++++++- tensorflow/lite/micro/kernels/cmsis-nn/add.cc | 15 ++++++++- tensorflow/lite/micro/kernels/cmsis-nn/mul.cc | 15 ++++++++- tensorflow/lite/micro/kernels/mul.cc | 15 ++++++++- tensorflow/lite/micro/memory_helpers.cc | 31 +++++++++++++++++++ tensorflow/lite/micro/memory_helpers.h | 5 +++ 6 files changed, 92 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/add.cc b/tensorflow/lite/micro/kernels/add.cc index 42609301232..feb5f2c0b29 100644 --- a/tensorflow/lite/micro/kernels/add.cc +++ b/tensorflow/lite/micro/kernels/add.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace ops { @@ -94,6 +95,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params, return kTfLiteOk; } +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (output->dims->size == 0) { + return AllocateOutputDimensionsFromInput(context, input1, input2, output); + } + + return kTfLiteOk; +} + void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params, const OpData* data, const TfLiteTensor* input1, const TfLiteTensor* input2, TfLiteTensor* output) { @@ -190,7 +203,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteRegistration* Register_ADD() { static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, - /*prepare=*/nullptr, + /*prepare=*/add::Prepare, /*invoke=*/add::Eval, /*profiling_string=*/nullptr, /*builtin_code=*/0, diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/add.cc b/tensorflow/lite/micro/kernels/cmsis-nn/add.cc index 6dbe4a618ab..d1f3eae233a 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/add.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/add.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace ops { @@ -94,6 +95,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params, return kTfLiteOk; } +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (output->dims->size == 0) { + return AllocateOutputDimensionsFromInput(context, input1, input2, output); + } + + return kTfLiteOk; +} + void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params, const OpData* data, const TfLiteTensor* input1, const TfLiteTensor* input2, TfLiteTensor* output) { @@ -199,7 +212,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteRegistration* Register_ADD() { static TfLiteRegistration r = {nullptr /* Init */, nullptr /* Free */, - nullptr /* Prepare */, add::Eval}; + add::Prepare, add::Eval}; return &r; } diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc b/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc index d746166ebd9..6d0592d6c09 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace ops { @@ -64,6 +65,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor); + const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (output->dims->size == 0) { + return AllocateOutputDimensionsFromInput(context, input1, input2, output); + } + + return kTfLiteOk; +} + void EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteMulParams* params, OpData* data, const TfLiteTensor* input1, const TfLiteTensor* input2, @@ -167,7 +180,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteRegistration* Register_MUL() { static TfLiteRegistration r = {nullptr /* Init */, nullptr /* Free */, - nullptr /* Prepare */, mul::Eval}; + mul::Prepare, mul::Eval}; return &r; } diff --git a/tensorflow/lite/micro/kernels/mul.cc b/tensorflow/lite/micro/kernels/mul.cc index fb47728a1a4..b9c989e1f87 100644 --- a/tensorflow/lite/micro/kernels/mul.cc +++ b/tensorflow/lite/micro/kernels/mul.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace ops { @@ -65,6 +66,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor); + const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (output->dims->size == 0) { + return AllocateOutputDimensionsFromInput(context, input1, input2, output); + } + + return kTfLiteOk; +} + void EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteMulParams* params, OpData* data, const TfLiteTensor* input1, const TfLiteTensor* input2, @@ -161,7 +174,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteRegistration* Register_MUL() { static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, - /*prepare=*/nullptr, + /*prepare=*/mul::Prepare, /*invoke=*/mul::Eval, /*profiling_string=*/nullptr, /*builtin_code=*/0, diff --git a/tensorflow/lite/micro/memory_helpers.cc b/tensorflow/lite/micro/memory_helpers.cc index 37c78162b62..a0c0d85429a 100644 --- a/tensorflow/lite/micro/memory_helpers.cc +++ b/tensorflow/lite/micro/memory_helpers.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" namespace tflite { @@ -101,4 +102,34 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor, return kTfLiteOk; } +TfLiteStatus AllocateOutputDimensionsFromInput(TfLiteContext* context, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { + int size = 1, i = 0; + const TfLiteTensor* input = nullptr; + + TF_LITE_ENSURE(context, input1->dims != nullptr); + TF_LITE_ENSURE(context, input2->dims != nullptr); + TF_LITE_ENSURE(context, output->dims->size == 0); + + input = input1->dims->size > input2->dims->size ? input1 : input2; + TF_LITE_ENSURE(context, output->type == input->type); + + const int dimensions_count = tflite::GetTensorShape(input).DimensionsCount(); + for (i = 0; i < dimensions_count; i++) { + size *= input->dims->data[i]; + } + output->bytes += size; + + TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer( + context, TfLiteIntArrayGetSizeInBytes(size), + reinterpret_cast(&output->dims))); + + output->dims->size = input->dims->size; + for (i = 0; i < dimensions_count; i++) { + output->dims->data[i] = input->dims->data[i]; + } + + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/lite/micro/memory_helpers.h b/tensorflow/lite/micro/memory_helpers.h index f52da062271..1b6fa48dd99 100644 --- a/tensorflow/lite/micro/memory_helpers.h +++ b/tensorflow/lite/micro/memory_helpers.h @@ -42,6 +42,11 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor, size_t* bytes, size_t* type_size, ErrorReporter* error_reporter); +// Deduce output dimensions from input and allocate given size. +// Useful for operators with two inputs where the largest input should equal the output dimension. 
+TfLiteStatus AllocateOutputDimensionsFromInput(TfLiteContext* context, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output); + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_ From 2f9642602d9d3da721d603e583b2569c765704a1 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Fri, 12 Jun 2020 10:49:57 +0200 Subject: [PATCH 0141/1390] Adapt to changes in micro_allocator. --- tensorflow/lite/micro/micro_allocator.cc | 20 ++++--- tensorflow/lite/micro/micro_allocator.h | 3 +- tensorflow/lite/micro/micro_allocator_test.cc | 52 ++++++++++++++----- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 7cd40e54435..16b3b986a52 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -375,10 +375,9 @@ TfLiteStatus CreatePlan(ErrorReporter* error_reporter, planner->AddBuffer(error_reporter, aligned_bytes_required, current->first_created, current->last_used)); } else { - TF_LITE_ENSURE_STATUS( - planner->AddBuffer(error_reporter, aligned_bytes_required, - current->first_created, current->last_used, - current->offline_offset)); + TF_LITE_ENSURE_STATUS(planner->AddBuffer( + error_reporter, aligned_bytes_required, current->first_created, + current->last_used, current->offline_offset)); } } } @@ -647,7 +646,7 @@ TfLiteStatus MicroAllocator::FinishModelAllocation(const Model* model, const SubGraph* subgraph = GetSubGraphFromModel(model); TFLITE_DCHECK(subgraph != nullptr); - TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(subgraph, context)); + TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(model, subgraph, context)); TF_LITE_ENSURE_STATUS(AllocateVariables(subgraph->tensors(), context->tensors, memory_allocator_)); @@ -874,7 +873,8 @@ const SubGraph* MicroAllocator::GetSubGraphFromModel(const Model* model) { return (*subgraphs)[0]; } -TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(const SubGraph* subgraph, +TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(const Model* model, + const SubGraph* subgraph, TfLiteContext* context) { // Create static memory plan // 1. Calculate AllocationInfo to know the lifetime of each tensor/buffer. @@ -891,7 +891,13 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(const SubGraph* subgraph, AllocationInfoBuilder builder(error_reporter_, &tmp_allocator); TF_LITE_ENSURE_STATUS( builder.Init(subgraph->tensors()->size(), scratch_buffer_count_)); - TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph, context->tensors)); + + int32_t* offline_planner_offsets = nullptr; + TF_LITE_ENSURE_STATUS( + builder.GetOfflinePlannedOffsets(model, &offline_planner_offsets)); + TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph, offline_planner_offsets, + context->tensors)); + TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_handles_)); const AllocationInfo* allocation_info = builder.Finish(); diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 7fc091196a5..9cfc1793fc7 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -185,7 +185,8 @@ class MicroAllocator { // Commits a memory plan for all non-persistent buffer allocations in the // 'head' section of the memory arena. 
- virtual TfLiteStatus CommitStaticMemoryPlan(const SubGraph* subgraph, + virtual TfLiteStatus CommitStaticMemoryPlan(const Model* model, + const SubGraph* subgraph, TfLiteContext* context); // A simple memory allocator that always allocate from the arena tail or head. diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 04f4732b9d3..f3f3f32611e 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -312,6 +312,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { int version = 1; int subgraph = 0; constexpr int nbr_tensors = 4; + tflite::testing::MockOpResolver mock_resolver; + tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = {version, subgraph, nbr_tensors, // header @@ -340,9 +342,14 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( - &context, model, arena, arena_size, micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); + tflite::MicroAllocator* allocator = + tflite::MicroAllocator::Create(arena, arena_size, micro_test::reporter); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver, + &node_and_registration)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + allocator->FinishModelAllocation(model, &context)); // Since all of the tensors are online planned and the model structure is // identical to that in TestAllocationForModelsWithBranches, @@ -357,6 +364,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { TF_LITE_MICRO_TEST(OfflinePlannerBasic) { constexpr int nbr_tensors = 4; + tflite::testing::MockOpResolver mock_resolver; + tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = {1, 0, nbr_tensors, 0, // t0 @@ -389,9 +398,14 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( - &context, model, arena, arena_size, micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); + tflite::MicroAllocator* allocator = + tflite::MicroAllocator::Create(arena, arena_size, micro_test::reporter); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver, + &node_and_registration)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + allocator->FinishModelAllocation(model, &context)); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); @@ -402,6 +416,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { constexpr int nbr_tensors = 4; + tflite::testing::MockOpResolver mock_resolver; + tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = { 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors @@ -434,9 +450,14 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( 
- &context, model, arena, arena_size, micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); + tflite::MicroAllocator* allocator = + tflite::MicroAllocator::Create(arena, arena_size, micro_test::reporter); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver, + &node_and_registration)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + allocator->FinishModelAllocation(model, &context)); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); @@ -448,6 +469,8 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { constexpr int nbr_tensors = 5; + tflite::testing::MockOpResolver mock_resolver; + tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + nbr_tensors] = { 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors @@ -482,9 +505,14 @@ TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { TfLiteContext context; constexpr size_t arena_size = 4096; uint8_t arena[arena_size]; - tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create( - &context, model, arena, arena_size, micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator->FinishTensorAllocation()); + tflite::MicroAllocator* allocator = + tflite::MicroAllocator::Create(arena, arena_size, micro_test::reporter); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver, + &node_and_registration)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + allocator->FinishModelAllocation(model, &context)); uint8_t* start = context.tensors[0].data.uint8; TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start); From 3626d6cf24924957b001ac92db934967139cc3c5 Mon Sep 17 00:00:00 2001 From: Michael137 Date: Sun, 14 Jun 2020 11:22:21 -0400 Subject: [PATCH 0142/1390] Document usage of the profiling_output_csv_file option --- tensorflow/lite/tools/benchmark/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index a43383cff9d..d76e8d4d031 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -48,6 +48,12 @@ and the following optional parameters: 'enable_op_profiling'. Note, the platform-wide tracing might not work if the tool runs as a commandline native binary. For example, on Android, the ATrace-based tracing only works when the tool is launched as an APK. +* `profiling_output_csv_file`: `str` (default="") \ + File path to export profile data to as CSV. + The results are printed to `stdout` if option is not set. + Requires `enable_op_profiling` to be `true` and the path + to include the name of the output CSV; otherwise + results are printed to `stdout`. 
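
For example, a typical invocation (the binary name and output path here are illustrative, not taken from this document) would pass `--enable_op_profiling=true --profiling_output_csv_file=/data/local/tmp/profile.csv`, in which case the per-op profile is written to that CSV file instead of being printed to `stdout`.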
### TFLite delegate parameters The tool supports all runtime/delegate parameters introduced by From ec0217f4ac194741cf6c566f01747a5eb771edc6 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 11:29:41 -0400 Subject: [PATCH 0143/1390] Update quantization_config.cc --- .../compiler/mlir/lite/quantization/quantization_config.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index c4cf6e71cf3..634d212409e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -60,7 +60,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (size_t i = 0; i < node_maxs_str.size(); i++) { + for (int i : llvm::seq(node_maxs_str.size())) { double value; if (!absl::SimpleAtod(node_maxs_str[i], &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; From fc205f91b67f38b02e026245fa39f3f61981e605 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 15 Jun 2020 00:07:37 +0800 Subject: [PATCH 0144/1390] Fix wrong comments Reference [Probot: Stale](https://github.com/probot/stale) --- .github/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/stale.yml b/.github/stale.yml index e1184ce37b4..5f8dd12f477 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -23,7 +23,7 @@ daysUntilStale: 7 # Number of days of inactivity before a stale Issue or Pull Request is closed daysUntilClose: 7 -# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable +# Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled) onlyLabels: - stat:awaiting response # Comment to post when marking as stale. 
Set to `false` to disable From 02090cac6a7f1958920155522d912de0b2769301 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 12:20:43 -0400 Subject: [PATCH 0145/1390] Update import_quant_stats_pass.cc --- .../compiler/mlir/lite/quantization/import_quant_stats_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 5419a0d5e1b..e00a088c38c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -158,7 +158,7 @@ void ImportQuantStatsPass::ImportAsStatsOps(OpBuilder b, Operation *op, InsertStatsOpAtResult(b, op->getResult(index), layer_stats, axis_stats, axis); } else { - for (int i = 0; i < static_cast(op->getNumResults()); ++i) { + for (int i = 0, e = op->getNumResults(); i < e; ++i) { if (IsQuantizableResult(op, i)) { InsertStatsOpAtResult(b, op->getResult(i), layer_stats, axis_stats, axis); From 0052918f2b9332e7eabe4b2ababbbdb464889cec Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 12:23:40 -0400 Subject: [PATCH 0146/1390] Update quantization_config.cc --- .../compiler/mlir/lite/quantization/quantization_config.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 634d212409e..b299fa8f4c2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,7 +48,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (size_t i = 0; i < node_mins_str.size(); i++) { + for (int i : llvm::seq(node_mins_str.size())) { double value; if (!absl::SimpleAtod(node_mins_str[i], &value)) { return true; From 09f45f4f5edba341b21c89266431c7dd9e950af8 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 12:23:47 -0400 Subject: [PATCH 0147/1390] Update quantization_config.cc From 79eead7a46147cced45480d7da02e798e6e4ba56 Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Sun, 14 Jun 2020 23:41:06 +0700 Subject: [PATCH 0148/1390] Add GCS Path Parser --- .../experimental/filesystem/plugins/gcs/BUILD | 1 + .../filesystem/plugins/gcs/gcs_filesystem.cc | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index c7a2d68ac3e..c9fee433589 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -27,5 +27,6 @@ cc_library( "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 7918e7a7310..11641fac53b 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" #include "google/cloud/storage/client.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/tf_status.h" @@ -35,6 +36,45 @@ static inline void TF_SetStatusFromGCSStatus( static void* plugin_memory_allocate(size_t size) { return calloc(1, size); } static void plugin_memory_free(void* ptr) { free(ptr); } +static void ParseGCSPath(absl::string_view fname, bool object_empty_ok, char** bucket, + char** object, TF_Status* status) { + size_t scheme_end = fname.find("://") + 2; + if (fname.substr(0, scheme_end + 1) != "gs://") { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "GCS path doesn't start with 'gs://'."); + return; + } + + size_t bucket_end = fname.find("/", scheme_end + 1); + if (bucket_end == absl::string_view::npos) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "GCS path doesn't contain a bucket name."); + return; + } + absl::string_view bucket_view = + fname.substr(scheme_end + 1, bucket_end - scheme_end - 1); + *bucket = + static_cast(plugin_memory_allocate(bucket_view.length() + 1)); + memcpy(*bucket, bucket_view.data(), bucket_view.length()); + (*bucket)[bucket_view.length()] = '\0'; + + absl::string_view object_view = fname.substr(bucket_end + 1); + if (object_view == "") { + if (object_empty_ok) { + *object = nullptr; + return; + } else { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "GCS path doesn't contain an object name."); + return; + } + } + *object = + static_cast(plugin_memory_allocate(object_view.length() + 1)); + // object_view.data() is a null-terminated string_view because fname is. + strcpy(*object, object_view.data()); +} + // SECTION 1. Implementation for `TF_RandomAccessFile` // ---------------------------------------------------------------------------- namespace tf_random_access_file { From 2f515039033a27b6253ae788e79c7ead32265007 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 14:36:49 -0400 Subject: [PATCH 0149/1390] Update quantization_driver.cc --- .../compiler/mlir/lite/quantization/quantization_driver.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index fc11604ef8a..a9f4eb78431 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -294,7 +294,7 @@ class QuantizationDriver { return; if (current_op == op) llvm::errs() << "===>>>"; llvm::errs() << op->getName() << " : ("; - for (size_t i = 0; i < op->getNumOperands(); ++i) { + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { if (auto params = GetOperandQuantState(op, i).params) params.print(llvm::errs()); else @@ -303,7 +303,7 @@ class QuantizationDriver { llvm::errs() << ","; } llvm::errs() << ") -> ("; - for (size_t i = 0; i < op->getNumResults(); ++i) { + for (int i = 0, e = op->getNumResults(); i < e; ++i) { if (auto params = GetResultQuantState(op, i).params) params.print(llvm::errs()); else From 5ac4a4a3ea522b2d0b6b7fa0058e7bc1bb5ba6d3 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 14:47:27 -0400 Subject: [PATCH 0150/1390] Update quantization_utils.cc --- .../compiler/mlir/lite/quantization/quantization_utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc 
b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index b9ca5329519..57b24eb8772 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -368,7 +368,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( scales[index_scale.index()] *= index_scale.value(); } } else if (auto type = op_type.dyn_cast()) { - for (size_t index = 0; index != axis_size; ++index) { + for (int index = 0, e = axis_size; index != e; ++index) { scales[index] *= type.getScale(); } } From eabb1453f2efc62a648096c21b2ed993079d8c51 Mon Sep 17 00:00:00 2001 From: nammbash Date: Sun, 14 Jun 2020 11:47:44 -0700 Subject: [PATCH 0151/1390] Fix and Refactor NonAVX512 CPU platform --- tensorflow/core/graph/mkl_graph_util.h | 49 ++++++++++++++++++-------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index cd09ac522d7..3c4c186b791 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -126,22 +126,19 @@ inline string GetMklEagerOpName(const string& name) { } #ifdef ENABLE_INTEL_MKL_BFLOAT16 -static inline bool CheckBfloat16Support(DataType T) { - static absl::once_flag cpu_bfloat16_warn_once_flag; - // Restrict bfloat16 ops to platforms with at least AVX512 support, fall back - // to Eigen implementation otherwise. - if (!(port::TestCPUFeature(port::CPUFeature::AVX512F)) && T == DT_BFLOAT16) { - absl::call_once(cpu_bfloat16_warn_once_flag, [] { - LOG(ERROR) - << "oneDNN BFloat16 support are only on platforms with AVX512. " - "Falling back to default implementation if present."; - }); - return false; - } - return true; +static inline bool IsBF16SupportedByOneDNNOnThisCPU() { + return port::TestCPUFeature(port::CPUFeature::AVX512F); } #endif +static inline void BF16UnsupportedWarning() { + static absl::once_flag cpu_bfloat16_warn_once_flag; + absl::call_once(cpu_bfloat16_warn_once_flag, [] { + LOG(ERROR) << "oneDNN BFloat16 support are only on platforms with AVX512. " + "Falling back to default implementation if present."; + }); +} + // Check whether opname with type T is registered as MKL operator // that can accept input tensors in MKL layout. // @@ -159,7 +156,18 @@ static inline bool IsMklLayoutDependentOp(const string& op_name, DataType T) { #ifdef ENABLE_INTEL_MKL_BFLOAT16 // Restrict regular ops to FLOAT and BFLOAT16 if (kernel.find(kMklLayoutDependentOpLabelPattern) != string::npos) { - return (T == DT_FLOAT || CheckBfloat16Support(T)); + if (T == DT_FLOAT) return true; + if (T == DT_BFLOAT16) { + if (IsBF16SupportedByOneDNNOnThisCPU()) { + return true; + } else { + // Restrict bfloat16 ops to platforms with at least AVX512 support, fall + // back to Eigen implementation otherwise. + BF16UnsupportedWarning(); + return false; + } + } + return false; } #else // Restrict regular ops to FLOAT @@ -216,7 +224,18 @@ static inline bool IsMklNameChangeOp(const string& op_name, DataType T) { isTypeAllowed = (T == DT_COMPLEX128 || T == DT_COMPLEX64 || T == DT_DOUBLE || T == DT_FLOAT); #ifdef ENABLE_INTEL_MKL_BFLOAT16 - isTypeAllowed = (isTypeAllowed || CheckBfloat16Support(T)); + if (!isTypeAllowed) { + if (T == DT_BFLOAT16) { + if (IsBF16SupportedByOneDNNOnThisCPU()) { + isTypeAllowed = true; + } else { + // Restrict bfloat16 ops to platforms with at least AVX512 support, + // fall back to Eigen implementation otherwise. 
+ BF16UnsupportedWarning(); + isTypeAllowed = false; + } + } + } #endif return isTypeAllowed; } From 5394d892605fd158ad6a9c366e88e95b675a0227 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 14:48:54 -0400 Subject: [PATCH 0152/1390] Update dump_mlir_util.cc --- tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index b5a6c922707..febf2bc096d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -41,7 +41,7 @@ std::string MakeUniqueFilename(string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. - for (size_t i = 0; i < name.size(); ++i) { + for (int i = 0, e = name.size(); i < e; ++i) { char ch = name[i]; if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' || ch == '\\') { From 1bccc9e1959fca38ce94a3cb1cfe7be1b6d4050c Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 14:52:29 -0400 Subject: [PATCH 0153/1390] Update chlo_ops.cc --- tensorflow/compiler/mlir/xla/ir/chlo_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index f5b895f0c76..3408f3ed0cc 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -49,7 +49,7 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, if (shape_x.size() == shape_y.size()) { llvm::SmallVector out_shape(shape_x.size()); - for (size_t i = 0; i < shape_x.size(); i++) { + for (int i = 0, e = shape_x.size(); i < e; i++) { auto x_val = shape_x[i]; auto y_val = shape_y[i]; if (x_val == -1 || y_val == -1) { From 340053608bd1ac4168dc1be35e019cc1ac9d595a Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 14:58:56 -0400 Subject: [PATCH 0154/1390] Update hlo_ops.cc --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 569e45912a2..7f313b56925 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -143,7 +143,7 @@ DenseIntElementsAttr BuildConvPaddingAttrs( int rank = padding_low.size(); SmallVector padding; - for (unsigned i = 0; i < static_cast(rank); ++i) { + for (unsigned i = 0, e = rank; i < e; ++i) { padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]); padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]); } @@ -853,7 +853,7 @@ static Attribute foldConcatenateHelper(ConcatenateOp* op, auto shape = type.getShape(); size_t top_size = 1; - for (size_t i = 0; i < axis; i++) { + for (int i = 0, e = axis; i < e; i++) { top_size = top_size * shape[i]; } @@ -1118,7 +1118,7 @@ static LogicalResult Verify(MapOp op) { // increasing. 
auto values = op.dimensions().getValues(); auto dimensions = std::vector{values.begin(), values.end()}; - for (int i = 0; static_cast(i) < dimensions.size(); ++i) { + for (int i = 0, e = dimensions.size(); i < e; ++i) { if (dimensions[i] != i) return op.emitOpError() << "requires monotonically increasing dimension " "numbers, but got: " From 9f535c3290cc3a8bdc503a4c48d3b0640b9f4798 Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Sun, 14 Jun 2020 15:22:51 -0400 Subject: [PATCH 0155/1390] Update quantization_config.cc --- .../mlir/lite/quantization/quantization_config.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index b299fa8f4c2..cdff93502f2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,9 +48,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (int i : llvm::seq(node_mins_str.size())) { + for (const std::string&node_min : node_mins_str.size()) { double value; - if (!absl::SimpleAtod(node_mins_str[i], &value)) { + if (!absl::SimpleAtod(node_min, &value)) { return true; } node_mins.push_back(value); @@ -60,9 +60,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (int i : llvm::seq(node_maxs_str.size())) { + for (const std::string&node_max : node_maxs_str.size()) { double value; - if (!absl::SimpleAtod(node_maxs_str[i], &value)) { + if (!absl::SimpleAtod(node_max, &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; return true; } From 6bb481c58b27b3fc99c0d8fc71b9d58f13e8b0ba Mon Sep 17 00:00:00 2001 From: Amedeo Cavallo Date: Mon, 15 Jun 2020 11:56:50 +0200 Subject: [PATCH 0156/1390] C linkage for stm32l4HAL target C linkage for output retargeting on stm32l4HAL target --- tensorflow/lite/micro/stm32f4HAL/debug_log.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/stm32f4HAL/debug_log.cc b/tensorflow/lite/micro/stm32f4HAL/debug_log.cc index 4be3b40e782..90dd7cfd787 100644 --- a/tensorflow/lite/micro/stm32f4HAL/debug_log.cc +++ b/tensorflow/lite/micro/stm32f4HAL/debug_log.cc @@ -22,6 +22,10 @@ limitations under the License. extern UART_HandleTypeDef DEBUG_UART_HANDLE; +#ifdef __cplusplus + extern "C" { +#endif + #ifdef __GNUC__ int __io_putchar(int ch) { HAL_UART_Transmit(&DEBUG_UART_HANDLE, (uint8_t *)&ch, 1, HAL_MAX_DELAY); @@ -36,4 +40,8 @@ int fputc(int ch, FILE *f) { } #endif /* __GNUC__ */ -extern "C" void DebugLog(const char *s) { fprintf(stderr, "%s", s); } +void DebugLog(const char *s) { fprintf(stderr, "%s", s); } + +#ifdef __cplusplus +} +#endif From f926d8c10efb07176ae559d0e098cdfdb4d03219 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 15 Jun 2020 03:38:44 -0700 Subject: [PATCH 0157/1390] Introduces a new experimental package that: - Defines a schema for configuring delegates - Defines a C++ plugin mechanism using the schema, so that code can support configuring arbitrary delegates without a build-time dependency PiperOrigin-RevId: 316433209 Change-Id: Id538d0d3885bba9dd4094892915e9b6b736efd7d --- .../acceleration/configuration/BUILD | 160 ++++++++++++++ .../configuration/configuration.proto | 208 ++++++++++++++++++ .../configuration/delegate_registry.cc | 60 +++++ .../configuration/delegate_registry.h | 95 ++++++++ .../acceleration/configuration/gpu_plugin.cc | 62 ++++++ .../configuration/hexagon_plugin.cc | 73 ++++++ .../configuration/nnapi_plugin.cc | 93 ++++++++ .../configuration/nnapi_plugin_test.cc | 175 +++++++++++++++ .../configuration/proto_to_flatbuffer.cc | 58 +++++ .../configuration/proto_to_flatbuffer.h | 32 +++ 10 files changed, 1016 insertions(+) create mode 100644 tensorflow/lite/experimental/acceleration/configuration/BUILD create mode 100644 tensorflow/lite/experimental/acceleration/configuration/configuration.proto create mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h create mode 100644 tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h diff --git a/tensorflow/lite/experimental/acceleration/configuration/BUILD b/tensorflow/lite/experimental/acceleration/configuration/BUILD new file mode 100644 index 00000000000..38d28d5cc2e --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/BUILD @@ -0,0 +1,160 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library", "flatbuffer_java_library", "flatc_path") + +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +genrule( + name = "configuration_schema", + srcs = ["configuration.proto"], + outs = ["configuration.fbs"], + # We rename the namespace since otherwise the proto classes and flatbuffer + # classes would have the same names. 
+ cmd = """ + $(location {}) --proto -o $(@D) $(location :configuration.proto) + perl -p -i -e 's/tflite.proto/tflite/' $(@D)/configuration.fbs + """.format(flatc_path), + tools = [ + flatc_path, + ], +) + +genrule( + name = "configuration_fbs_contents_cc", + srcs = ["configuration.fbs"], + outs = ["configuration_fbs_contents-inl.h"], + cmd = """ + echo 'constexpr char configuration_fbs_contents[] = R"Delimiter(' > $(@) + cat < $(<) >> $(@) + echo ')Delimiter";' >> $(@) + """, +) + +proto_library( + name = "configuration_proto", + srcs = [ + "configuration.proto", + ], +) + +cc_proto_library( + name = "configuration_cc_proto", + deps = [":configuration_proto"], +) + +java_lite_proto_library( + name = "configuration_java_proto_lite", + deps = [":configuration_proto"], +) + +flatbuffer_cc_library( + name = "configuration_fbs", + srcs = [":configuration.fbs"], +) + +flatbuffer_java_library( + name = "configuration_fbs_java", + srcs = [":configuration.fbs"], +) + +cc_library( + name = "proto_to_flatbuffer", + srcs = [ + "configuration_fbs_contents-inl.h", + "proto_to_flatbuffer.cc", + ], + hdrs = ["proto_to_flatbuffer.h"], + deps = [ + ":configuration_cc_proto", + ":configuration_fbs", + "//tensorflow/core/platform:protobuf", + "//tensorflow/lite:minimal_logging", + "@flatbuffers", + ], +) + +cc_library( + name = "delegate_registry", + srcs = ["delegate_registry.cc"], + hdrs = ["delegate_registry.h"], + deps = [ + ":configuration_fbs", + "//tensorflow/lite/c:common", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "nnapi_plugin", + srcs = ["nnapi_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, # For registration to always run. +) + +cc_test( + name = "nnapi_plugin_test", + srcs = ["nnapi_plugin_test.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + ":nnapi_plugin", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate_mock_test", + "//tensorflow/lite/kernels:test_util", + "@com_google_googletest//:gtest_main", + "@flatbuffers", + ], +) + +cc_library( + name = "hexagon_plugin", + srcs = ["hexagon_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "@com_google_absl//absl/memory", + ] + select({ + "//tensorflow:android": [ + "//tensorflow/lite/delegates/hexagon:hexagon_delegate", + ], + "//conditions:default": [], + }), + alwayslink = 1, # For registration to always run. +) + +cc_library( + name = "gpu_plugin", + srcs = ["gpu_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "//tensorflow/lite/delegates/gpu:delegate", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, # For registration to always run. +) diff --git a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto new file mode 100644 index 00000000000..e1c49f02856 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto @@ -0,0 +1,208 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This schema defines how to configure TFLite for delegation. These +// definitions can be used in multiple ways: as output of a compatibility list, +// in benchmarking tools and to decouple delegate instantiation from code. +// +// The schema is work-in-progress, covering the most broadly used delegates and +// options. + +syntax = "proto2"; + +package tflite.proto; + +// ExecutionPreference is used to match accelerators against the preferences of +// the current application or usecase. Some of the values here can appear both +// in the compatibility list and as input, some only as input. +// +// These are separate from NNAPIExecutionPreference - the compatibility list +// design doesn't assume a one-to-one mapping between which usecases +// compatibility list entries have been developed for and what settings are used +// for NNAPI. +enum ExecutionPreference { + // Match any selected preference. Whitelist (semantically - value is same as + // on input). + ANY = 0; + // Match low latency preference. Both compatibility list and input. + LOW_LATENCY = 1; + // Math low power preference. Both compatibility list and input. + LOW_POWER = 2; + // Never accelerate. Can be used for input to compatibility list or for + // standalone Acceleration configuration. + FORCE_CPU = 3; +} + +// TFLite delegate to use. +enum Delegate { + NONE = 0; + NNAPI = 1; + GPU = 2; + HEXAGON = 3; + XNNPACK = 4; + // TODO(b/157893534): Support exposing edgetpu tflite delegate creation + // options. + EDGETPU = 5; +} + +enum NNAPIExecutionPreference { + // Undefined. + UNDEFINED = 0; + // Prefer executing in a way that minimizes battery drain. + NNAPI_LOW_POWER = 1; + // Prefer returning a single answer as fast as possible, even if this causes + // more power consumption. + NNAPI_FAST_SINGLE_ANSWER = 2; + // Prefer maximizing the throughput of successive frames, for example when + // processing successive frames coming from the camera. + NNAPI_SUSTAINED_SPEED = 3; +} + +// One possible acceleration configuration. +message ComputeSettings { + // Which preference to use this accelerator for. + optional ExecutionPreference preference = 1; + // How to configure TFLite + optional TFLiteSettings tflite_settings = 2; + // Identifiers to use for instrumentation and telemetry. + optional string model_namespace_for_statistics = 3; + optional string model_identifier_for_statistics = 4; +} + +// NNAPI delegate settings. +message NNAPISettings { + // Which instance (NNAPI accelerator) to use. One driver may provide several + // accelerators (though a driver may also hide several back-ends behind one + // name, at the choice of the driver vendor). + // Note that driver introspection is only available in Android Q and later. + optional string accelerator_name = 1; + + // NNAPI model compilation caching settings to be passed to + // tflite::StatefulNnApiDelegate + optional string cache_directory = 2; + optional string model_token = 3; + + // NNAPI execution preference to pass. 
See + // https://developer.android.com/ndk/reference/group/neural-networks.html + optional NNAPIExecutionPreference execution_preference = 4; + + // Number of instances to cache for the same model (for input size + // changes). This is mandatory for getting reasonable performance in that + // case. + optional int32 no_of_nnapi_instances_to_cache = 5; + + // Whether to automatically fall back to TFLite CPU path. + optional FallbackSettings fallback_settings = 6; + + // Whether to allow use of NNAPI CPU (nnapi-reference accelerator) on Android + // 10+ when an accelerator name is not specified. The NNAPI CPU typically + // performs less well than the TfLite built-in kernels; but allowing allows a + // model to be partially accelerated which may be a win. + optional bool allow_nnapi_cpu_on_android_10_plus = 7; +} + +// Which GPU backend to select. Default behaviour on Android is to try OpenCL +// and if it's not available fall back to OpenGL. +enum GPUBackend { + UNSET = 0; + OPENCL = 1; + OPENGL = 2; + // Not yet supported. + // VULKAN = 3; + // METAL = 4; +} + +// GPU Delegate settings. +// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/gpu/delegate.h +message GPUSettings { + optional bool is_precision_loss_allowed = 1; + optional bool enable_quantized_inference = 2 [default = true]; + optional GPUBackend force_backend = 3; + // TODO(b/152019007): add remaining options. +} + +// Hexagon Delegate settings. +// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/hexagon/hexagon_delegate.h +message HexagonSettings { + optional int32 debug_level = 1; + optional int32 powersave_level = 2; + optional bool print_graph_profile = 3; + optional bool print_graph_debug = 4; +} + +// XNNPack Delegate settings. +// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h +message XNNPackSettings { + optional int32 num_threads = 1; +} + +message CPUSettings { + optional int32 num_threads = 1; +} + +// How to configure TFLite. +message TFLiteSettings { + // Which delegate to use. + optional Delegate delegate = 1; + + // How to configure the chosen delegate. + // (In principle we would like to use 'oneof', but flatc turns that into an + // nested anonymous table rather than a union. See + // https://github.com/google/flatbuffers/issues/4628). + optional NNAPISettings nnapi_settings = 2; + optional GPUSettings gpu_settings = 3; + optional HexagonSettings hexagon_settings = 4; + optional XNNPackSettings xnnpack_settings = 5; + + // How to configure CPU execution. + optional CPUSettings cpu_settings = 6; + + // Shared delegation settings. + optional int32 max_delegated_partitions = 7; +} + +// Whether to automatically fallback to TFLite CPU path on delegation errors. +// +// Typically fallback is enabled in production use but disabled in tests and +// benchmarks to ensure they test the intended path. +message FallbackSettings { + // Whether to allow automatically falling back to TfLite CPU path on + // compilation failure. Default is not allowing automatic fallback. + // + // This is useful in naive production usecases where the caller would prefer + // for the model to run even if it's not accelerated. More advanced users will + // implement fallback themselves; e.g., by using a different model on CPU. + // + // Note that compilation errors may occur either at initial + // ModifyGraphWithDelegate() time, or when calling AllocateTensors() after + // resizing. 
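+  //
+  // Purely illustrative (hypothetical textproto, not part of this schema
+  // file): a ComputeSettings message requesting NNAPI with automatic
+  // fallback on compilation errors could look like
+  //   tflite_settings {
+  //     delegate: NNAPI
+  //     nnapi_settings {
+  //       fallback_settings { allow_automatic_fallback_on_compilation_error: true }
+  //     }
+  //   }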
+ optional bool allow_automatic_fallback_on_compilation_error = 7; + // Whether to allow automatically falling back to TfLite CPU path on + // execution error. Default is not allowing automatic fallback. + // + // Experimental, use with care (only when you have complete control over the + // client code). + // + // The caveat above for compilation error holds. Additionally, execution-time + // errors are harder to handle automatically as they require invalidating the + // TfLite interpreter which most client code has not been designed to deal + // with. + optional bool allow_automatic_fallback_on_execution_error = 8; +} diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc new file mode 100644 index 00000000000..b8d80342d5f --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +#include "absl/synchronization/mutex.h" + +namespace tflite { +namespace delegates { + +void DelegatePluginRegistry::RegisterImpl( + const std::string& name, + std::function< + std::unique_ptr(const TFLiteSettings&)> + creator_function) { + absl::MutexLock lock(&mutex_); + factories_[name] = creator_function; +} + +std::unique_ptr DelegatePluginRegistry::CreateImpl( + const std::string& name, const TFLiteSettings& settings) { + absl::MutexLock lock(&mutex_); + auto it = factories_.find(name); + if (it != factories_.end()) { + return it->second(settings); + } else { + return nullptr; + } +} + +DelegatePluginRegistry* DelegatePluginRegistry::GetSingleton() { + static auto* instance = new DelegatePluginRegistry(); + return instance; +} + +std::unique_ptr DelegatePluginRegistry::CreateByName( + const std::string& name, const TFLiteSettings& settings) { + auto* const instance = DelegatePluginRegistry::GetSingleton(); + return instance->CreateImpl(name, settings); +} + +DelegatePluginRegistry::Register::Register(const std::string& name, + CreatorFunction creator_function) { + auto* const instance = DelegatePluginRegistry::GetSingleton(); + instance->RegisterImpl(name, creator_function); +} + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h new file mode 100644 index 00000000000..c86759dcc3f --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ + +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" + +// Defines an interface for TFLite delegate plugins. +// +// The acceleration library aims to support all TFLite delegates based on +// configuration expressed as data (flatbuffers). However, consumers tend to +// care about size and also use a subset of delegates. Hence we don't want to +// statically build against all delegates. +// +// This interface allows plugins to handle specific delegates. +// +// Goal of this interface is not to abstract away all the differences between +// delegates. The goal is only to avoid static linking. +// +// Note to implementers: this interface may change if new delegates don't fit +// into the same design. +namespace tflite { +namespace delegates { + +// Same w/ Interpreter::TfLiteDelegatePtr to avoid pulling +// tensorflow/lite/interpreter.h dependency +using TfLiteDelegatePtr = + std::unique_ptr; + +class DelegatePluginInterface { + public: + virtual TfLiteDelegatePtr Create() = 0; + virtual int GetDelegateErrno(TfLiteDelegate* from_delegate) = 0; + virtual ~DelegatePluginInterface() = default; +}; + +// A stripped-down registry that allows delegate plugins to be created by name. +// +// Limitations: +// - Doesn't allow deregistration. +// - Doesn't check for duplication registration. +// +class DelegatePluginRegistry { + public: + typedef std::function( + const TFLiteSettings&)> + CreatorFunction; + // Returns a DelegatePluginInterface registered with `name` or nullptr if no + // matching plugin found. + // TFLiteSettings is per-plugin, so that the corresponding delegate options + // data lifetime is maintained. + static std::unique_ptr CreateByName( + const std::string& name, const TFLiteSettings& settings); + + // Struct to be statically allocated for registration. 
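+  //
+  // Illustrative usage sketch (the plugin name "MyPlugin" is hypothetical):
+  //   in the plugin translation unit:
+  //     TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(MyPlugin, MyPlugin::New);
+  //   in client code, given `const TFLiteSettings& settings`:
+  //     auto plugin = DelegatePluginRegistry::CreateByName("MyPlugin", settings);
+  //     if (plugin) { TfLiteDelegatePtr delegate = plugin->Create(); }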
+ struct Register { + Register(const std::string& name, CreatorFunction creator_function); + }; + + private: + void RegisterImpl(const std::string& name, CreatorFunction creator_function); + std::unique_ptr CreateImpl( + const std::string& name, const TFLiteSettings& settings); + static DelegatePluginRegistry* GetSingleton(); + std::unordered_map factories_; + absl::Mutex mutex_; +}; + +} // namespace delegates +} // namespace tflite + +#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f) \ + static auto* g_delegate_plugin_##name##_ = \ + new DelegatePluginRegistry::Register(#name, f); +#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(name, f) \ + TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f); + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ diff --git a/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc new file mode 100644 index 00000000000..25b8171c5ea --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/gpu/delegate.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +namespace tflite { +namespace delegates { +class GpuPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { + return TfLiteDelegatePtr(TfLiteGpuDelegateV2Create(&options_), + TfLiteGpuDelegateV2Delete); + } + int GetDelegateErrno(TfLiteDelegate* from_delegate) override { return 0; } + static std::unique_ptr New( + const TFLiteSettings& acceleration) { + return absl::make_unique(acceleration); + } + explicit GpuPlugin(const TFLiteSettings& tflite_settings) + : options_(TfLiteGpuDelegateOptionsV2Default()) { + const auto* gpu_settings = tflite_settings.gpu_settings(); + if (gpu_settings) { + options_.inference_priority1 = + gpu_settings->is_precision_loss_allowed() + ? 
TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY + : TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; + if (gpu_settings->enable_quantized_inference()) { + options_.experimental_flags |= + TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + } + if (gpu_settings->force_backend() == GPUBackend_OPENCL) { + options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY; + } else if (gpu_settings->force_backend() == GPUBackend_OPENGL) { + options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY; + } + } + } + + private: + TfLiteGpuDelegateOptionsV2 options_; +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(GpuPlugin, GpuPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc new file mode 100644 index 00000000000..7f2674604b0 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +#if defined(__ARM_ARCH) +#include "tensorflow/lite/delegates/hexagon/hexagon_delegate.h" +#endif + +namespace tflite { +namespace delegates { +class HexagonPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { +#if defined(__ARM_ARCH) + TfLiteHexagonInit(); + auto* delegate_ptr = TfLiteHexagonDelegateCreate(&options_); + TfLiteDelegatePtr delegate(delegate_ptr, [](TfLiteDelegate* delegate) { + TfLiteHexagonDelegateDelete(delegate); + TfLiteHexagonTearDown(); + }); + return delegate; +#else // !defined(__ARM_ARCH) + return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); +#endif // defined(__ARM_ARCH) + } + int GetDelegateErrno(TfLiteDelegate* /* from_delegate */) override { + return 0; + } + static std::unique_ptr New( + const TFLiteSettings& tflite_settings) { + return absl::make_unique(tflite_settings); + } + explicit HexagonPlugin(const TFLiteSettings& tflite_settings) { + const HexagonSettings* settings = tflite_settings.hexagon_settings(); +#if defined(__ARM_ARCH) + options_ = TfLiteHexagonDelegateOptions({0}); + if (settings) { + options_.debug_level = settings->debug_level(); + options_.powersave_level = settings->powersave_level(); + options_.print_graph_profile = settings->print_graph_profile(); + options_.print_graph_debug = settings->print_graph_debug(); + } +#else + (void)settings; +#endif + } + + private: +#if defined(__ARM_ARCH) + TfLiteHexagonDelegateOptions options_; +#endif +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(HexagonPlugin, HexagonPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git 
a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc new file mode 100644 index 00000000000..7301983a815 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +namespace tflite { +namespace delegates { + +inline tflite::StatefulNnApiDelegate::Options::ExecutionPreference +ConvertExecutionPrefence( + NNAPIExecutionPreference from_compatibility_preference) { + using TflitePreference = + tflite::StatefulNnApiDelegate::Options::ExecutionPreference; + switch (from_compatibility_preference) { + case NNAPIExecutionPreference_NNAPI_LOW_POWER: + return TflitePreference::kLowPower; + case NNAPIExecutionPreference_NNAPI_FAST_SINGLE_ANSWER: + return TflitePreference::kFastSingleAnswer; + case NNAPIExecutionPreference_NNAPI_SUSTAINED_SPEED: + return TflitePreference::kSustainedSpeed; + default: + return TflitePreference::kUndefined; + } +} + +class NnapiPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { + auto nnapi_delegate = + absl::make_unique(options_); + return TfLiteDelegatePtr( + nnapi_delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + } + int GetDelegateErrno(TfLiteDelegate* from_delegate) override { + auto nnapi_delegate = + reinterpret_cast(from_delegate); + return nnapi_delegate->GetNnApiErrno(); + } + static std::unique_ptr New( + const TFLiteSettings& tflite_settings) { + return absl::make_unique(tflite_settings); + } + explicit NnapiPlugin(const TFLiteSettings& tflite_settings) { + const NNAPISettings* nnapi_settings = tflite_settings.nnapi_settings(); + if (!nnapi_settings) return; + if (nnapi_settings->accelerator_name() && + nnapi_settings->accelerator_name()->Length() != 0) { + accelerator_ = nnapi_settings->accelerator_name()->str(); + options_.accelerator_name = accelerator_.c_str(); + } + if (nnapi_settings->cache_directory() && + nnapi_settings->cache_directory()->Length() != 0) { + cache_dir_ = nnapi_settings->cache_directory()->str(); + options_.cache_dir = cache_dir_.c_str(); + } + if (nnapi_settings->model_token() && + nnapi_settings->model_token()->Length() != 0) { + model_token_ = nnapi_settings->model_token()->str(); + options_.model_token = model_token_.c_str(); + } + options_.execution_preference = + ConvertExecutionPrefence(nnapi_settings->execution_preference()); + options_.disallow_nnapi_cpu = + !nnapi_settings->allow_nnapi_cpu_on_android_10_plus(); + } + + private: + std::string 
accelerator_, cache_dir_, model_token_; + tflite::StatefulNnApiDelegate::Options options_; +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(NnapiPlugin, NnapiPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc new file mode 100644 index 00000000000..4f9f5dd08c1 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/test_util.h" + +// Tests for checking that the NNAPI Delegate plugin correctly handles all the +// options from the flatbuffer. +// +// Checking done at NNAPI call level, as that is where we have a mockable +// layer. +namespace tflite { +namespace { + +using delegate::nnapi::NnApiMock; + +class SingleAddOpModel : tflite::SingleOpModel { + public: + void Build() { + int input = AddInput({tflite::TensorType_FLOAT32, {1, 2, 2}}); + int constant = AddConstInput({tflite::TensorType_FLOAT32, {1, 2, 2}}, + {1.0f, 1.0f, 1.0f, 1.0f}); + AddOutput({tflite::TensorType_FLOAT32, {}}); + + SetBuiltinOp(tflite::BuiltinOperator_ADD, tflite::BuiltinOptions_AddOptions, + tflite::CreateAddOptions(builder_).Union()); + BuildInterpreter({GetShape(input), GetShape(constant)}); + } + + tflite::Interpreter* Interpreter() const { return interpreter_.get(); } +}; + +class NNAPIPluginTest : public ::testing::Test { + protected: + NNAPIPluginTest() : delegate_(nullptr, [](TfLiteDelegate*) {}) {} + void SetUp() override { + nnapi_ = const_cast(NnApiImplementation()); + nnapi_mock_ = absl::make_unique(nnapi_); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + return 0; + }; + model_.Build(); + } + template + void CheckExecutionPreference() { + // Note - this uses a template since the NNAPI functions are C function + // pointers rather than lambdas so can't capture variables. 
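+    // `input` is the NNAPIExecutionPreference stored in the settings;
+    // `output` is the NNAPI constant the delegate is expected to pass to
+    // ANeuralNetworksCompilation_setPreference. The mock below returns
+    // `preference - output`, i.e. success (0) only when the two match.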
+ nnapi_->ANeuralNetworksCompilation_setPreference = + [](ANeuralNetworksCompilation* compilation, int32_t preference) { + return preference - output; + }; + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, input)); + // Since delegation succeeds, the model becomes immutable and hence can't + // reuse it. + SingleAddOpModel model; + model.Build(); + EXPECT_EQ(model.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk) + << " given input: " << input << " expected output: " << output; + } + + void CreateDelegate(flatbuffers::Offset settings) { + settings_ = flatbuffers::GetTemporaryPointer( + fbb_, CreateTFLiteSettings(fbb_, tflite::Delegate_NNAPI, settings)); + + plugin_ = delegates::DelegatePluginRegistry::CreateByName("NnapiPlugin", + *settings_); + delegate_ = plugin_->Create(); + } + + NnApi* nnapi_; + std::unique_ptr nnapi_mock_; + SingleAddOpModel model_; + flatbuffers::FlatBufferBuilder fbb_; + const TFLiteSettings* settings_ = nullptr; + delegates::TfLiteDelegatePtr delegate_; + std::unique_ptr plugin_; +}; + +TEST_F(NNAPIPluginTest, PassesAcceleratorName) { + // Fails with non-existent "foo". + CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("foo"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteDelegateError); + + // Succeeds with "test-device" supported by the mock. + CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("test-device"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesExecutionPreference) { + CheckExecutionPreference(); + CheckExecutionPreference(); + CheckExecutionPreference(); + CheckExecutionPreference(); +} + +TEST_F(NNAPIPluginTest, PassesCachingParameters) { + nnapi_->ANeuralNetworksCompilation_setCaching = + [](ANeuralNetworksCompilation* compilation, const char* cacheDir, + const uint8_t* token) -> int { + if (std::string(cacheDir) != "d") return 1; + // Token is hashed with other bits, just check that it's not empty. + if (std::string(reinterpret_cast(token)).empty()) return 2; + return 0; + }; + CreateDelegate(CreateNNAPISettings(fbb_, 0, fbb_.CreateString("d"), + fbb_.CreateString("t"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesFalseNNAPICpuFlag) { + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, + NNAPIExecutionPreference_UNDEFINED, 0, 0, + /* allow CPU */ false)); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + // Since no CPU, should only pass one device. + return numDevices - 1; + }; + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesTrueNNAPICpuFlag) { + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, + NNAPIExecutionPreference_UNDEFINED, 0, 0, + /* allow CPU */ true)); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + // With CPU allowed, should pass two devices. 
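+      // Returning `numDevices - 2` yields 0 (success) only when exactly two
+      // devices - the accelerator and the nnapi-reference CPU - are passed in.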
+ return numDevices - 2; + }; + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc new file mode 100644 index 00000000000..709bb70ca70 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h" + +#include + +#include "flatbuffers/idl.h" // from @flatbuffers +#include "flatbuffers/util.h" // from @flatbuffers +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/lite/minimal_logging.h" + +namespace tflite { + +namespace { +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_fbs_contents-inl.h" +} + +const ComputeSettings* ConvertFromProto( + flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings) { + std::string json; + tensorflow::protobuf::util::JsonPrintOptions options; + options.preserve_proto_field_names = true; + options.always_print_primitive_fields = true; // For catching problems. + auto status = tensorflow::protobuf::util::MessageToJsonString(proto_settings, + &json, options); + if (!status.ok()) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to convert to Json: %s", + status.ToString().c_str()); + return nullptr; + } + if (!parser->Parse(configuration_fbs_contents)) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse schema: %s", + parser->error_.c_str()); + return nullptr; + } + parser->SetRootType("tflite.ComputeSettings"); + if (!parser->Parse(json.c_str())) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse json: %s", + parser->error_.c_str()); + return nullptr; + } + return flatbuffers::GetRoot( + parser->builder_.GetBufferPointer()); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h new file mode 100644 index 00000000000..3b69e8465a5 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ + +#include "flatbuffers/idl.h" // from @flatbuffers +#include "tensorflow/lite/experimental/acceleration/configuration/configuration.pb.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" + +namespace tflite { + +// Converts the protobuf version ComputeSettings to the flatbuffer version, via +// json. The parser is used for state - the returned pointer is valid only as +// long as the parser is kept alive and unmutated. +const ComputeSettings* ConvertFromProto( + flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ From af52bd27dce7a568ff111b87f922582f6dbeff7e Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 15 Jun 2020 06:25:37 -0700 Subject: [PATCH 0158/1390] Update ::OperandAdaptor to ::Adaptor The Adaptor can refer to attributes too, so update naming. Follow up from https://reviews.llvm.org/D81741 PiperOrigin-RevId: 316451631 Change-Id: If2882e8ef2e75f70ae2c9193b4e8286ab3b0326f --- .../compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc | 4 ++-- .../compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 8846d7918c7..99d2c08aa98 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -35,7 +35,7 @@ struct StaticMemRefCastOpConverter auto loc = op->getLoc(); auto cast_op = cast(op); - StaticMemRefCastOp::OperandAdaptor operands_adaptor(operands); + StaticMemRefCastOp::Adaptor operands_adaptor(operands); MemRefDescriptor sourceMemRef(operands_adaptor.operand()); MemRefType targetMemRefType = @@ -86,7 +86,7 @@ struct DynamicMemRefCastOpConverter auto loc = op->getLoc(); auto cast_op = cast(op); - DynamicMemRefCastOp::OperandAdaptor operands_adaptor(operands); + DynamicMemRefCastOp::Adaptor operands_adaptor(operands); MemRefDescriptor sourceMemRef(operands_adaptor.operand()); MemRefType targetMemRefType = diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 0e4842537ef..ad78a01100b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -415,7 +415,7 @@ class LhloBroadcastInDimConverter LogicalResult matchAndRewrite( xla_lhlo::BroadcastInDimOp op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { - xla_lhlo::BroadcastInDimOp::OperandAdaptor operand_adaptor(args); + xla_lhlo::BroadcastInDimOp::Adaptor operand_adaptor(args); auto result_type = operand_adaptor.output().getType().cast(); auto result_shape = result_type.getShape(); @@ -476,7 +476,7 @@ class LhloBroadcastInDimConverter std::pair> InsertReshapeIfNecessary( xla_lhlo::BroadcastInDimOp op, ArrayRef args, ConversionPatternRewriter& rewriter) const { - 
xla_lhlo::BroadcastInDimOp::OperandAdaptor operand_adaptor(args); + xla_lhlo::BroadcastInDimOp::Adaptor operand_adaptor(args); Value operand = operand_adaptor.operand(); auto operand_type = operand_adaptor.operand().getType().cast(); auto operand_shape = operand_type.getShape(); From fb173c22bd9e35df96828ee2a6273506902d456d Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 15 Jun 2020 15:51:55 +0200 Subject: [PATCH 0159/1390] Fix docstring format of tf.executing_eagerly --- tensorflow/python/eager/context.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index b01f0795c72..4560c3b634e 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1877,9 +1877,8 @@ def executing_eagerly(): True False - Inside `tf.function` after + Inside `tf.function` after `tf.config.run_functions_eagerly(True)` is called: - `tf.config.run_functions_eagerly(True)` is called: >>> tf.config.run_functions_eagerly(True) >>> @tf.function ... def fn(): From 08311077edd04cb9fb90939bfb0e5caf3059ccb2 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 15 Jun 2020 15:57:20 +0200 Subject: [PATCH 0160/1390] Copy reverse_sequence docstring to _v2 to remove deprecation notices --- tensorflow/python/ops/array_ops.py | 41 +++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c77977bf7d2..6ca0eb975ac 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -4535,6 +4535,46 @@ def reverse_sequence_v2(input, seq_axis=None, batch_axis=None, name=None): + """Reverses variable length slices. + + This op first slices `input` along the dimension `batch_axis`, and for + each slice `i`, reverses the first `seq_lengths[i]` elements along the + dimension `seq_axis`. + + The elements of `seq_lengths` must obey `seq_lengths[i] <= + input.dims[seq_axis]`, and `seq_lengths` must be a vector of length + `input.dims[batch_axis]`. + + The output slice `i` along dimension `batch_axis` is then given by + input slice `i`, with the first `seq_lengths[i]` slices along + dimension `seq_axis` reversed. + + Example usage: + + >>> seq_lengths = [7, 2, 3, 5] + >>> input = [[1, 2, 3, 4, 5, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0], + ... [1, 2, 3, 4, 0, 0, 0, 0], [1, 2, 3, 4, 5, 6, 7, 8]] + >>> output = tf.reverse_sequence(input, seq_lengths, seq_axis=1, batch_axis=0) + >>> output + + + Args: + input: A `Tensor`. The input to reverse. + seq_lengths: A `Tensor`. Must be one of the following types: `int32`, + `int64`. 1-D with length `input.dims(batch_axis)` and `max(seq_lengths) <= + input.dims(seq_axis)` + seq_axis: An `int`. The dimension which is partially reversed. + batch_axis: An optional `int`. Defaults to `0`. The dimension along which + reversal is performed. + name: A name for the operation (optional). + + Returns: + A Tensor. Has the same type as input. 
+ """ return gen_array_ops.reverse_sequence( input=input, seq_lengths=seq_lengths, @@ -4542,7 +4582,6 @@ def reverse_sequence_v2(input, batch_dim=batch_axis, name=name) -reverse_sequence_v2.__doc__ = reverse_sequence.__doc__ # pylint: enable=redefined-builtin From eabae7b8e94da529e9da72bbacd534904c5e4b79 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Mon, 15 Jun 2020 07:00:00 -0700 Subject: [PATCH 0161/1390] Set async as default for TFRT runtime PiperOrigin-RevId: 316456716 Change-Id: Ifdffb129c4609108285892a7ead5758e67802641 --- tensorflow/python/eager/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index b01f0795c72..68234985d15 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -427,7 +427,7 @@ class Context(object): raise ValueError( "execution_mode should be None/SYNC/ASYNC. Got %s" % execution_mode) if execution_mode is None: - execution_mode = SYNC + execution_mode = ASYNC if is_tfrt_enabled() else SYNC self._default_is_async = execution_mode == ASYNC self._lazy_remote_inputs_copy = None self._use_tfrt = is_tfrt_enabled() From 83b4360a3fd7955bf807e12c05734c388b67c2f2 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Mon, 15 Jun 2020 07:19:59 -0700 Subject: [PATCH 0162/1390] Add shape constraints to CHLO->HLO lowering. PiperOrigin-RevId: 316459663 Change-Id: Ifff45b67a039c5a8e7cf8fa1bedf187c33900091 --- .../chlo_legalize_to_hlo_broadcasts.mlir | 52 ++++++---- .../tests/legalize-tf-binary-elementwise.mlir | 99 +++++++++---------- .../xla/transforms/chlo_legalize_to_hlo.cc | 19 +++- 3 files changed, 95 insertions(+), 75 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir index 107a668c0a7..65285021fd4 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -test-xla-chlo-legalize-to-hlo -split-input-file -verify-diagnostics %s -o - | FileCheck %s +// RUN: xla-opt -test-xla-chlo-legalize-to-hlo -cse -split-input-file -verify-diagnostics %s -o - | FileCheck %s // Check the non-broadcast case for each registered op, then just check a // representative op for detailed broadcast semantics. 
@@ -14,14 +14,18 @@ func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor< // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] - // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] - // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT:.+]] = xla_hlo.add %[[ARG0_B]], %[[ARG1_B]] - // CHECK: return %[[RESULT]] : tensor + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] + // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK-NEXT: %[[RESULT:.+]] = xla_hlo.add %[[ARG0_B]], %[[ARG1_B]] + // CHECK-NEXT: shape.assuming_yield %[[RESULT]] + // CHECK-NEXT: } + // CHECK-NEXT: return %[[FINAL_RESULT]] : tensor %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor return %0 : tensor } @@ -31,14 +35,18 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { - // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] - // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] - // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> - // CHECK: return %[[RESULT]] : tensor> + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] + // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] + // CHECK-NEXT: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: 
%[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[RESULT:.+]] = "xla_hlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> + // CHECK-NEXT: shape.assuming_yield %[[RESULT]] + // CHECK-NEXT: } + // CHECK-NEXT: return %[[FINAL_RESULT]] : tensor> %0 = xla_chlo.broadcast_complex %arg0, %arg1 : (tensor, tensor) -> tensor> return %0 : tensor> } @@ -50,12 +58,16 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] + // CHECK: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] + // CHECK: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor - // CHECK: return %[[RESULT]] : tensor + // CHECK: %[[RESULT:.+]] = "xla_hlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + // CHECK: shape.assuming_yield %[[RESULT]] + // CHECK-NEXT: } + // CHECK: return %[[FINAL_RESULT]] : tensor %0 = xla_chlo.broadcast_compare %arg0, %arg1 {comparison_direction = "EQ"} : (tensor, tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir index 2153258993a..3d270a52f48 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -1,7 +1,7 @@ // Note that binary elementwise tests are run with chlo legalization enabled // (unlike the rest), since this is the primary use case for such ops and // verification of shapes and broadcasts is desired. -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" %s | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -canonicalize %s | FileCheck %s //===----------------------------------------------------------------------===// // Binary op legalizations. @@ -24,13 +24,8 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { // patterns unambiguous and more interesting (once broadcastable trait is // fixed upstream). 
func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] - // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: xla_hlo.add %[[LHS_BCAST]], %arg1 %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } @@ -39,26 +34,26 @@ func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2x // TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream // broadcastable bug is fixed (helps make the CHECK matching unambiguous) func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] - // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] - // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} - // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-NEXT: xla_hlo.add %[[LHS_BCAST]], %arg1 %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0: tensor<4x4x4x4xi32> } // CHECK-LABEL: func @add_dynamic func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 - // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK: xla_hlo.add %4, %5 : tensor + // CHECK-DAG: %[[CSTR_LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[CSTR_RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[CSTR_LHS_SHAPE]], %[[CSTR_RHS_SHAPE]] + // CHECK-NEXT: shape.assuming %[[WITNESS:.+]] + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-NEXT: 
%[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK-NEXT: %[[RESULT:.+]] = xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] : tensor + // CHECK-NEXT: shape.assuming_yield %[[RESULT]] %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -80,21 +75,21 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-LABEL: func @div_unranked func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { - // CHECK: tf.Div + // CHECK-NEXT: tf.Div %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor return %0: tensor } // CHECK-LABEL: func @maximum func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> + // CHECK-NEXT: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } // CHECK-LABEL: func @minimum func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> + // CHECK-NEXT: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -200,26 +195,25 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @equal_dynamic func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] - // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] + // CHECK-NEXT: shape.assuming %[[WITNESS]] -> (tensor) { + // CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 + // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE1]], %[[RHS_SHAPE]]) + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-NEXT: %[[RESULT:.+]] = "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + // CHECK-NEXT: shape.assuming_yield %[[RESULT]] %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0: tensor } // CHECK-LABEL: func @equal_broadcast func @equal_broadcast(%arg0: 
tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %arg1) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } @@ -281,26 +275,25 @@ func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @broadcast_greater func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %arg1) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } // CHECK-LABEL: func @greater_dynamic func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 - // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) - // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] + // CHECK-NEXT: shape.assuming %[[WITNESS]] + // CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE1:.+]] = shape.shape_of %arg1 + // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE1]], %[[RHS_SHAPE1]]) + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // 
CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc index e5a79616d5b..97afa9617c4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -112,6 +112,19 @@ struct ConvertRankedDynamicBroadcastBinaryOp // Compute result shape. auto loc = op.getLoc(); + + // Insert a constraint on the shapes being broadcastable and insert all + // future code into an assuming block reliant on the constraint. + Value lhs_shape = rewriter.create(loc, lhs); + Value rhs_shape = rewriter.create(loc, rhs); + auto broadcastable_cstr = + rewriter.create(loc, lhs_shape, rhs_shape); + auto assuming_op = rewriter.create( + loc, ArrayRef{result_type}, broadcastable_cstr.result()); + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.createBlock(&assuming_op.doRegion()); + int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); Value result_extents = xla::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs, @@ -140,8 +153,10 @@ struct ConvertRankedDynamicBroadcastBinaryOp rewriter.getI64TensorAttr(rhs_broadcast_dimensions)); // And generate the final non-broadcasted binary op. - rewriter.replaceOp(op, {Adaptor::CreateOp(op, result_type, broadcasted_lhs, - broadcasted_rhs, rewriter)}); + Value final_result = Adaptor::CreateOp(op, result_type, broadcasted_lhs, + broadcasted_rhs, rewriter); + rewriter.create(loc, final_result); + rewriter.replaceOp(op, {assuming_op.getResult(0)}); return success(); } }; From 1a0909a9f44cd60419aa9ffa0c5710957c83901d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 07:50:42 -0700 Subject: [PATCH 0163/1390] Add shape constraints to CHLO->HLO lowering. PiperOrigin-RevId: 316464141 Change-Id: If31be3a8f1644335897feb7c693eed02ce52f029 --- .../chlo_legalize_to_hlo_broadcasts.mlir | 52 ++++------ .../tests/legalize-tf-binary-elementwise.mlir | 99 ++++++++++--------- .../xla/transforms/chlo_legalize_to_hlo.cc | 19 +--- 3 files changed, 75 insertions(+), 95 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir index 65285021fd4..107a668c0a7 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -test-xla-chlo-legalize-to-hlo -cse -split-input-file -verify-diagnostics %s -o - | FileCheck %s +// RUN: xla-opt -test-xla-chlo-legalize-to-hlo -split-input-file -verify-diagnostics %s -o - | FileCheck %s // Check the non-broadcast case for each registered op, then just check a // representative op for detailed broadcast semantics. 
@@ -14,18 +14,14 @@ func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor< // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] - // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] - // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] - // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] - // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK-NEXT: %[[RESULT:.+]] = xla_hlo.add %[[ARG0_B]], %[[ARG1_B]] - // CHECK-NEXT: shape.assuming_yield %[[RESULT]] - // CHECK-NEXT: } - // CHECK-NEXT: return %[[FINAL_RESULT]] : tensor + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK-DAG: %[[RESULT:.+]] = xla_hlo.add %[[ARG0_B]], %[[ARG1_B]] + // CHECK: return %[[RESULT]] : tensor %0 = xla_chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor return %0 : tensor } @@ -35,18 +31,14 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { - // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] - // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] - // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] - // CHECK-NEXT: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] - // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK-NEXT: %[[RESULT:.+]] = "xla_hlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> - // CHECK-NEXT: shape.assuming_yield %[[RESULT]] - // CHECK-NEXT: } - // CHECK-NEXT: return %[[FINAL_RESULT]] : tensor> + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = 
dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> + // CHECK: return %[[RESULT]] : tensor> %0 = xla_chlo.broadcast_complex %arg0, %arg1 : (tensor, tensor) -> tensor> return %0 : tensor> } @@ -58,16 +50,12 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] - // CHECK: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] - // CHECK: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) + // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor - // CHECK: %[[RESULT:.+]] = "xla_hlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor - // CHECK: shape.assuming_yield %[[RESULT]] - // CHECK-NEXT: } - // CHECK: return %[[FINAL_RESULT]] : tensor + // CHECK-DAG: %[[RESULT:.+]] = "xla_hlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + // CHECK: return %[[RESULT]] : tensor %0 = xla_chlo.broadcast_compare %arg0, %arg1 {comparison_direction = "EQ"} : (tensor, tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir index 3d270a52f48..2153258993a 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -1,7 +1,7 @@ // Note that binary elementwise tests are run with chlo legalization enabled // (unlike the rest), since this is the primary use case for such ops and // verification of shapes and broadcasts is desired. -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -canonicalize %s | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" %s | FileCheck %s //===----------------------------------------------------------------------===// // Binary op legalizations. @@ -24,8 +24,13 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { // patterns unambiguous and more interesting (once broadcastable trait is // fixed upstream). 
func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-NEXT: xla_hlo.add %[[LHS_BCAST]], %arg1 + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } @@ -34,26 +39,26 @@ func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2x // TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream // broadcastable bug is fixed (helps make the CHECK matching unambiguous) func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} - // CHECK-NEXT: xla_hlo.add %[[LHS_BCAST]], %arg1 + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0: tensor<4x4x4x4xi32> } // CHECK-LABEL: func @add_dynamic func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[CSTR_LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[CSTR_RHS_SHAPE:.+]] = shape.shape_of %arg1 - // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[CSTR_LHS_SHAPE]], %[[CSTR_RHS_SHAPE]] - // CHECK-NEXT: shape.assuming %[[WITNESS:.+]] - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 - // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-NEXT: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} - // CHECK-NEXT: %[[RESULT:.+]] = xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] : tensor - // CHECK-NEXT: shape.assuming_yield %[[RESULT]] + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: 
%[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %4, %5 : tensor %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -75,21 +80,21 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-LABEL: func @div_unranked func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { - // CHECK-NEXT: tf.Div + // CHECK: tf.Div %0 = "tf.Div"(%arg0, %arg1) : (tensor<*xi32>, tensor) -> tensor return %0: tensor } // CHECK-LABEL: func @maximum func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK-NEXT: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> + // CHECK: xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } // CHECK-LABEL: func @minimum func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK-NEXT: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> + // CHECK: xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> %0 = "tf.Minimum"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -195,25 +200,26 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @equal_dynamic func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] - // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] - // CHECK-NEXT: shape.assuming %[[WITNESS]] -> (tensor) { - // CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 - // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE1]], %[[RHS_SHAPE]]) - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-NEXT: %[[RESULT:.+]] = "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} - // CHECK-NEXT: shape.assuming_yield %[[RESULT]] + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0: tensor } // CHECK-LABEL: func @equal_broadcast func @equal_broadcast(%arg0: 
tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %arg1) {comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } @@ -275,25 +281,26 @@ func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @broadcast_greater func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %arg1) {comparison_direction = "GT"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } // CHECK-LABEL: func @greater_dynamic func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 - // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] - // CHECK-NEXT: shape.assuming %[[WITNESS]] - // CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 - // CHECK-DAG: %[[RHS_SHAPE1:.+]] = shape.shape_of %arg1 - // CHECK-NEXT: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE1]], %[[RHS_SHAPE1]]) - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] - // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-NEXT: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // 
CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc index 97afa9617c4..e5a79616d5b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -112,19 +112,6 @@ struct ConvertRankedDynamicBroadcastBinaryOp // Compute result shape. auto loc = op.getLoc(); - - // Insert a constraint on the shapes being broadcastable and insert all - // future code into an assuming block reliant on the constraint. - Value lhs_shape = rewriter.create(loc, lhs); - Value rhs_shape = rewriter.create(loc, rhs); - auto broadcastable_cstr = - rewriter.create(loc, lhs_shape, rhs_shape); - auto assuming_op = rewriter.create( - loc, ArrayRef{result_type}, broadcastable_cstr.result()); - - OpBuilder::InsertionGuard guard(rewriter); - rewriter.createBlock(&assuming_op.doRegion()); - int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); Value result_extents = xla::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs, @@ -153,10 +140,8 @@ struct ConvertRankedDynamicBroadcastBinaryOp rewriter.getI64TensorAttr(rhs_broadcast_dimensions)); // And generate the final non-broadcasted binary op. - Value final_result = Adaptor::CreateOp(op, result_type, broadcasted_lhs, - broadcasted_rhs, rewriter); - rewriter.create(loc, final_result); - rewriter.replaceOp(op, {assuming_op.getResult(0)}); + rewriter.replaceOp(op, {Adaptor::CreateOp(op, result_type, broadcasted_lhs, + broadcasted_rhs, rewriter)}); return success(); } }; From 06b1b45e42aefe9dbfbedeca8abff47c20b28a21 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 15 Jun 2020 08:47:31 -0700 Subject: [PATCH 0164/1390] [tfdbg2] A few fixes and improvements to example debug_mnist_v2 1. Change the default `--dump_tensor_debug_mode` flag value to `FULL_HEALTH`, a mode more suitable for numerical instability debugging for the particular bug in this example than the previous default value `NO_TENSOR`. 2. Change the default `--dump_circular_buffer_size` value to -1, to accommodate the possibility of longer runs where user would want to see the debug data in its entirety. 3. Rename a few weight variables. They were previously named in a confusing way. 4. Change "logits" to "probs", as they are generated by a `softmax` operation and hence are more accurately described as probability scores. 
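For readers who want to reproduce the FULL_HEALTH behaviour outside this example script, a minimal sketch using the public tfdbg2 entry point is shown below; the dump directory is a placeholder, and the two keyword values simply mirror the new flag defaults described above rather than anything required by this patch:

    import tensorflow as tf

    # "/tmp/tfdbg2_logdir" is an arbitrary placeholder path.
    # FULL_HEALTH and circular_buffer_size=-1 mirror the new defaults of
    # --dump_tensor_debug_mode and --dump_circular_buffer_size; a value <= 0
    # disables the circular-buffer behavior so all events are kept.
    tf.debugging.experimental.enable_dump_debug_info(
        "/tmp/tfdbg2_logdir",
        tensor_debug_mode="FULL_HEALTH",
        circular_buffer_size=-1)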
PiperOrigin-RevId: 316473164 Change-Id: I4eb13f9581a4d4e550b3b3a5cd132eeffc7dd043 --- .../debug/examples/v2/debug_mnist_v2.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/debug/examples/v2/debug_mnist_v2.py b/tensorflow/python/debug/examples/v2/debug_mnist_v2.py index 539be3cd54f..f00e54500fa 100644 --- a/tensorflow/python/debug/examples/v2/debug_mnist_v2.py +++ b/tensorflow/python/debug/examples/v2/debug_mnist_v2.py @@ -98,16 +98,18 @@ def parse_args(): parser.add_argument( "--dump_tensor_debug_mode", type=str, - default="NO_TENSOR", + default="FULL_HEALTH", help="Mode for dumping tensor values. Options: NO_TENSOR, CURT_HEALTH, " - "CONCISE_HEALTH, SHAPE, FULL_TENSOR. This is relevant only when " + "CONCISE_HEALTH, SHAPE, FULL_HEALTH. This is relevant only when " "--dump_dir is set.") # TODO(cais): Add more tensor debug mode strings once they are supported. parser.add_argument( "--dump_circular_buffer_size", type=int, - default=1000, + default=-1, help="Size of the circular buffer used to dump execution events. " + "A value <= 0 disables the circular-buffer behavior and causes " + "all instrumented tensor values to be dumped. " "This is relevant only when --dump_dir is set.") parser.add_argument( "--use_random_config_path", @@ -178,9 +180,9 @@ def main(_): return activations # init model - hidden = get_dense_weights(IMAGE_SIZE**2, HIDDEN_SIZE) - logits = get_dense_weights(HIDDEN_SIZE, NUM_LABELS) - variables = hidden + logits + hidden_weights = get_dense_weights(IMAGE_SIZE**2, HIDDEN_SIZE) + output_weights = get_dense_weights(HIDDEN_SIZE, NUM_LABELS) + variables = hidden_weights + output_weights @tf.function def model(x): @@ -193,15 +195,25 @@ def main(_): Returns: A (?, 10) tensor containing the class scores for each example. """ - hidden_act = dense_layer(hidden, x) - logits_act = dense_layer(logits, hidden_act, tf.identity) + hidden_act = dense_layer(hidden_weights, x) + logits_act = dense_layer(output_weights, hidden_act, tf.identity) y = tf.nn.softmax(logits_act) return y @tf.function - def loss(logits, labels): - """Calculates cross entropy loss.""" - diff = -(labels * tf.math.log(logits)) + def loss(probs, labels): + """Calculates cross entropy loss. + + Args: + probs: Class probabilities predicted by the model. The shape is expected + to be (?, 10). + labels: Truth labels for the classes, as one-hot encoded vectors. The + shape is expected to be the same as `probs`. + + Returns: + A scalar loss tensor. + """ + diff = -labels * tf.math.log(probs) loss = tf.reduce_mean(diff) return loss From 81601f9bb422d18368f1ded540e68911bcebb7f8 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 15 Jun 2020 08:51:03 -0700 Subject: [PATCH 0165/1390] [tf.data service] Increase default number of uncompress threads to 4. A single thread may not be able to uncompress data as quickly as it is requested. 
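As a minimal standalone sketch of the pattern this change relies on — an ordinary user-level tf.data pipeline with a made-up transform function, not the internal uncompress map itself — giving `map` a `num_parallel_calls` budget lets several elements be transformed concurrently, so a single slow per-element transformation is less likely to starve the consumer:

    import tensorflow as tf

    def expensive_transform(x):
      # Stand-in for per-element work such as decompression; any costly
      # element-wise function illustrates the same point.
      return tf.reduce_sum(tf.square(tf.cast(x, tf.float32)))

    ds = tf.data.Dataset.range(1000)
    # Four worker threads, mirroring the fixed value chosen in this change;
    # tf.data.experimental.AUTOTUNE would be the adaptive alternative.
    ds = ds.map(expensive_transform, num_parallel_calls=4)
    for value in ds.take(2):
      print(value.numpy())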
PiperOrigin-RevId: 316473727 Change-Id: Ic041b70f2d3081d9333f3272f493d52ec3704d91 --- .../data/experimental/ops/data_service_ops.py | 3 ++- .../kernel_tests/data_service_ops_test.py | 23 +++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index 39790d843ba..dd81614fa45 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -241,7 +241,8 @@ def _distribute(processing_mode, # TODO(b/157105111): Make this an autotuned parallel map when we have a way # to limit memory usage. dataset = dataset.map( - lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec)) + lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec), + num_parallel_calls=4) # Disable autosharding for shared jobs. if job_name: diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index d316009ce0c..440a4f46a20 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -201,13 +201,18 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): self._new_worker = server_lib.WorkerServer( port=port, master_address=self._master._address, protocol=PROTOCOL) - # The dataset starts over now that we read from the new worker. - for i in range(num_elements): + # There may have been some elements prefetched from the first worker + # before it was stopped. + while True: + val = next(iterator).numpy() + if val == 0: + break + + # The dataset starts over now that we read from the new worker. + # TODO(b/157086991): Iterate until end of sequence when we support + # detecting lost workers. + for i in range(1, num_elements // 2): val = next(iterator).numpy() - if val == midpoint and i != midpoint: - # There may have been one last element prefetched from the first worker - # before it was stopped. - val = next(iterator).numpy() self.assertEqual(i, val) @combinations.generate(test_base.eager_only_combinations()) @@ -291,7 +296,7 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testSharedJobNameRepeat(self): - num_elements = 10 + num_elements = 100 num_repetitions = 3 master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) @@ -302,9 +307,9 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): results = [] iter1 = iter(ds1) iter2 = iter(ds2) - for _ in range(((num_elements * num_repetitions) // 2) - 1): + for _ in range(((num_elements * num_repetitions) // 3)): results.append(next(iter1).numpy()) - for _ in range(((num_elements * num_repetitions) // 2) - 1): + for _ in range(((num_elements * num_repetitions) // 3)): results.append(next(iter2).numpy()) for elem in iter1: results.append(elem.numpy()) From 7d36cebea4e10f108117d5c5423100cfd8f24509 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 15 Jun 2020 08:59:00 -0700 Subject: [PATCH 0166/1390] Fork the keras related saver_test to keras. 
PiperOrigin-RevId: 316475172 Change-Id: I9b38d7624ed01a724c15ae571c5be1a7f0e049ad --- tensorflow/python/keras/tests/BUILD | 23 +++ tensorflow/python/keras/tests/saver_test.py | 158 ++++++++++++++++++++ tensorflow/python/training/saver_test.py | 121 --------------- 3 files changed, 181 insertions(+), 121 deletions(-) create mode 100644 tensorflow/python/keras/tests/saver_test.py diff --git a/tensorflow/python/keras/tests/BUILD b/tensorflow/python/keras/tests/BUILD index d270e6f638c..4bb7d5358e5 100644 --- a/tensorflow/python/keras/tests/BUILD +++ b/tensorflow/python/keras/tests/BUILD @@ -335,6 +335,29 @@ tf_py_test( ], ) +cuda_py_test( + name = "saver_test", + size = "medium", + srcs = ["saver_test.py"], + python_version = "PY3", + deps = [ + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:nn_grad", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:saver", + "//tensorflow/python:training_lib", + "//tensorflow/python:training_util", + "//tensorflow/python/eager:context", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + "//tensorflow/python/training/tracking", + "//tensorflow/python/training/tracking:util", + ], +) + tf_py_test( name = "temporal_sample_weights_correctness_test", srcs = ["temporal_sample_weights_correctness_test.py"], diff --git a/tensorflow/python/keras/tests/saver_test.py b/tensorflow/python/keras/tests/saver_test.py new file mode 100644 index 00000000000..f425414a932 --- /dev/null +++ b/tensorflow/python/keras/tests/saver_test.py @@ -0,0 +1,158 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +"""Tests for tensorflow.python.training.saver.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops as ops_lib +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.platform import test +from tensorflow.python.training import adam +from tensorflow.python.training import saver as saver_module +from tensorflow.python.training import training_util +from tensorflow.python.training.tracking import tracking as trackable_tracking +from tensorflow.python.training.tracking import util as trackable_utils + + +class NonLayerTrackable(trackable_tracking.AutoTrackable): + + def __init__(self): + super(NonLayerTrackable, self).__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[]) + + +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. + self._non_layer = NonLayerTrackable() + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class TrackableCompatibilityTests(test.TestCase): + + def _initialized_model(self): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = training_util.get_or_create_global_step() + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + train_op = optimizer.minimize( + functools.partial(model, input_value), + global_step=optimizer_step) + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. 
+ self.evaluate(model._named_dense.bias.assign([1.])) + self.evaluate(optimizer.get_slot( + var=model._named_dense.bias, name="m").assign([2.])) + beta1_power, _ = optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(3.)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m") + .assign([102.])) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(103.)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.], self.evaluate(root_trackable.model._named_dense.bias)) + self.assertAllEqual([2.], self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m"))) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.assertAllEqual(3., self.evaluate(beta1_power)) + + def testLoadFromObjectBasedGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + save_graph = ops_lib.Graph() + with save_graph.as_default(), self.session(graph=save_graph) as sess: + root = self._initialized_model() + object_saver = trackable_utils.Checkpoint(root=root) + save_path = object_saver.save(file_prefix=checkpoint_prefix) + + # An incompatible object-based checkpoint to check error messages + var = resource_variable_ops.ResourceVariable(1., name="a") + self.evaluate(var.initializer) + second_saver = trackable_utils.Checkpoint(v=var) + second_path = second_saver.save(file_prefix=os.path.join( + checkpoint_directory, "second")) + + restore_graph = ops_lib.Graph() + with restore_graph.as_default(), self.session( + graph=restore_graph) as sess: + root = self._initialized_model() + self._set_sentinels(root) + saver = saver_module.Saver() + saver.restore(sess=sess, save_path=save_path) + self._check_sentinels(root) + before_second_restore_ops = restore_graph.get_operations() + # Test that multiple restores do not pollute the graph + saver.restore(sess=sess, save_path=save_path) + self.assertEqual(before_second_restore_ops, + restore_graph.get_operations()) + with self.assertRaisesRegexp(errors.NotFoundError, + "Could not find some variables"): + saver.restore(sess=sess, save_path=second_path) + + def testLoadFromObjectBasedEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + save_graph = ops_lib.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + object_saver = trackable_utils.Checkpoint(root=root) + save_path = object_saver.save(file_prefix=checkpoint_prefix) + + with context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + saver = saver_module.Saver( + root.model.variables + root.optimizer.variables()) + saver.restore(sess=None, save_path=save_path) + self._check_sentinels(root) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index 2c8bdadd5d7..5c87be37e4c 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import functools import glob import math import os @@ -48,8 +47,6 @@ from 
tensorflow.python.framework import graph_io from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops as ops_lib from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import training -from tensorflow.python.keras.layers import core from tensorflow.python.lib.io import file_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -74,10 +71,7 @@ from tensorflow.python.training import py_checkpoint_reader from tensorflow.python.training import queue_runner_impl from tensorflow.python.training import saver as saver_module from tensorflow.python.training import saver_test_utils -from tensorflow.python.training import training_util from tensorflow.python.training.tracking import base as trackable_base -from tensorflow.python.training.tracking import tracking as trackable_tracking -from tensorflow.python.training.tracking import util as trackable_utils from tensorflow.python.util import compat @@ -3024,29 +3018,6 @@ class _OwnsMirroredVariables(trackable_base.Trackable): return self.non_dep_variable.name -class NonLayerTrackable(trackable_tracking.AutoTrackable): - - def __init__(self): - super(NonLayerTrackable, self).__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) - - -class MyModel(training.Model): - """A concrete Model for testing.""" - - def __init__(self): - super(MyModel, self).__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. - self._non_layer = NonLayerTrackable() - - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret - - class TrackableCompatibilityTests(test.TestCase): # TODO(allenl): Track down python3 reference cycles in these tests. @@ -3112,46 +3083,6 @@ class TrackableCompatibilityTests(test.TestCase): saver.restore(sess, save_path) self.assertEqual(1, v.eval_count) - def _initialized_model(self): - input_value = constant_op.constant([[3.]]) - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - optimizer_step = training_util.get_or_create_global_step() - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - train_op = optimizer.minimize( - functools.partial(model, input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. 
- self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, name="m").assign([2.])) - beta1_power, _ = optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m") - .assign([102.])) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m"))) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.assertAllEqual(3., self.evaluate(beta1_power)) - def testVariableNotFoundErrorRaised(self): # Restore does some tricky exception handling to figure out if it should # load an object-based checkpoint. Tests that the exception handling isn't @@ -3199,58 +3130,6 @@ class TrackableCompatibilityTests(test.TestCase): "a mismatch between the current graph and the graph"): a_saver.restore(sess=sess, save_path=save_path) - def testLoadFromObjectBasedGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - save_graph = ops_lib.Graph() - with save_graph.as_default(), self.session(graph=save_graph) as sess: - root = self._initialized_model() - object_saver = trackable_utils.Checkpoint(root=root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) - - # An incompatible object-based checkpoint to check error messages - var = resource_variable_ops.ResourceVariable(1., name="a") - self.evaluate(var.initializer) - second_saver = trackable_utils.Checkpoint(v=var) - second_path = second_saver.save(file_prefix=os.path.join( - checkpoint_directory, "second")) - - restore_graph = ops_lib.Graph() - with restore_graph.as_default(), self.session( - graph=restore_graph) as sess: - root = self._initialized_model() - self._set_sentinels(root) - saver = saver_module.Saver() - saver.restore(sess=sess, save_path=save_path) - self._check_sentinels(root) - before_second_restore_ops = restore_graph.get_operations() - # Test that multiple restores do not pollute the graph - saver.restore(sess=sess, save_path=save_path) - self.assertEqual(before_second_restore_ops, - restore_graph.get_operations()) - with self.assertRaisesRegexp(errors.NotFoundError, - "Could not find some variables"): - saver.restore(sess=sess, save_path=second_path) - - def testLoadFromObjectBasedEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - save_graph = ops_lib.Graph() - with save_graph.as_default(), self.session(graph=save_graph): - root = self._initialized_model() - object_saver = trackable_utils.Checkpoint(root=root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) - - with context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - saver = saver_module.Saver( - root.model.variables + root.optimizer.variables()) - saver.restore(sess=None, save_path=save_path) - self._check_sentinels(root) - if __name__ == "__main__": test.main() From 01b30fa03f636396f977a76628123199b772463a Mon Sep 17 00:00:00 2001 From: 
Elena Zhelezina Date: Mon, 15 Jun 2020 17:30:29 +0100 Subject: [PATCH 0167/1390] Fix for the linter. Change-Id: Ie1185cc2cca9157655b22e1d3bb49ddc017a8f0e --- tensorflow/lite/tools/optimize/quantization_utils.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index cdf2743585e..cdc794c20c4 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -92,7 +92,6 @@ void GetSymmetricQuantizationParams( min = std::min(min, 0.0f); max = std::max(max, 0.0f); const float scale = std::max(std::abs(max), std::abs(min)) / half_quant_range; - int64_t zero_point = 0; quantization_params->min = std::vector(1, min); quantization_params->max = std::vector(1, max); quantization_params->scale = std::vector(1, scale); From 16ac7c04d4d8727ae917c20678eb776fb1a4bacb Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 15 Jun 2020 09:35:04 -0700 Subject: [PATCH 0168/1390] Fork keras related tracking test to keras/tests PiperOrigin-RevId: 316482123 Change-Id: I20645bbfdd926e2c83136ee27c6ef9325cb1f438 --- tensorflow/python/keras/tests/BUILD | 65 ++ .../python/keras/tests/tracking_test.py | 610 ++++++++++++ .../python/keras/tests/tracking_util_test.py | 926 ++++++++++++++++++ tensorflow/python/training/tracking/BUILD | 7 - .../training/tracking/data_structures_test.py | 503 +--------- .../python/training/tracking/tracking_test.py | 63 -- .../python/training/tracking/util_test.py | 861 ---------------- 7 files changed, 1602 insertions(+), 1433 deletions(-) create mode 100644 tensorflow/python/keras/tests/tracking_test.py create mode 100644 tensorflow/python/keras/tests/tracking_util_test.py diff --git a/tensorflow/python/keras/tests/BUILD b/tensorflow/python/keras/tests/BUILD index 4bb7d5358e5..ad52d33abc6 100644 --- a/tensorflow/python/keras/tests/BUILD +++ b/tensorflow/python/keras/tests/BUILD @@ -370,6 +370,71 @@ tf_py_test( ], ) +tf_py_test( + name = "tracking_test", + srcs = ["tracking_test.py"], + python_version = "PY3", + tags = [ + "no_windows", + "nomac", + ], + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:variables", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:test", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + "//tensorflow/python/keras/layers:normalization", + "//tensorflow/python/module", + "//tensorflow/python/training/tracking", + "//tensorflow/python/training/tracking:data_structures", + "//tensorflow/python/training/tracking:util", + "@absl_py//absl/testing:parameterized", + ], +) + +tf_py_test( + name = "tracking_util_test", + srcs = ["tracking_util_test.py"], + python_version = "PY3", + tags = ["notsan"], # b/74395663 + deps = [ + "//tensorflow/compiler/tests:xla_test", + "//tensorflow/python:checkpoint_management", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:extra_py_tests_deps", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:init_ops", + "//tensorflow/python:platform", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:saver", + "//tensorflow/python:state_ops", + 
"//tensorflow/python:template", + "//tensorflow/python:training_util", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + "//tensorflow/python/keras/optimizer_v2", + "//tensorflow/python/training/tracking", + "//tensorflow/python/training/tracking:graph_view", + "//tensorflow/python/training/tracking:util", + "@absl_py//absl/testing:parameterized", + ], +) + py_library( name = "get_config_samples", srcs = ["get_config_samples.py"], diff --git a/tensorflow/python/keras/tests/tracking_test.py b/tensorflow/python/keras/tests/tracking_test.py new file mode 100644 index 00000000000..b5ce6911d92 --- /dev/null +++ b/tensorflow/python/keras/tests/tracking_test.py @@ -0,0 +1,610 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl.testing import parameterized +import numpy +import six + +from tensorflow.python.eager import context +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.keras.engine import sequential +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core +from tensorflow.python.keras.layers import normalization +from tensorflow.python.module import module +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.training.tracking import base +from tensorflow.python.training.tracking import data_structures +from tensorflow.python.training.tracking import tracking +from tensorflow.python.training.tracking import util + + +class HasList(training.Model): + + def __init__(self): + super(HasList, self).__init__() + self.layer_list = data_structures.List([core.Dense(3)]) + self.layer_list.append(core.Dense(4)) + self.layer_list.extend( + [core.Dense(5), + core.Dense(6, kernel_regularizer=math_ops.reduce_sum)]) + self.layer_list += [ + core.Dense(7, bias_regularizer=math_ops.reduce_sum), + core.Dense(8) + ] + self.layer_list += ( + data_structures.List([core.Dense(9)]) + data_structures.List( + [core.Dense(10)])) + self.layer_list.extend( + data_structures.List( + list([core.Dense(11)]) + [core.Dense(12)])) + self.layers_with_updates = data_structures.List( + (normalization.BatchNormalization(),)) + + def call(self, x): + aggregation = 0. 
+ for l in self.layer_list: + x = l(x) + aggregation += math_ops.reduce_sum(x) + bn, = self.layers_with_updates + return bn(x) / aggregation + + +class ListTests(test.TestCase): + + @test_util.run_in_graph_and_eager_modes + @test_util.run_v1_only("b/120545219") + def testTracking(self): + model = HasList() + output = model(array_ops.ones([32, 2])) + self.assertAllEqual([32, 12], output.shape) + self.assertEqual(11, len(model.layers)) + self.assertEqual(10, len(model.layer_list.layers)) + six.assertCountEqual( + self, + model.layers, + model.layer_list.layers + model.layers_with_updates) + for index in range(10): + self.assertEqual(3 + index, model.layer_list.layers[index].units) + self.assertEqual(2, len(model._checkpoint_dependencies)) + self.assertIs(model.layer_list, model._checkpoint_dependencies[0].ref) + self.assertIs(model.layers_with_updates, + model._checkpoint_dependencies[1].ref) + self.assertEqual( + 10, len(model._checkpoint_dependencies[0].ref._checkpoint_dependencies)) + self.evaluate([v.initializer for v in model.variables]) + self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(model.variables[0].assign(array_ops.zeros([2, 3]))) + model.load_weights(save_path) + self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], + self.evaluate(model.variables[0])) + v = variables.Variable(1.) + model.var_list = [v] + self.assertIn(v, model.variables) + self.assertIn(v, model.trainable_variables) + self.assertNotIn(v, model.non_trainable_variables) + self.assertIn(model.layer_list[0].trainable_weights[0], + model.trainable_weights) + + def testSubModelTracking(self): + model = training.Model() + model.v = variables.Variable(1.) + self.assertIn(model.v, model.trainable_weights) + model2 = training.Model() + model2.m = [model] + self.assertIn(model.v, model2.trainable_weights) + + def testSubSequentialTracking(self): + + class _Subclassed(training.Model): + + def __init__(self, wrapped): + super(_Subclassed, self).__init__() + self._wrapped = wrapped + + def call(self, x): + return self._wrapped(x) + + model = sequential.Sequential() + layer = core.Dense(1) + model.add(layer) + model2 = _Subclassed(model) + model2(array_ops.ones([1, 2])) + model2.m = [model] + self.assertIn(layer.kernel, model2.trainable_weights) + + def testLayerTrackedThroughSequential(self): + class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + def ffnet(layer_sizes, name): + ff = sequential.Sequential(name=name) + for i, width in enumerate(layer_sizes): + ff.add(core.Dense( + width, + activation=("relu" if i < len(layer_sizes)-1 else None))) + return ff + + class MyModel2(training.Model): + + def __init__(self, config, name="my_model_2"): + super(MyModel2, self).__init__(name=name) + self._num_tokens = config.num_tokens + + # list of sub-models + self._ffnet = [ffnet(config.module_layers + (self._num_tokens,), "ff")] + + def null_input(self): + return array_ops.zeros([1, self._num_tokens], dtype=dtypes.float32) + + def call(self, input_, module_index=None): + return self._ffnet[0](input_) + + m2 = MyModel2(AttrDict( + num_tokens=5, + module_layers=(50, 30))) + + # Construct + m2(m2.null_input()) + self.assertLen(m2.trainable_variables, 6) + + @test_util.run_v1_only("b/120545219") + def testUpdatesForwarded(self): + with context.graph_mode(): + model = HasList() + model_input = array_ops.ones([32, 2]) + 
model(model_input) + self.assertGreater(len(model.layers_with_updates[0].updates), 0) + self.assertEqual(set(model.layers_with_updates[0].updates), + set(model.updates)) + + with context.eager_mode(): + model = HasList() + model_input = array_ops.ones([32, 2]) + model(model_input) + self.assertEqual(0, len(model.updates)) + + @test_util.run_in_graph_and_eager_modes + @test_util.run_v1_only("b/120545219") + def testLossesForwarded(self): + model = HasList() + model_input = array_ops.ones([32, 2]) + model(model_input) + self.assertEqual(2, len(model.losses)) + + def testModelContainersCompareEqual(self): + class HasEqualContainers(training.Model): + + def __init__(self): + super(HasEqualContainers, self).__init__() + self.l1 = [] + self.l2 = [] + + model = HasEqualContainers() + first_layer = HasEqualContainers() + model.l1.append(first_layer) + second_layer = HasEqualContainers() + model.l2.append(second_layer) + self.assertEqual([first_layer, second_layer], model.layers) + + @test_util.run_in_graph_and_eager_modes + def testTensorConversion(self): + + class ListToTensor(training.Model): + + def __init__(self): + super(ListToTensor, self).__init__() + self.l = [1., 2., 3.] + + self.assertAllEqual( + [1., 2., 3.], + self.evaluate(constant_op.constant(ListToTensor().l))) + + self.assertAllEqual( + [1., 2., 3.], + self.evaluate(array_ops.pack(ListToTensor().l))) + + +class ListWrapperTest(test.TestCase): + + def testLayerCollectionWithExternalMutation(self): + l = [] + l_wrapper = data_structures.ListWrapper(l) + layer = core.Dense(1) + l.append(layer) + self.assertEqual([layer], l_wrapper.layers) + + +class HasMapping(training.Model): + + def __init__(self): + super(HasMapping, self).__init__() + self.layer_dict = data_structures.Mapping(output=core.Dense(7)) + self.layer_dict["norm"] = data_structures.List() + self.layer_dict["dense"] = data_structures.List() + self.layer_dict["dense"].extend( + [core.Dense(5), + core.Dense(6, kernel_regularizer=math_ops.reduce_sum)]) + self.layer_dict["norm"].append( + normalization.BatchNormalization()) + self.layer_dict["norm"].append( + normalization.BatchNormalization()) + + def call(self, x): + aggregation = 0. 
+ for norm, dense in zip(self.layer_dict["norm"], self.layer_dict["dense"]): + x = norm(dense(x)) + aggregation += math_ops.reduce_sum(x) + return self.layer_dict["output"](x) / aggregation + + +class MappingTests(test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def testTracking(self): + model = HasMapping() + output = model(array_ops.ones([32, 2])) + self.assertAllEqual([32, 7], output.shape.as_list()) + self.assertEqual(5, len(model.layers)) + six.assertCountEqual(self, model.layers, model.layer_dict.layers) + self.assertEqual(1, len(model._checkpoint_dependencies)) + self.assertIs(model.layer_dict, model._checkpoint_dependencies[0].ref) + self.evaluate([v.initializer for v in model.variables]) + test_var = model.layer_dict["output"].kernel + self.evaluate(test_var.assign(array_ops.ones([6, 7]))) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(test_var.assign(array_ops.zeros([6, 7]))) + model.load_weights(save_path) + self.assertAllEqual(numpy.ones([6, 7]), + self.evaluate(test_var)) + + def testLayerCollectionWithExternalMutation(self): + d = {} + root = tracking.AutoTrackable() + root.wrapper = d + self.assertEqual([], root.wrapper.layers) + self.assertEqual([], root.wrapper.trainable_weights) + layer1 = core.Dense(1) + layer2 = core.Dense(1) + d["a"] = layer1 + d["b"] = layer2 + self.assertEqual([layer1, layer2], root.wrapper.layers) + # The layers have still not created variables + self.assertEqual([], root.wrapper.trainable_weights) + + def testDictWrapperBadKeys(self): + a = tracking.AutoTrackable() + a.d = {} + a.d[1] = data_structures.List() + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegexp(ValueError, "non-string key"): + model.save_weights(save_path) + + def testDictWrapperNoDependency(self): + a = tracking.AutoTrackable() + a.d = data_structures.NoDependency({}) + a.d[1] = [3] + self.assertEqual([a], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testNonStringKeyNotTrackableValue(self): + a = tracking.AutoTrackable() + a.d = {} + a.d["a"] = [3] + a.d[1] = data_structures.NoDependency([3]) + self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testNonAppendNotTrackable(self): + # Non-append mutations (deleting or overwriting values) are OK when the + # values aren't tracked. 
+ a = tracking.AutoTrackable() + a.d = {} + a.d["a"] = [3] + a.d[1] = 3 + a.d[1] = 2 + self.assertEqual(2, a.d[1]) + del a.d[1] + a.d[2] = data_structures.NoDependency(tracking.AutoTrackable()) + second = tracking.AutoTrackable() + a.d[2] = data_structures.NoDependency(second) + self.assertIs(second, a.d[2]) + self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testPopNoSave(self): + model = training.Model() + model.d = {} + model.d["a"] = [] + model.d.pop("a") + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegexp(ValueError, "Unable to save"): + model.save_weights(save_path) + + def testExternalModificationNoSave(self): + model = training.Model() + external_reference = {} + model.d = external_reference + external_reference["a"] = [] + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"): + model.save_weights(save_path) + + def testOverwriteCanStillSave(self): + model = training.Model() + model.d = {} + model.d["a"] = {} + model.d["a"] = {} + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + + def testIter(self): + model = training.Model() + model.d = {1: 3} + model.d[1] = 3 + self.assertEqual([1], list(model.d)) + new_dict = {} + # This update() is super tricky. If the dict wrapper subclasses dict, + # CPython will access its storage directly instead of calling any + # methods/properties on the object. So the options are either not to + # subclass dict (in which case update will call normal iter methods, but the + # object won't pass isinstance checks) or to subclass dict and keep that + # storage updated (no shadowing all its methods like ListWrapper). + new_dict.update(model.d) + self.assertEqual({1: 3}, new_dict) + + +class HasTuple(training.Model): + + def __init__(self): + super(HasTuple, self).__init__() + self.layer_list = ( + core.Dense(3), core.Dense(4), + core.Dense(5, kernel_regularizer=math_ops.reduce_sum)) + self.layers_with_updates = (normalization.BatchNormalization(),) + + def call(self, x): + aggregation = 0. 
+ for l in self.layer_list: + x = l(x) + aggregation += math_ops.reduce_sum(x) + bn, = self.layers_with_updates + return bn(x) / aggregation + + +class TupleTests(test.TestCase, parameterized.TestCase): + + @test_util.run_in_graph_and_eager_modes + def testTracking(self): + model = HasTuple() + output = model(array_ops.ones([32, 2])) + self.assertAllEqual([32, 5], output.shape.as_list()) + self.assertLen(model.layers, 4) + self.assertLen(model.layer_list.layers, 3) + six.assertCountEqual( + self, + model.layers, + tuple(model.layer_list.layers) + model.layers_with_updates) + self.assertEqual(3, model.layer_list.layers[0].units) + self.assertEqual(4, model.layer_list.layers[1].units) + self.assertEqual(5, model.layer_list.layers[2].units) + self.assertLen(model._checkpoint_dependencies, 2) + self.assertIs(model.layer_list, model._checkpoint_dependencies[0].ref) + self.assertIs(model.layers_with_updates, + model._checkpoint_dependencies[1].ref) + self.assertLen( + model._checkpoint_dependencies[0].ref._checkpoint_dependencies, 3) + self.evaluate([v.initializer for v in model.variables]) + self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(model.variables[0].assign(array_ops.zeros([2, 3]))) + model.load_weights(save_path) + self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], + self.evaluate(model.variables[0])) + v = variables.Variable(1.) + model.var_list = (v,) + self.assertIn(id(v), [id(obj) for obj in model.variables]) + self.assertIn(id(v), [id(obj) for obj in model.trainable_variables]) + self.assertNotIn(id(v), [id(obj) for obj in model.non_trainable_variables]) + self.assertIn(id(model.layer_list[0].trainable_weights[0]), + [id(obj) for obj in model.trainable_weights]) + + @parameterized.named_parameters( + ("Module", module.Module), + ("Model", training.Model), + ) + def testSubModelTracking(self, module_subclass): + model = module_subclass() + model.v = variables.Variable(1.) 
+ self.assertIn(model.v, model.trainable_variables) + model2 = module_subclass() + model2.m = (model,) + self.assertIn(model.v, model2.trainable_variables) + + def testSubSequentialTracking(self): + + class _Subclassed(training.Model): + + def __init__(self, wrapped): + super(_Subclassed, self).__init__() + self._wrapped = wrapped + + def call(self, x): + return self._wrapped(x) + + model = sequential.Sequential() + layer = core.Dense(1) + model.add(layer) + model2 = _Subclassed(model) + model2(array_ops.ones([1, 2])) + model2.m = (model,) + self.assertIn(layer.kernel, model2.trainable_weights) + + def testUpdatesForwarded(self): + with ops.Graph().as_default(): + model = HasTuple() + model_input = array_ops.ones([32, 2]) + model(model_input) + self.assertNotEmpty(model.layers_with_updates[0].updates) + self.assertEqual(set(model.layers_with_updates[0].updates), + set(model.updates)) + + model = HasTuple() + model_input = array_ops.ones([32, 2]) + model(model_input) + self.assertEmpty(model.updates) + + @test_util.run_in_graph_and_eager_modes + def testLossesForwarded(self): + model = HasTuple() + model_input = array_ops.ones([32, 2]) + model(model_input) + self.assertLen(model.losses, 1) + + def testModelContainersCompareEqual(self): + class HasEqualContainers(training.Model): + + def __init__(self): + super(HasEqualContainers, self).__init__() + self.l1 = () + self.l2 = () + + model = HasEqualContainers() + first_layer = HasEqualContainers() + model.l1 = (first_layer,) + second_layer = HasEqualContainers() + model.l2 = (second_layer,) + self.assertEqual((first_layer,), model.l1) + d = {model.l1: 1, model.l2: 2} + self.assertEqual(1, d[model.l1]) + self.assertEqual(1, d[(first_layer,)]) + self.assertEqual(2, d[model.l2]) + self.assertEqual(2, d[(second_layer,)]) + self.assertEqual([first_layer, second_layer], model.layers) + + @test_util.run_in_graph_and_eager_modes + def testTensorConversion(self): + + class TupleToTensor(training.Model): + + def __init__(self): + super(TupleToTensor, self).__init__() + self.l = (1., 2., 3.) 
+ + self.assertAllEqual( + (1., 2., 3.), + self.evaluate(constant_op.constant(TupleToTensor().l))) + + self.assertAllEqual( + (1., 2., 3.), + self.evaluate(array_ops.pack(TupleToTensor().l))) + + +class InterfaceTests(test.TestCase): + + def testNoDependency(self): + root = tracking.AutoTrackable() + hasdep = tracking.AutoTrackable() + root.hasdep = hasdep + nodep = tracking.AutoTrackable() + root.nodep = data_structures.NoDependency(nodep) + self.assertEqual(1, len(root._checkpoint_dependencies)) + self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep) + self.assertIs(root.hasdep, hasdep) + self.assertIs(root.nodep, nodep) + + class NoDependencyModel(training.Model): + + @base.no_automatic_dependency_tracking + def __init__(self): + super(NoDependencyModel, self).__init__() + self.a = [] + self.b = tracking.AutoTrackable() + + nodeps = NoDependencyModel() + self.assertEqual([nodeps], util.list_objects(nodeps)) + + @test_util.run_in_graph_and_eager_modes + def testDictionariesBasic(self): + a = training.Model() + b = training.Model() + a.attribute = {"b": b} + c = training.Model() + a.attribute["c"] = [] + a.attribute["c"].append(c) + a_deps = util.list_objects(a) + self.assertIn(b, a_deps) + self.assertIn(c, a_deps) + self.assertIs(b, a.attribute["b"]) + six.assertCountEqual( + self, + ["b", "c"], + [dep.name for dep in a.attribute._checkpoint_dependencies]) + self.assertEqual([b, c], a.layers) + self.assertEqual([b, c], a.attribute.layers) + self.assertEqual([c], a.attribute["c"].layers) + checkpoint = util.Checkpoint(a=a) + save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + with self.cached_session(): + checkpoint.restore(save_path).assert_consumed().initialize_or_restore() + + @test_util.run_in_graph_and_eager_modes + def testNoDepList(self): + a = training.Model() + a.l1 = data_structures.NoDependency([]) + a.l1.insert(1, 0) + self.assertIsInstance(a.l1, list) + checkpoint = util.Checkpoint(a=a) + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + a.l2 = [] + a.l2.insert(1, module.Module()) + with self.assertRaisesRegexp(ValueError, "A list element was replaced"): + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/tests/tracking_util_test.py b/tensorflow/python/keras/tests/tracking_util_test.py new file mode 100644 index 00000000000..ee5d7428fcc --- /dev/null +++ b/tensorflow/python/keras/tests/tracking_util_test.py @@ -0,0 +1,926 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os +import weakref + +from absl.testing import parameterized +import six + +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.eager import def_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.keras.engine import input_layer +from tensorflow.python.keras.engine import sequential +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core +from tensorflow.python.keras.optimizer_v2 import adam +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import template +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import checkpoint_management +from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training import training_util +from tensorflow.python.training.tracking import graph_view +from tensorflow.python.training.tracking import tracking +from tensorflow.python.training.tracking import util as trackable_utils + + +# pylint: disable=not-callable +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. 
+ self._non_layer = NonLayerTrackable() + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class NonLayerTrackable(tracking.AutoTrackable): + + def __init__(self): + super(NonLayerTrackable, self).__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[]) + + +class InterfaceTests(test.TestCase): + + def testLayerDeduplication(self): + model = training.Model() + layer_one = core.Dense(1) + layer_two = core.Dense(1) + model.other_path = [layer_one, layer_two] + model.l2 = layer_two + model.l1 = layer_one + self.assertEqual([layer_one, layer_two], model.layers) + + def testSaveWithOnlyKerasSession(self): + + with ops.Graph().as_default(): + inp = input_layer.Input([1]) + dense = core.Dense(1)(inp) + model = training.Model(inp, dense) + model.compile(optimizer="sgd", loss="mse") + model.fit([1.], [2.]) + checkpoint = trackable_utils.Checkpoint(model=model) + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + + def testObjectMetadata(self): + with context.eager_mode(): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + dense = core.Dense(1) + checkpoint = trackable_utils.Checkpoint(dense=dense) + dense(constant_op.constant([[1.]])) + save_path = checkpoint.save(checkpoint_prefix) + + objects = trackable_utils.object_metadata(save_path) + all_variable_names = [] + for obj in objects.nodes: + for attribute in obj.attributes: + all_variable_names.append(attribute.full_name) + self.assertIn("dense/kernel", all_variable_names) + + +class CheckpointingTests(parameterized.TestCase, test.TestCase): + + @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + def testNamingWithOptimizer(self): + input_value = constant_op.constant([[3.]]) + model = MyModel() + # A nuisance Model using the same optimizer. Its slot variables should not + # go in the checkpoint, since it is never depended on. + other_model = MyModel() + optimizer = adam.Adam(0.001) + step = training_util.get_or_create_global_step() + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, step=step) + + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = control_flow_ops.group( + optimizer.apply_gradients(zip(gradients, variables)), + step.assign_add(1)) + + with backprop.GradientTape() as tape: + loss = other_model(input_value) + variables = other_model.trainable_variables + gradients = tape.gradient(loss, variables) + optimizer.apply_gradients(zip(gradients, variables)) + + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + named_variables, serialized_graph, _ = graph_view.ObjectGraphView( + root_trackable).serialize_object_graph() + expected_slot_keys = ( + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", + ) + expected_checkpoint_names = ( + # Created in the root node, so no prefix. 
+ "step", + "model/_second/kernel", + "model/_named_dense/kernel", + "model/_named_dense/bias", + # non-Layer dependency of the model + "model/_non_layer/a_variable", + "optimizer/learning_rate", + "optimizer/beta_1", + "optimizer/beta_2", + "optimizer/iter", + "optimizer/decay", + ) + expected_slot_keys + suffix = "/.ATTRIBUTES/VARIABLE_VALUE" + expected_checkpoint_names = [ + name + suffix for name in expected_checkpoint_names] + named_variables = {v.name: v for v in named_variables} + six.assertCountEqual(self, expected_checkpoint_names, + named_variables.keys()) + # Check that we've mapped to the right variable objects (not exhaustive) + self.assertEqual( + "global_step", + named_variables["step" + suffix].full_name) + self.assertEqual( + "my_model/dense_1/kernel", + named_variables["model/_second/kernel" + suffix].full_name) + self.assertEqual( + "my_model/dense/kernel", + named_variables["model/_named_dense/kernel" + suffix].full_name) + self.assertEqual("Adam/beta_1", + named_variables["optimizer/beta_1" + suffix].full_name) + self.assertEqual("Adam/beta_2", + named_variables["optimizer/beta_2" + suffix].full_name) + # Spot check the generated protocol buffers. + self.assertEqual("optimizer", + serialized_graph.nodes[0].children[1].local_name) + optimizer_node = serialized_graph.nodes[ + serialized_graph.nodes[0].children[1].node_id] + children = [node.local_name for node in optimizer_node.children] + six.assertCountEqual( + self, + # hyper variable dependencies + ["beta_1", "beta_2", "iter", "decay", "learning_rate"], + children) + serialized_slot_keys = [] + for slot in optimizer_node.slot_variables: + for attribute in ( + serialized_graph.nodes[slot.slot_variable_node_id].attributes): + serialized_slot_keys.append(attribute.checkpoint_key) + six.assertCountEqual( + self, + [key + suffix for key in expected_slot_keys], + serialized_slot_keys) + + @test_util.run_in_graph_and_eager_modes + def testSaveRestore(self): + model = MyModel() + optimizer = adam.Adam(0.001) + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + input_value = constant_op.constant([[3.]]) + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients(zip(gradients, variables)) + self.assertFalse(root_trackable.save_counter.trainable) + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.])) + m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") + self.evaluate(state_ops.assign(m_bias_slot, [1.5])) + save_path = root_trackable.save(file_prefix=prefix) + self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.])) + self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) + optimizer_variables = self.evaluate( + sorted(optimizer.variables(), key=lambda v: v.name)) + self.evaluate(state_ops.assign(m_bias_slot, [-2.])) + # Immediate restoration + status = root_trackable.restore(save_path=save_path).assert_consumed() + status.run_restore_ops() + self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) + self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) + self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) + if not context.executing_eagerly(): + return # Restore-on-create is only supported when executing eagerly + 
on_create_model = MyModel() + on_create_optimizer = adam.Adam(0.001) + on_create_root = trackable_utils.Checkpoint( + optimizer=on_create_optimizer, model=on_create_model) + # Deferred restoration + status = on_create_root.restore(save_path=save_path) + status.assert_nontrivial_match() + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + on_create_model(constant_op.constant([[3.]])) # create variables + self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) + self.assertAllEqual([42.], + self.evaluate( + on_create_model._named_dense.variables[1])) + on_create_m_bias_slot = on_create_optimizer.get_slot( + on_create_model._named_dense.variables[1], "m") + status.assert_existing_objects_matched() + if not context.executing_eagerly(): + with self.assertRaises(AssertionError): + status.assert_consumed() + # Optimizer slot variables are created when the original variable is + # restored. + self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) + dummy_var = resource_variable_ops.ResourceVariable([1.]) + on_create_optimizer.minimize(loss=dummy_var.read_value, + var_list=[dummy_var]) + status.assert_existing_objects_matched() + status.assert_consumed() + self.assertAllEqual( + optimizer_variables, + # Creation order is different, so .variables() needs to be re-sorted. + self.evaluate(sorted(optimizer.variables(), key=lambda v: v.name))) + + # TODO(allenl): Debug garbage created by this test in python3. + def testDeferredRestorationUsageEager(self): + """An idiomatic eager execution example.""" + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + model = MyModel() + optimizer = adam.Adam(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + root.restore(checkpoint_management.latest_checkpoint( + checkpoint_directory)) + for _ in range(num_training_steps): + # TODO(allenl): Use a Dataset and serialize/checkpoint it. 
+ input_value = constant_op.constant([[3.]]) + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + optimizer.apply_gradients(zip(gradients, variables)) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + root.optimizer.iterations.numpy()) + + def testUsageGraph(self): + """Expected usage when graph building.""" + with context.graph_mode(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with ops.Graph().as_default(): + model = MyModel() + optimizer = adam.Adam(0.001) + root = trackable_utils.CheckpointV1( + optimizer=optimizer, model=model) + input_value = constant_op.constant([[3.]]) + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients(zip(gradients, variables)) + + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + with self.session(graph=ops.get_default_graph()) as session: + status = root.restore(save_path=checkpoint_path) + status.initialize_or_restore(session=session) + if checkpoint_path is None: + self.assertEqual(0, training_continuation) + with self.assertRaises(AssertionError): + status.assert_consumed() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + else: + status.assert_consumed() + status.assert_existing_objects_matched() + for _ in range(num_training_steps): + session.run(train_op) + root.save(file_prefix=checkpoint_prefix, session=session) + self.assertEqual((training_continuation + 1) * num_training_steps, + session.run(root.optimizer.iterations)) + self.assertEqual(training_continuation + 1, + session.run(root.save_counter)) + + @test_util.run_in_graph_and_eager_modes + def testAgnosticUsage(self): + """Graph/eager agnostic usage.""" + # Does create garbage when executing eagerly due to ops.Graph() creation. 
+ num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + def _train_fn(model, input_value): + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + for training_continuation in range(3): + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + manager = checkpoint_management.CheckpointManager( + root, checkpoint_directory, max_to_keep=1) + status = root.restore(save_path=manager.latest_checkpoint) + input_value = constant_op.constant([[3.]]) + train_fn = functools.partial(_train_fn, model, input_value) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + manager.save() + self.assertEqual((training_continuation + 1) * num_training_steps, + self.evaluate(root.optimizer.iterations)) + self.assertEqual(training_continuation + 1, + self.evaluate(root.save_counter)) + + def testPartialRestoreWarningObject(self): + with context.eager_mode(): + optimizer = adam.Adam(0.0) + original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.), + v2=variables_lib.Variable(3.), + optimizer=optimizer) + # Create a slot variable to save + optimizer.minimize(original_root.v1.read_value, [original_root.v1]) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + save_path = original_root.save(prefix) + partial_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(0.)) + weak_partial_root = weakref.ref(partial_root) + weak_v1 = weakref.ref(partial_root.v1) + partial_root.restore(save_path) + self.assertEqual(2., partial_root.v1.numpy()) + with test.mock.patch.object(logging, "warning") as mock_log: + del partial_root + self.assertIsNone(weak_partial_root()) + self.assertIsNone(weak_v1()) + messages = str(mock_log.call_args_list) + self.assertIn("(root).v2'", messages) + self.assertIn("(root).optimizer's state 'm' for (root).v1", messages) + self.assertNotIn("(root).v1'", messages) + self.assertIn("expect_partial()", messages) + + # pylint: disable=cell-var-from-loop + @test_util.run_in_graph_and_eager_modes + @test_util.run_v1_only("b/120545219") + def testWithDefun(self): + num_training_steps = 2 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with test_util.device(use_gpu=True): + model = MyModel() + # Don't actually train so we can test variable values + optimizer = adam.Adam(0.) 
+ root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + status = root.restore(save_path=checkpoint_path) + def train_fn(): + @def_function.function + def _call_model(x): + return model(x) + with backprop.GradientTape() as tape: + loss = _call_model(constant_op.constant([[3.]])) + gradients = tape.gradient(loss, model.variables) + return optimizer.apply_gradients(zip(gradients, model.variables)) + if not context.executing_eagerly(): + train_fn = functools.partial( + self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + if training_continuation > 0: + status.assert_consumed() + self.assertAllClose([[42.]], self.evaluate(model.variables[0])) + else: + self.evaluate(model.variables[0].assign([[42.]])) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + self.evaluate(optimizer.iterations)) + self.assertEqual(training_continuation + 1, + self.evaluate(root.save_counter)) + # pylint: enable=cell-var-from-loop + + def testAnonymousVarsInInit(self): + + class Model(training.Model): + + def __init__(self): + super(Model, self).__init__() + self.w = resource_variable_ops.ResourceVariable(0.0) + self.b = resource_variable_ops.ResourceVariable(0.0) + self.vars = [self.w, self.b] + + def call(self, x): + return x * self.w + self.b + + with context.eager_mode(): + model = Model() + optimizer = adam.Adam(learning_rate=0.05) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + checkpoint = trackable_utils.Checkpoint( + model=model, optimizer=optimizer) + for _ in range(2): + checkpoint.save(checkpoint_prefix) + with backprop.GradientTape() as tape: + loss = (constant_op.constant(1.) + - model(constant_op.constant(1.))) ** 2 + grad = tape.gradient(loss, model.vars) + optimizer.apply_gradients( + [(g, v) for g, v in zip(grad, model.vars)]) + + @test_util.run_in_graph_and_eager_modes + def testDeferredSlotRestoration(self): + checkpoint_directory = self.get_temp_dir() + + root = trackable_utils.Checkpoint() + root.var = trackable_utils.add_variable( + root, name="var", initializer=0.) + optimizer = adam.Adam(0.1) + variables = [root.var] + gradients = [1.] + train_op = optimizer.apply_gradients(zip(gradients, variables)) + # Note that `optimizer` has not been added as a dependency of + # `root`. Create a one-off grouping so that slot variables for `root.var` + # get initialized too. + self.evaluate(trackable_utils.gather_initializers( + trackable_utils.Checkpoint(root=root, optimizer=optimizer))) + self.evaluate(train_op) + self.evaluate(state_ops.assign(root.var, 12.)) + no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots")) + root.optimizer = optimizer + self.evaluate(state_ops.assign(root.var, 13.)) + self.evaluate(state_ops.assign( + optimizer.get_slot(slot_name="m", var=root.var), + 14.)) + slots_path = root.save(os.path.join(checkpoint_directory, "with_slots")) + new_root = trackable_utils.Checkpoint() + # Load the slot-containing checkpoint (deferred), then immediately overwrite + # the non-slot variable (also deferred). 
+ slot_status = new_root.restore(slots_path) + no_slot_status = new_root.restore(no_slots_path) + with self.assertRaises(AssertionError): + no_slot_status.assert_consumed() + new_root.var = trackable_utils.add_variable( + new_root, name="var", shape=[]) + no_slot_status.assert_consumed() + no_slot_status.run_restore_ops() + self.assertEqual(12., self.evaluate(new_root.var)) + new_root.optimizer = adam.Adam(0.1) + slot_status.assert_existing_objects_matched() + if not context.executing_eagerly(): + with self.assertRaisesRegexp(AssertionError, "Unresolved object"): + slot_status.assert_consumed() + self.assertEqual(12., self.evaluate(new_root.var)) + if context.executing_eagerly(): + # Slot variables are only created with restoring initializers when + # executing eagerly. + self.assertEqual(14., self.evaluate( + new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) + else: + # Slot variables are not created eagerly when graph building. + with self.assertRaises(KeyError): + new_root.optimizer.get_slot(slot_name="m", var=new_root.var) + variables = [new_root.var] + gradients = [1.] + train_op = new_root.optimizer.apply_gradients(zip(gradients, variables)) + # The slot variable now exists; restore() didn't create it, but we should + # now have a restore op for it. + slot_status.run_restore_ops() + if not context.executing_eagerly(): + # The train op hasn't run when graph building, so the slot variable has + # its restored value. It has run in eager, so the value will be different. + self.assertEqual(14., self.evaluate( + new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) + self.evaluate(train_op) + slot_status.assert_consumed() + + def testManySavesGraph(self): + """Saves after the first should not modify the graph.""" + with context.graph_mode(): + graph = ops.Graph() + with graph.as_default(), self.session(graph): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + obj = trackable_utils.Checkpoint() + obj.var = variables_lib.Variable(0., name="v") + obj.opt = adam.Adam(0.1) + variables = [obj.var] + gradients = [1.] + obj.opt.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers(obj)) + obj.save(checkpoint_prefix) + graph.finalize() + obj.save(checkpoint_prefix) + + def testManyRestoresGraph(self): + """Restores after the first should not modify the graph.""" + with context.graph_mode(): + graph = ops.Graph() + with graph.as_default(), self.session(graph): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + obj = trackable_utils.Checkpoint() + obj.var = variables_lib.Variable(0., name="v") + obj.opt = adam.Adam(0.1) + variables = [obj.var] + gradients = [1.] 
+ obj.opt.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers(obj)) + save_path = obj.save(checkpoint_prefix) + obj.restore(save_path) + graph.finalize() + obj.restore(save_path) + + @test_util.run_in_graph_and_eager_modes + def test_sequential(self): + model = sequential.Sequential() + checkpoint = trackable_utils.Checkpoint(model=model) + model.add(core.Dense(4)) + second_dense = core.Dense(5) + model.add(second_dense) + model(constant_op.constant([[1.]])) + checkpoint.restore(None).initialize_or_restore() + self.evaluate(second_dense.bias.assign( + constant_op.constant([1., 2., 3., 4., 5.]))) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + self.evaluate(second_dense.bias.assign( + constant_op.constant([5., 6., 7., 8., 9.]))) + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias)) + + deferred_sequential = sequential.Sequential() + deferred_sequential_checkpoint = trackable_utils.Checkpoint( + model=deferred_sequential) + status = deferred_sequential_checkpoint.restore(save_path) + deferred_sequential.add(core.Dense(4)) + deferred_second_dense = core.Dense(5) + deferred_sequential.add(deferred_second_dense) + deferred_sequential(constant_op.constant([[1.]])) + status.run_restore_ops() + self.assertAllEqual([1., 2., 3., 4., 5.], + self.evaluate(deferred_second_dense.bias)) + + @test_util.run_in_graph_and_eager_modes + def test_initialize_if_not_restoring(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001) + root = trackable_utils.Checkpoint( + model=model) # Do not save the optimizer with the checkpoint. + optimizer_checkpoint = trackable_utils.Checkpoint( + optimizer=optimizer) + + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + status = root.restore(save_path=checkpoint_path) + input_value = constant_op.constant([[3.]]) + def train_fn(): + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + # TODO(tanzheny): Add hyper variables to .variables(), and set them with + # set_weights etc. 
+ variables_not_in_the_variables_property = [ + obj for obj in optimizer._hyper.values() + if isinstance(obj, variables_lib.Variable)] + self.evaluate([v.initializer for v + in optimizer.variables() + + variables_not_in_the_variables_property]) + train_fn() + model_save_path = root.save(file_prefix=checkpoint_prefix) + self.evaluate(optimizer.beta_1.assign(42.)) + optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) + del train_fn + + # Restore into a graph with the optimizer + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + status = root.restore(save_path=model_save_path) + input_value = constant_op.constant([[3.]]) + def train_fn1(): + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + if not context.executing_eagerly(): + train_fn1 = functools.partial(self.evaluate, train_fn1()) + status.initialize_or_restore() + train_fn1() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + del train_fn1 + + # Make sure initialization doesn't clobber later restores + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001, beta_1=1.0) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + opt_root = trackable_utils.Checkpoint( + optimizer=optimizer) + status = root.restore(save_path=model_save_path) + init_only_optimizer_status = opt_root.restore(save_path=None) + optimizer_status = opt_root.restore(save_path=optimizer_save_path) + input_value = constant_op.constant([[3.]]) + def train_fn2(): + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + if not context.executing_eagerly(): + train_fn2 = functools.partial(self.evaluate, train_fn2()) + optimizer_status.run_restore_ops() + status.initialize_or_restore() + init_only_optimizer_status.initialize_or_restore() + train_fn2() + self.assertEqual(42., self.evaluate(optimizer.beta_1)) + + +class _ManualScope(tracking.AutoTrackable): + + def __call__(self): + with variable_scope.variable_scope("ManualScope") as vs: + self.variable_scope = vs + with trackable_utils.capture_dependencies(template=self): + return self._build() + + def _build(self): + return variable_scope.get_variable(name="in_manual_scope", shape=[]) + + +class TemplateTests(parameterized.TestCase, test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def test_trackable_save_restore(self): + + def _templated(): + v = variable_scope.get_variable( + "v", shape=[1], initializer=init_ops.zeros_initializer(), + use_resource=True) + v2 = variable_scope.get_variable( + "v2", shape=[1], initializer=init_ops.zeros_initializer(), + use_resource=True) + manual = _ManualScope() + return v, v + 1., v2, manual, manual() + + save_template = template.make_template("s1", _templated) + v1_save, _, v2_save, manual_scope, manual_scope_v = save_template() + six.assertCountEqual( + self, + [id(v1_save), id(v2_save), id(manual_scope), + id(manual_scope_v), id(save_template)], + map(id, trackable_utils.list_objects(save_template))) + manual_dep, = manual_scope._checkpoint_dependencies + 
self.assertEqual("in_manual_scope", manual_dep.name) + self.assertIs(manual_scope_v, manual_dep.ref) + optimizer = adam.Adam(0.0) + save_root = trackable_utils.Checkpoint( + my_template=save_template, optimizer=optimizer) + optimizer.minimize(v1_save.read_value, + var_list=[v1_save]) + self.evaluate([v.initializer for v in save_template.variables]) + optimizer_variables = optimizer.variables() + list( + optimizer._hyper.values()) + self.evaluate([v.initializer for v in optimizer_variables]) + self.evaluate(v1_save.assign([12.])) + self.evaluate(v2_save.assign([14.])) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = save_root.save(checkpoint_prefix) + + load_template = template.make_template("s2", _templated) + load_optimizer = adam.Adam(0.0) + load_root = trackable_utils.Checkpoint( + my_template=load_template, optimizer=load_optimizer) + status = load_root.restore(save_path) + var, var_plus_one, var2, _, _ = load_template() + load_optimizer.minimize(var.read_value, var_list=[var]) + self.assertLen(load_template._checkpoint_dependencies, 3) + self.assertEqual("v", load_template._checkpoint_dependencies[0].name) + self.assertEqual("v2", load_template._checkpoint_dependencies[1].name) + self.assertEqual("ManualScope", + load_template._checkpoint_dependencies[2].name) + status.assert_consumed().run_restore_ops() + self.assertAllEqual([12.], self.evaluate(var)) + self.assertAllEqual([13.], self.evaluate(var_plus_one)) + self.assertAllEqual([14.], self.evaluate(var2)) + + +class CheckpointCompatibilityTests(test.TestCase): + + def _initialized_model(self): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.Adam(0.001) + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + with backprop.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. 
+ self.evaluate(model._named_dense.bias.assign([1.])) + self.evaluate(optimizer.get_slot( + var=model._named_dense.bias, slot_name="m").assign([2.])) + self.evaluate(optimizer.beta_1.assign(3.)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, slot_name="m") + .assign([102.])) + self.evaluate(root_trackable.optimizer.beta_1.assign(103.)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.], self.evaluate(root_trackable.model._named_dense.bias)) + self.assertAllEqual([2.], self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, slot_name="m"))) + self.assertAllEqual(3., + self.evaluate(root_trackable.optimizer.beta_1)) + + def _write_name_based_checkpoint(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph) as session: + root = self._initialized_model() + name_saver = saver_lib.Saver() + return name_saver.save( + sess=session, + save_path=checkpoint_prefix, + global_step=root.optimizer.iterations) + + @test_util.run_in_graph_and_eager_modes + def testLoadFromNameBasedSaver(self): + """Save a name-based checkpoint, load it using the object-based API.""" + with test_util.device(use_gpu=True): + save_path = self._write_name_based_checkpoint() + root = self._initialized_model() + self._set_sentinels(root) + with self.assertRaises(AssertionError): + self._check_sentinels(root) + object_saver = trackable_utils.TrackableSaver( + graph_view.ObjectGraphView(root)) + self._set_sentinels(root) + status = object_saver.restore(save_path) + if context.executing_eagerly(): + self._check_sentinels(root) + if context.executing_eagerly(): + status.assert_consumed() + status.assert_existing_objects_matched() + status.assert_nontrivial_match() + else: + # When graph building, we haven't read any keys, so we don't know + # whether the restore will be complete. + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_consumed() + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_existing_objects_matched() + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_nontrivial_match() + status.run_restore_ops() + self._check_sentinels(root) + self._set_sentinels(root) + status = object_saver.restore(save_path) + status.initialize_or_restore() + status.assert_nontrivial_match() + self._check_sentinels(root) + # Check that there is no error when keys are missing from the name-based + # checkpoint. 
+ root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.]) + status = object_saver.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + + def testSaveGraphLoadEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed() + self._check_sentinels(root) + + def testSaveEagerLoadGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.eager_mode(): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed().run_restore_ops() + self._check_sentinels(root) + + def testIgnoreSaveCounter(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with self.cached_session() as session: + # Create and save a model using Saver() before using a Checkpoint. This + # generates a snapshot without the Checkpoint's `save_counter`. + model = sequential.Sequential() + model.add(core.Flatten(input_shape=(1,))) + model.add(core.Dense(1)) + name_saver = saver_lib.Saver(model.trainable_variables) + save_path = name_saver.save( + sess=session, save_path=checkpoint_prefix, global_step=1) + # Checkpoint.restore must successfully load that checkpoint. + ckpt = trackable_utils.Checkpoint(model=model) + status = ckpt.restore(save_path) + status.assert_existing_objects_matched() + # It should, however, refuse to load a checkpoint where an unrelated + # `save_counter` variable is missing. 
+ model.layers[1].var = variables_lib.Variable(0., name="save_counter") + status = ckpt.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + + +if __name__ == "__main__": + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/training/tracking/BUILD b/tensorflow/python/training/tracking/BUILD index f893e29feab..36ca3cf4b66 100644 --- a/tensorflow/python/training/tracking/BUILD +++ b/tensorflow/python/training/tracking/BUILD @@ -98,8 +98,6 @@ tf_py_test( "//tensorflow/python:math_ops", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", - "//tensorflow/python/keras:engine", - "//tensorflow/python/keras/layers", ], ) @@ -149,7 +147,6 @@ py_library( "//tensorflow/python:variables", "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", - "//tensorflow/python/keras:backend", "//tensorflow/python/training/saving:checkpoint_options", "//tensorflow/python/training/saving:functional_saver", "//tensorflow/python/training/saving:saveable_object_util", @@ -188,10 +185,6 @@ tf_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", - "//tensorflow/python/keras:backend", - "//tensorflow/python/keras:engine", - "//tensorflow/python/keras/layers", - "//tensorflow/python/keras/optimizer_v2", "//tensorflow/python/training/saving:checkpoint_options", "@absl_py//absl/testing:parameterized", "@six_archive//:six", diff --git a/tensorflow/python/training/tracking/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py index 79c88d6873a..be795601678 100644 --- a/tensorflow/python/training/tracking/data_structures_test.py +++ b/tensorflow/python/training/tracking/data_structures_test.py @@ -23,26 +23,16 @@ import os import pickle from absl.testing import parameterized -import numpy -import six from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import sequential -from tensorflow.python.keras.engine import training -from tensorflow.python.keras.layers import core -from tensorflow.python.keras.layers import normalization from tensorflow.python.layers import core as non_keras_core from tensorflow.python.module import module from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.training.tracking import data_structures @@ -52,184 +42,13 @@ from tensorflow.python.util import nest from tensorflow.python.util import serialization -class HasList(training.Model): - - def __init__(self): - super(HasList, self).__init__() - self.layer_list = data_structures.List([core.Dense(3)]) - self.layer_list.append(core.Dense(4)) - self.layer_list.extend( - [core.Dense(5), - core.Dense(6, kernel_regularizer=math_ops.reduce_sum)]) - self.layer_list += [ - core.Dense(7, bias_regularizer=math_ops.reduce_sum), - core.Dense(8) - ] - self.layer_list += ( - data_structures.List([core.Dense(9)]) + data_structures.List( - [core.Dense(10)])) - self.layer_list.extend( - 
data_structures.List( - list([core.Dense(11)]) + [core.Dense(12)])) - self.layers_with_updates = data_structures.List( - (normalization.BatchNormalization(),)) - - def call(self, x): - aggregation = 0. - for l in self.layer_list: - x = l(x) - aggregation += math_ops.reduce_sum(x) - bn, = self.layers_with_updates - return bn(x) / aggregation - - class ListTests(test.TestCase): - @test_util.run_in_graph_and_eager_modes - @test_util.run_v1_only("b/120545219") - def testTracking(self): - model = HasList() - output = model(array_ops.ones([32, 2])) - self.assertAllEqual([32, 12], output.shape) - self.assertEqual(11, len(model.layers)) - self.assertEqual(10, len(model.layer_list.layers)) - six.assertCountEqual( - self, - model.layers, - model.layer_list.layers + model.layers_with_updates) - for index in range(10): - self.assertEqual(3 + index, model.layer_list.layers[index].units) - self.assertEqual(2, len(model._checkpoint_dependencies)) - self.assertIs(model.layer_list, model._checkpoint_dependencies[0].ref) - self.assertIs(model.layers_with_updates, - model._checkpoint_dependencies[1].ref) - self.assertEqual( - 10, len(model._checkpoint_dependencies[0].ref._checkpoint_dependencies)) - self.evaluate([v.initializer for v in model.variables]) - self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(model.variables[0].assign(array_ops.zeros([2, 3]))) - model.load_weights(save_path) - self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], - self.evaluate(model.variables[0])) - v = variables.Variable(1.) - model.var_list = [v] - self.assertIn(v, model.variables) - self.assertIn(v, model.trainable_variables) - self.assertNotIn(v, model.non_trainable_variables) - self.assertIn(model.layer_list[0].trainable_weights[0], - model.trainable_weights) - - def testSubModelTracking(self): - model = training.Model() - model.v = variables.Variable(1.) 
- self.assertIn(model.v, model.trainable_weights) - model2 = training.Model() - model2.m = [model] - self.assertIn(model.v, model2.trainable_weights) - - def testSubSequentialTracking(self): - - class _Subclassed(training.Model): - - def __init__(self, wrapped): - super(_Subclassed, self).__init__() - self._wrapped = wrapped - - def call(self, x): - return self._wrapped(x) - - model = sequential.Sequential() - layer = core.Dense(1) - model.add(layer) - model2 = _Subclassed(model) - model2(array_ops.ones([1, 2])) - model2.m = [model] - self.assertIn(layer.kernel, model2.trainable_weights) - - def testLayerTrackedThroughSequential(self): - class AttrDict(dict): - - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - def ffnet(layer_sizes, name): - ff = sequential.Sequential(name=name) - for i, width in enumerate(layer_sizes): - ff.add(core.Dense( - width, - activation=("relu" if i < len(layer_sizes)-1 else None))) - return ff - - class MyModel2(training.Model): - - def __init__(self, config, name="my_model_2"): - super(MyModel2, self).__init__(name=name) - self._num_tokens = config.num_tokens - - # list of sub-models - self._ffnet = [ffnet(config.module_layers + (self._num_tokens,), "ff")] - - def null_input(self): - return array_ops.zeros([1, self._num_tokens], dtype=dtypes.float32) - - def call(self, input_, module_index=None): - return self._ffnet[0](input_) - - m2 = MyModel2(AttrDict( - num_tokens=5, - module_layers=(50, 30))) - - # Construct - m2(m2.null_input()) - self.assertLen(m2.trainable_variables, 6) - def testJSONSerialization(self): obj = tracking.AutoTrackable() obj.l = [1] json.dumps(obj.l, default=serialization.get_json_type) - @test_util.run_v1_only("b/120545219") - def testUpdatesForwarded(self): - with context.graph_mode(): - model = HasList() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertGreater(len(model.layers_with_updates[0].updates), 0) - self.assertEqual(set(model.layers_with_updates[0].updates), - set(model.updates)) - - with context.eager_mode(): - model = HasList() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertEqual(0, len(model.updates)) - - @test_util.run_in_graph_and_eager_modes - @test_util.run_v1_only("b/120545219") - def testLossesForwarded(self): - model = HasList() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertEqual(2, len(model.losses)) - - def testModelContainersCompareEqual(self): - class HasEqualContainers(training.Model): - - def __init__(self): - super(HasEqualContainers, self).__init__() - self.l1 = [] - self.l2 = [] - - model = HasEqualContainers() - first_layer = HasEqualContainers() - model.l1.append(first_layer) - second_layer = HasEqualContainers() - model.l2.append(second_layer) - self.assertEqual([first_layer, second_layer], model.layers) - def testNotTrackable(self): class NotTrackable(object): pass @@ -245,23 +64,6 @@ class ListTests(test.TestCase): with self.assertRaises(AttributeError): data_structures.List().pop() - @test_util.run_in_graph_and_eager_modes - def testTensorConversion(self): - - class ListToTensor(training.Model): - - def __init__(self): - super(ListToTensor, self).__init__() - self.l = [1., 2., 3.] 
- - self.assertAllEqual( - [1., 2., 3.], - self.evaluate(constant_op.constant(ListToTensor().l))) - - self.assertAllEqual( - [1., 2., 3.], - self.evaluate(array_ops.pack(ListToTensor().l))) - def testNesting(self): with context.graph_mode(): inner = data_structures.List() @@ -315,8 +117,7 @@ class ListTests(test.TestCase): self.assertEqual(l[:-1], [v1, v2, v3]) def testHash(self): - has_sequences = set([data_structures.List(), - data_structures.List()]) + has_sequences = {data_structures.List(), data_structures.List()} self.assertEqual(2, len(has_sequences)) self.assertNotIn(data_structures.List(), has_sequences) @@ -454,13 +255,6 @@ class ListWrapperTest(test.TestCase): l.append(1) self.assertEqual([1], l_wrapper) - def testLayerCollectionWithExternalMutation(self): - l = [] - l_wrapper = data_structures.ListWrapper(l) - layer = core.Dense(1) - l.append(layer) - self.assertEqual([layer], l_wrapper.layers) - def testNotHashable(self): with self.assertRaises(TypeError): hash(data_structures.ListWrapper()) @@ -538,50 +332,8 @@ class ListWrapperTest(test.TestCase): return l._checkpoint_dependencies # pylint: disable=protected-access -class HasMapping(training.Model): - - def __init__(self): - super(HasMapping, self).__init__() - self.layer_dict = data_structures.Mapping(output=core.Dense(7)) - self.layer_dict["norm"] = data_structures.List() - self.layer_dict["dense"] = data_structures.List() - self.layer_dict["dense"].extend( - [core.Dense(5), - core.Dense(6, kernel_regularizer=math_ops.reduce_sum)]) - self.layer_dict["norm"].append( - normalization.BatchNormalization()) - self.layer_dict["norm"].append( - normalization.BatchNormalization()) - - def call(self, x): - aggregation = 0. - for norm, dense in zip(self.layer_dict["norm"], self.layer_dict["dense"]): - x = norm(dense(x)) - aggregation += math_ops.reduce_sum(x) - return self.layer_dict["output"](x) / aggregation - - class MappingTests(test.TestCase): - @test_util.run_in_graph_and_eager_modes - def testTracking(self): - model = HasMapping() - output = model(array_ops.ones([32, 2])) - self.assertAllEqual([32, 7], output.shape.as_list()) - self.assertEqual(5, len(model.layers)) - six.assertCountEqual(self, model.layers, model.layer_dict.layers) - self.assertEqual(1, len(model._checkpoint_dependencies)) - self.assertIs(model.layer_dict, model._checkpoint_dependencies[0].ref) - self.evaluate([v.initializer for v in model.variables]) - test_var = model.layer_dict["output"].kernel - self.evaluate(test_var.assign(array_ops.ones([6, 7]))) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(test_var.assign(array_ops.zeros([6, 7]))) - model.load_weights(save_path) - self.assertAllEqual(numpy.ones([6, 7]), - self.evaluate(test_var)) - def testJSONSerialization(self): obj = tracking.AutoTrackable() obj.d = {"a": 2} @@ -605,20 +357,6 @@ class MappingTests(test.TestCase): with self.assertRaises(TypeError): mapping[1] = data_structures.List() - def testLayerCollectionWithExternalMutation(self): - d = {} - root = tracking.AutoTrackable() - root.wrapper = d - self.assertEqual([], root.wrapper.layers) - self.assertEqual([], root.wrapper.trainable_weights) - layer1 = core.Dense(1) - layer2 = core.Dense(1) - d["a"] = layer1 - d["b"] = layer2 - self.assertEqual([layer1, layer2], root.wrapper.layers) - # The layers have still not created variables - self.assertEqual([], root.wrapper.trainable_weights) - def testHashing(self): has_mappings = set([data_structures.Mapping(), data_structures.Mapping()]) @@ 
-633,101 +371,6 @@ class MappingTests(test.TestCase): with self.assertRaisesRegexp(TypeError, "unhashable"): set([a.d]) - def testDictWrapperBadKeys(self): - a = tracking.AutoTrackable() - a.d = {} - a.d[1] = data_structures.List() - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegexp(ValueError, "non-string key"): - model.save_weights(save_path) - - def testDictWrapperNoDependency(self): - a = tracking.AutoTrackable() - a.d = data_structures.NoDependency({}) - a.d[1] = [3] - self.assertEqual([a], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testNonStringKeyNotTrackableValue(self): - a = tracking.AutoTrackable() - a.d = {} - a.d["a"] = [3] - a.d[1] = data_structures.NoDependency([3]) - self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testNonAppendNotTrackable(self): - # Non-append mutations (deleting or overwriting values) are OK when the - # values aren't tracked. - a = tracking.AutoTrackable() - a.d = {} - a.d["a"] = [3] - a.d[1] = 3 - a.d[1] = 2 - self.assertEqual(2, a.d[1]) - del a.d[1] - a.d[2] = data_structures.NoDependency(tracking.AutoTrackable()) - second = tracking.AutoTrackable() - a.d[2] = data_structures.NoDependency(second) - self.assertIs(second, a.d[2]) - self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testPopNoSave(self): - model = training.Model() - model.d = {} - model.d["a"] = [] - model.d.pop("a") - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegexp(ValueError, "Unable to save"): - model.save_weights(save_path) - - def testExternalModificationNoSave(self): - model = training.Model() - external_reference = {} - model.d = external_reference - external_reference["a"] = [] - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"): - model.save_weights(save_path) - - def testOverwriteCanStillSave(self): - model = training.Model() - model.d = {} - model.d["a"] = {} - model.d["a"] = {} - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - - def testIter(self): - model = training.Model() - model.d = {1: 3} - model.d[1] = 3 - self.assertEqual([1], list(model.d)) - new_dict = {} - # This update() is super tricky. If the dict wrapper subclasses dict, - # CPython will access its storage directly instead of calling any - # methods/properties on the object. So the options are either not to - # subclass dict (in which case update will call normal iter methods, but the - # object won't pass isinstance checks) or to subclass dict and keep that - # storage updated (no shadowing all its methods like ListWrapper). 
- new_dict.update(model.d) - self.assertEqual({1: 3}, new_dict) - def testListShallowCopy(self): root = tracking.AutoTrackable() orig_list = [[1.]] @@ -871,157 +514,13 @@ class MappingTests(test.TestCase): self.assertIs(first_trace, second_trace) -class HasTuple(training.Model): - - def __init__(self): - super(HasTuple, self).__init__() - self.layer_list = ( - core.Dense(3), core.Dense(4), - core.Dense(5, kernel_regularizer=math_ops.reduce_sum)) - self.layers_with_updates = (normalization.BatchNormalization(),) - - def call(self, x): - aggregation = 0. - for l in self.layer_list: - x = l(x) - aggregation += math_ops.reduce_sum(x) - bn, = self.layers_with_updates - return bn(x) / aggregation - - class TupleTests(test.TestCase, parameterized.TestCase): - @test_util.run_in_graph_and_eager_modes - def testTracking(self): - model = HasTuple() - output = model(array_ops.ones([32, 2])) - self.assertAllEqual([32, 5], output.shape.as_list()) - self.assertLen(model.layers, 4) - self.assertLen(model.layer_list.layers, 3) - six.assertCountEqual( - self, - model.layers, - tuple(model.layer_list.layers) + model.layers_with_updates) - self.assertEqual(3, model.layer_list.layers[0].units) - self.assertEqual(4, model.layer_list.layers[1].units) - self.assertEqual(5, model.layer_list.layers[2].units) - self.assertLen(model._checkpoint_dependencies, 2) - self.assertIs(model.layer_list, model._checkpoint_dependencies[0].ref) - self.assertIs(model.layers_with_updates, - model._checkpoint_dependencies[1].ref) - self.assertLen( - model._checkpoint_dependencies[0].ref._checkpoint_dependencies, 3) - self.evaluate([v.initializer for v in model.variables]) - self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(model.variables[0].assign(array_ops.zeros([2, 3]))) - model.load_weights(save_path) - self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], - self.evaluate(model.variables[0])) - v = variables.Variable(1.) - model.var_list = (v,) - self.assertIn(id(v), [id(obj) for obj in model.variables]) - self.assertIn(id(v), [id(obj) for obj in model.trainable_variables]) - self.assertNotIn(id(v), [id(obj) for obj in model.non_trainable_variables]) - self.assertIn(id(model.layer_list[0].trainable_weights[0]), - [id(obj) for obj in model.trainable_weights]) - - @parameterized.named_parameters( - ("Module", module.Module), - ("Model", training.Model), - ) - def testSubModelTracking(self, module_subclass): - model = module_subclass() - model.v = variables.Variable(1.) 
- self.assertIn(model.v, model.trainable_variables) - model2 = module_subclass() - model2.m = (model,) - self.assertIn(model.v, model2.trainable_variables) - - def testSubSequentialTracking(self): - - class _Subclassed(training.Model): - - def __init__(self, wrapped): - super(_Subclassed, self).__init__() - self._wrapped = wrapped - - def call(self, x): - return self._wrapped(x) - - model = sequential.Sequential() - layer = core.Dense(1) - model.add(layer) - model2 = _Subclassed(model) - model2(array_ops.ones([1, 2])) - model2.m = (model,) - self.assertIn(layer.kernel, model2.trainable_weights) - def testJSONSerialization(self): obj = tracking.AutoTrackable() obj.l = (1,) json.dumps(obj.l, default=serialization.get_json_type) - def testUpdatesForwarded(self): - with ops.Graph().as_default(): - model = HasTuple() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertNotEmpty(model.layers_with_updates[0].updates) - self.assertEqual(set(model.layers_with_updates[0].updates), - set(model.updates)) - - model = HasTuple() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertEmpty(model.updates) - - @test_util.run_in_graph_and_eager_modes - def testLossesForwarded(self): - model = HasTuple() - model_input = array_ops.ones([32, 2]) - model(model_input) - self.assertLen(model.losses, 1) - - def testModelContainersCompareEqual(self): - class HasEqualContainers(training.Model): - - def __init__(self): - super(HasEqualContainers, self).__init__() - self.l1 = () - self.l2 = () - - model = HasEqualContainers() - first_layer = HasEqualContainers() - model.l1 = (first_layer,) - second_layer = HasEqualContainers() - model.l2 = (second_layer,) - self.assertEqual((first_layer,), model.l1) - d = {model.l1: 1, model.l2: 2} - self.assertEqual(1, d[model.l1]) - self.assertEqual(1, d[(first_layer,)]) - self.assertEqual(2, d[model.l2]) - self.assertEqual(2, d[(second_layer,)]) - self.assertEqual([first_layer, second_layer], model.layers) - - @test_util.run_in_graph_and_eager_modes - def testTensorConversion(self): - - class TupleToTensor(training.Model): - - def __init__(self): - super(TupleToTensor, self).__init__() - self.l = (1., 2., 3.) 
- - self.assertAllEqual( - (1., 2., 3.), - self.evaluate(constant_op.constant(TupleToTensor().l))) - - self.assertAllEqual( - (1., 2., 3.), - self.evaluate(array_ops.pack(TupleToTensor().l))) - def testNonLayerVariables(self): v = resource_variable_ops.ResourceVariable([1.]) l = data_structures._TupleWrapper((v,)) diff --git a/tensorflow/python/training/tracking/tracking_test.py b/tensorflow/python/training/tracking/tracking_test.py index cf2da4c9afa..4dff392cf9f 100644 --- a/tensorflow/python/training/tracking/tracking_test.py +++ b/tensorflow/python/training/tracking/tracking_test.py @@ -25,14 +25,10 @@ import time import timeit import numpy as np -import six from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import training -from tensorflow.python.module import module from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -from tensorflow.python.training.tracking import base from tensorflow.python.training.tracking import data_structures from tensorflow.python.training.tracking import tracking from tensorflow.python.training.tracking import util @@ -73,28 +69,6 @@ class InterfaceTests(test.TestCase): (_, dep_object), = root._checkpoint_dependencies self.assertIs(duplicate_name_dep, dep_object) - def testNoDependency(self): - root = tracking.AutoTrackable() - hasdep = tracking.AutoTrackable() - root.hasdep = hasdep - nodep = tracking.AutoTrackable() - root.nodep = data_structures.NoDependency(nodep) - self.assertEqual(1, len(root._checkpoint_dependencies)) - self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep) - self.assertIs(root.hasdep, hasdep) - self.assertIs(root.nodep, nodep) - - class NoDependencyModel(training.Model): - - @base.no_automatic_dependency_tracking - def __init__(self): - super(NoDependencyModel, self).__init__() - self.a = [] - self.b = tracking.AutoTrackable() - - nodeps = NoDependencyModel() - self.assertEqual([nodeps], util.list_objects(nodeps)) - def testRemoveDependency(self): root = tracking.AutoTrackable() root.a = tracking.AutoTrackable() @@ -183,43 +157,6 @@ class InterfaceTests(test.TestCase): with self.assertRaisesRegexp(ValueError, "A list element was replaced"): checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - @test_util.run_in_graph_and_eager_modes - def testDictionariesBasic(self): - a = training.Model() - b = training.Model() - a.attribute = {"b": b} - c = training.Model() - a.attribute["c"] = [] - a.attribute["c"].append(c) - a_deps = util.list_objects(a) - self.assertIn(b, a_deps) - self.assertIn(c, a_deps) - self.assertIs(b, a.attribute["b"]) - six.assertCountEqual( - self, - ["b", "c"], - [dep.name for dep in a.attribute._checkpoint_dependencies]) - self.assertEqual([b, c], a.layers) - self.assertEqual([b, c], a.attribute.layers) - self.assertEqual([c], a.attribute["c"].layers) - checkpoint = util.Checkpoint(a=a) - save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - with self.cached_session(): - checkpoint.restore(save_path).assert_consumed().initialize_or_restore() - - @test_util.run_in_graph_and_eager_modes - def testNoDepList(self): - a = training.Model() - a.l1 = data_structures.NoDependency([]) - a.l1.insert(1, 0) - self.assertTrue(isinstance(a.l1, list)) - checkpoint = util.Checkpoint(a=a) - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - a.l2 = [] - a.l2.insert(1, module.Module()) - with self.assertRaisesRegexp(ValueError, "A list element was replaced"): - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - 
@test_util.run_in_graph_and_eager_modes def testAssertions(self): a = tracking.AutoTrackable() diff --git a/tensorflow/python/training/tracking/util_test.py b/tensorflow/python/training/tracking/util_test.py index 7a96fedc89b..6c0b08426e7 100644 --- a/tensorflow/python/training/tracking/util_test.py +++ b/tensorflow/python/training/tracking/util_test.py @@ -16,25 +16,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import functools import os import weakref from absl.testing import parameterized import six -from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import input_layer -from tensorflow.python.keras.engine import sequential -from tensorflow.python.keras.engine import training -from tensorflow.python.keras.layers import core -from tensorflow.python.keras.optimizer_v2 import adam from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import resource_variable_ops @@ -46,7 +39,6 @@ from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as saver_lib -from tensorflow.python.training import training_util from tensorflow.python.training.saving import checkpoint_options from tensorflow.python.training.tracking import base from tensorflow.python.training.tracking import graph_view @@ -62,44 +54,8 @@ class NonLayerTrackable(tracking.AutoTrackable): self, name="a_variable", shape=[]) -# pylint: disable=not-callable -class MyModel(training.Model): - """A concrete Model for testing.""" - - def __init__(self): - super(MyModel, self).__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. 
- self._non_layer = NonLayerTrackable() - - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret - - class InterfaceTests(test.TestCase): - def testLayerDeduplication(self): - model = training.Model() - layer_one = core.Dense(1) - layer_two = core.Dense(1) - model.other_path = [layer_one, layer_two] - model.l2 = layer_two - model.l1 = layer_one - self.assertEqual([layer_one, layer_two], model.layers) - - def testSaveWithOnlyKerasSession(self): - - with ops.Graph().as_default(): - inp = input_layer.Input([1]) - dense = core.Dense(1)(inp) - model = training.Model(inp, dense) - model.compile(optimizer="sgd", loss="mse") - model.fit([1.], [2.]) - checkpoint = trackable_utils.Checkpoint(model=model) - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) def testAddVariable(self): obj = NonLayerTrackable() @@ -184,22 +140,6 @@ class InterfaceTests(test.TestCase): self.assertEqual(dtypes.float64, v2.dtype) self.assertAllEqual([1., 1., 1.], self.evaluate(v2)) - def testObjectMetadata(self): - with context.eager_mode(): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - dense = core.Dense(1) - checkpoint = trackable_utils.Checkpoint(dense=dense) - dense(constant_op.constant([[1.]])) - save_path = checkpoint.save(checkpoint_prefix) - - objects = trackable_utils.object_metadata(save_path) - all_variable_names = [] - for obj in objects.nodes: - for attribute in obj.attributes: - all_variable_names.append(attribute.full_name) - self.assertIn("dense/kernel", all_variable_names) - def testNotTrackable(self): class CallsFunctionalStuff( @@ -268,100 +208,6 @@ class _OwnsMirroredVariables(base.Trackable): class CheckpointingTests(parameterized.TestCase, test.TestCase): - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) - def testNamingWithOptimizer(self): - input_value = constant_op.constant([[3.]]) - model = MyModel() - # A nuisance Model using the same optimizer. Its slot variables should not - # go in the checkpoint, since it is never depended on. - other_model = MyModel() - optimizer = adam.Adam(0.001) - step = training_util.get_or_create_global_step() - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, step=step) - - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = control_flow_ops.group( - optimizer.apply_gradients(zip(gradients, variables)), - step.assign_add(1)) - - with backprop.GradientTape() as tape: - loss = other_model(input_value) - variables = other_model.trainable_variables - gradients = tape.gradient(loss, variables) - optimizer.apply_gradients(zip(gradients, variables)) - - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - named_variables, serialized_graph, _ = graph_view.ObjectGraphView( - root_trackable).serialize_object_graph() - expected_slot_keys = ( - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", - ) - expected_checkpoint_names = ( - # Created in the root node, so no prefix. 
- "step", - "model/_second/kernel", - "model/_named_dense/kernel", - "model/_named_dense/bias", - # non-Layer dependency of the model - "model/_non_layer/a_variable", - "optimizer/learning_rate", - "optimizer/beta_1", - "optimizer/beta_2", - "optimizer/iter", - "optimizer/decay", - ) + expected_slot_keys - suffix = "/.ATTRIBUTES/VARIABLE_VALUE" - expected_checkpoint_names = [ - name + suffix for name in expected_checkpoint_names] - named_variables = {v.name: v for v in named_variables} - six.assertCountEqual(self, expected_checkpoint_names, - named_variables.keys()) - # Check that we've mapped to the right variable objects (not exhaustive) - self.assertEqual( - "global_step", - named_variables["step" + suffix].full_name) - self.assertEqual( - "my_model/dense_1/kernel", - named_variables["model/_second/kernel" + suffix].full_name) - self.assertEqual( - "my_model/dense/kernel", - named_variables["model/_named_dense/kernel" + suffix].full_name) - self.assertEqual("Adam/beta_1", - named_variables["optimizer/beta_1" + suffix].full_name) - self.assertEqual("Adam/beta_2", - named_variables["optimizer/beta_2" + suffix].full_name) - # Spot check the generated protocol buffers. - self.assertEqual("optimizer", - serialized_graph.nodes[0].children[1].local_name) - optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[ - 1].node_id] - children = [node.local_name for node in optimizer_node.children] - six.assertCountEqual( - self, - # hyper variable dependencies - ["beta_1", "beta_2", "iter", "decay", "learning_rate"], - children) - serialized_slot_keys = [] - for slot in optimizer_node.slot_variables: - for attribute in ( - serialized_graph.nodes[slot.slot_variable_node_id].attributes): - serialized_slot_keys.append(attribute.checkpoint_key) - six.assertCountEqual( - self, - [key + suffix for key in expected_slot_keys], - serialized_slot_keys) - @test_util.run_in_graph_and_eager_modes def testMoreComplexSaveableReturned(self): v = _OwnsMirroredVariables() @@ -432,174 +278,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): if op.type in ("SaveV2", "RestoreV2"): self.assertEqual(localhost, op.device) - @test_util.run_in_graph_and_eager_modes - def testSaveRestore(self): - model = MyModel() - optimizer = adam.Adam(0.001) - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - input_value = constant_op.constant([[3.]]) - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - self.assertFalse(root_trackable.save_counter.trainable) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.])) - m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") - self.evaluate(state_ops.assign(m_bias_slot, [1.5])) - save_path = root_trackable.save(file_prefix=prefix) - self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.])) - self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) - optimizer_variables = self.evaluate( - sorted(optimizer.variables(), key=lambda v: v.name)) - self.evaluate(state_ops.assign(m_bias_slot, [-2.])) - # Immediate restoration - status = root_trackable.restore(save_path=save_path).assert_consumed() - status.run_restore_ops() - self.assertAllEqual([42.], 
self.evaluate(model._named_dense.variables[1])) - self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) - self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) - if not context.executing_eagerly(): - return # Restore-on-create is only supported when executing eagerly - on_create_model = MyModel() - on_create_optimizer = adam.Adam(0.001) - on_create_root = trackable_utils.Checkpoint( - optimizer=on_create_optimizer, model=on_create_model) - # Deferred restoration - status = on_create_root.restore(save_path=save_path) - status.assert_nontrivial_match() - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - on_create_model(constant_op.constant([[3.]])) # create variables - self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) - self.assertAllEqual([42.], - self.evaluate( - on_create_model._named_dense.variables[1])) - on_create_m_bias_slot = on_create_optimizer.get_slot( - on_create_model._named_dense.variables[1], "m") - status.assert_existing_objects_matched() - if not context.executing_eagerly(): - with self.assertRaises(AssertionError): - status.assert_consumed() - # Optimizer slot variables are created when the original variable is - # restored. - self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) - dummy_var = resource_variable_ops.ResourceVariable([1.]) - on_create_optimizer.minimize(loss=dummy_var.read_value, - var_list=[dummy_var]) - status.assert_existing_objects_matched() - status.assert_consumed() - self.assertAllEqual( - optimizer_variables, - # Creation order is different, so .variables() needs to be re-sorted. - self.evaluate(sorted(optimizer.variables(), key=lambda v: v.name))) - - # TODO(allenl): Debug garbage created by this test in python3. - def testDeferredRestorationUsageEager(self): - """An idiomatic eager execution example.""" - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - model = MyModel() - optimizer = adam.Adam(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - root.restore(checkpoint_management.latest_checkpoint( - checkpoint_directory)) - for _ in range(num_training_steps): - # TODO(allenl): Use a Dataset and serialize/checkpoint it. 
- input_value = constant_op.constant([[3.]]) - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - optimizer.apply_gradients(zip(gradients, variables)) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer.iterations.numpy()) - - def testUsageGraph(self): - """Expected usage when graph building.""" - with context.graph_mode(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with ops.Graph().as_default(): - model = MyModel() - optimizer = adam.Adam(0.001) - root = trackable_utils.CheckpointV1( - optimizer=optimizer, model=model) - input_value = constant_op.constant([[3.]]) - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - with self.session(graph=ops.get_default_graph()) as session: - status = root.restore(save_path=checkpoint_path) - status.initialize_or_restore(session=session) - if checkpoint_path is None: - self.assertEqual(0, training_continuation) - with self.assertRaises(AssertionError): - status.assert_consumed() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - else: - status.assert_consumed() - status.assert_existing_objects_matched() - for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix, session=session) - self.assertEqual((training_continuation + 1) * num_training_steps, - session.run(root.optimizer.iterations)) - self.assertEqual(training_continuation + 1, - session.run(root.save_counter)) - - @test_util.run_in_graph_and_eager_modes - def testAgnosticUsage(self): - """Graph/eager agnostic usage.""" - # Does create garbage when executing eagerly due to ops.Graph() creation. 
- num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - def _train_fn(model, input_value): - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - for training_continuation in range(3): - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - manager = checkpoint_management.CheckpointManager( - root, checkpoint_directory, max_to_keep=1) - status = root.restore(save_path=manager.latest_checkpoint) - input_value = constant_op.constant([[3.]]) - train_fn = functools.partial(_train_fn, model, input_value) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - manager.save() - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.optimizer.iterations)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - @test_util.run_in_graph_and_eager_modes def testFreezing(self): with test_util.use_gpu(): @@ -656,31 +334,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): self.fail("%s should have suffix %s" % (path, expected_suffix)) self.evaluate(step.assign_add(2)) - def testPartialRestoreWarningObject(self): - with context.eager_mode(): - optimizer = adam.Adam(0.0) - original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.), - v2=variables_lib.Variable(3.), - optimizer=optimizer) - # Create a slot variable to save - optimizer.minimize(original_root.v1.read_value, [original_root.v1]) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - save_path = original_root.save(prefix) - partial_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(0.)) - weak_partial_root = weakref.ref(partial_root) - weak_v1 = weakref.ref(partial_root.v1) - partial_root.restore(save_path) - self.assertEqual(2., partial_root.v1.numpy()) - with test.mock.patch.object(logging, "warning") as mock_log: - del partial_root - self.assertIsNone(weak_partial_root()) - self.assertIsNone(weak_v1()) - messages = str(mock_log.call_args_list) - self.assertIn("(root).v2'", messages) - self.assertIn("(root).optimizer's state 'm' for (root).v1", messages) - self.assertNotIn("(root).v1'", messages) - self.assertIn("expect_partial()", messages) - def testPartialRestoreWarningAttribute(self): with context.eager_mode(): original_root = trackable_utils.Checkpoint(v1=variables_lib.Variable(2.), @@ -734,49 +387,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): self.assertIsNone(weak_v1()) self.assertEmpty(mock_log.call_args_list) - # pylint: disable=cell-var-from-loop - @test_util.run_in_graph_and_eager_modes - @test_util.run_v1_only("b/120545219") - def testWithDefun(self): - num_training_steps = 2 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with test_util.device(use_gpu=True): - model = MyModel() - # Don't actually train so we can test variable values - optimizer = adam.Adam(0.) 
- root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - def train_fn(): - @def_function.function - def _call_model(x): - return model(x) - with backprop.GradientTape() as tape: - loss = _call_model(constant_op.constant([[3.]])) - gradients = tape.gradient(loss, model.variables) - return optimizer.apply_gradients(zip(gradients, model.variables)) - if not context.executing_eagerly(): - train_fn = functools.partial( - self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - if training_continuation > 0: - status.assert_consumed() - self.assertAllClose([[42.]], self.evaluate(model.variables[0])) - else: - self.evaluate(model.variables[0].assign([[42.]])) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(optimizer.iterations)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - # pylint: enable=cell-var-from-loop - def _get_checkpoint_name(self, name): root = tracking.AutoTrackable() trackable_utils.add_variable( @@ -819,35 +429,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): self.assertEqual("..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE", named_variable.name) - def testAnonymousVarsInInit(self): - - class Model(training.Model): - - def __init__(self): - super(Model, self).__init__() - self.w = resource_variable_ops.ResourceVariable(0.0) - self.b = resource_variable_ops.ResourceVariable(0.0) - self.vars = [self.w, self.b] - - def call(self, x): - return x * self.w + self.b - - with context.eager_mode(): - model = Model() - optimizer = adam.Adam(learning_rate=0.05) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - checkpoint = trackable_utils.Checkpoint( - model=model, optimizer=optimizer) - for _ in range(2): - checkpoint.save(checkpoint_prefix) - with backprop.GradientTape() as tape: - loss = (constant_op.constant(1.) - - model(constant_op.constant(1.))) ** 2 - grad = tape.gradient(loss, model.vars) - optimizer.apply_gradients( - [(g, v) for g, v in zip(grad, model.vars)]) - @test_util.run_in_graph_and_eager_modes def testLateDependencyTracking(self): @@ -909,72 +490,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): status.run_restore_ops() self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var)) - @test_util.run_in_graph_and_eager_modes - def testDeferredSlotRestoration(self): - checkpoint_directory = self.get_temp_dir() - - root = trackable_utils.Checkpoint() - root.var = trackable_utils.add_variable( - root, name="var", initializer=0.) - optimizer = adam.Adam(0.1) - variables = [root.var] - gradients = [1.] - train_op = optimizer.apply_gradients(zip(gradients, variables)) - # Note that `optimizer` has not been added as a dependency of - # `root`. Create a one-off grouping so that slot variables for `root.var` - # get initialized too. 
- self.evaluate(trackable_utils.gather_initializers( - trackable_utils.Checkpoint(root=root, optimizer=optimizer))) - self.evaluate(train_op) - self.evaluate(state_ops.assign(root.var, 12.)) - no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots")) - root.optimizer = optimizer - self.evaluate(state_ops.assign(root.var, 13.)) - self.evaluate(state_ops.assign( - optimizer.get_slot(slot_name="m", var=root.var), - 14.)) - slots_path = root.save(os.path.join(checkpoint_directory, "with_slots")) - new_root = trackable_utils.Checkpoint() - # Load the slot-containing checkpoint (deferred), then immediately overwrite - # the non-slot variable (also deferred). - slot_status = new_root.restore(slots_path) - no_slot_status = new_root.restore(no_slots_path) - with self.assertRaises(AssertionError): - no_slot_status.assert_consumed() - new_root.var = trackable_utils.add_variable( - new_root, name="var", shape=[]) - no_slot_status.assert_consumed() - no_slot_status.run_restore_ops() - self.assertEqual(12., self.evaluate(new_root.var)) - new_root.optimizer = adam.Adam(0.1) - slot_status.assert_existing_objects_matched() - if not context.executing_eagerly(): - with self.assertRaisesRegexp(AssertionError, "Unresolved object"): - slot_status.assert_consumed() - self.assertEqual(12., self.evaluate(new_root.var)) - if context.executing_eagerly(): - # Slot variables are only created with restoring initializers when - # executing eagerly. - self.assertEqual(14., self.evaluate( - new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) - else: - # Slot variables are not created eagerly when graph building. - with self.assertRaises(KeyError): - new_root.optimizer.get_slot(slot_name="m", var=new_root.var) - variables = [new_root.var] - gradients = [1.] - train_op = new_root.optimizer.apply_gradients(zip(gradients, variables)) - # The slot variable now exists; restore() didn't create it, but we should - # now have a restore op for it. - slot_status.run_restore_ops() - if not context.executing_eagerly(): - # The train op hasn't run when graph building, so the slot variable has - # its restored value. It has run in eager, so the value will be different. - self.assertEqual(14., self.evaluate( - new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) - self.evaluate(train_op) - slot_status.assert_consumed() - @test_util.run_in_graph_and_eager_modes def testOverlappingRestores(self): checkpoint_directory = self.get_temp_dir() @@ -1154,24 +669,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): status.run_restore_ops() self.assertEqual(4., self.evaluate(recreated_var1)) - def testManySavesGraph(self): - """Saves after the first should not modify the graph.""" - with context.graph_mode(): - graph = ops.Graph() - with graph.as_default(), self.session(graph): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - obj = trackable_utils.Checkpoint() - obj.var = variables_lib.Variable(0., name="v") - obj.opt = adam.Adam(0.1) - variables = [obj.var] - gradients = [1.] 
- obj.opt.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers(obj)) - obj.save(checkpoint_prefix) - graph.finalize() - obj.save(checkpoint_prefix) - @test_util.run_in_graph_and_eager_modes def testCheckpointState(self): # No checkpoints are deleted by default @@ -1237,146 +734,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): self.assertEqual(1, self.evaluate(checkpoint.var_1)) self.assertEqual(0, self.evaluate(checkpoint.var_0)) - def testManyRestoresGraph(self): - """Restores after the first should not modify the graph.""" - with context.graph_mode(): - graph = ops.Graph() - with graph.as_default(), self.session(graph): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - obj = trackable_utils.Checkpoint() - obj.var = variables_lib.Variable(0., name="v") - obj.opt = adam.Adam(0.1) - variables = [obj.var] - gradients = [1.] - obj.opt.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers(obj)) - save_path = obj.save(checkpoint_prefix) - obj.restore(save_path) - graph.finalize() - obj.restore(save_path) - - @test_util.run_in_graph_and_eager_modes - def test_sequential(self): - model = sequential.Sequential() - checkpoint = trackable_utils.Checkpoint(model=model) - model.add(core.Dense(4)) - second_dense = core.Dense(5) - model.add(second_dense) - model(constant_op.constant([[1.]])) - checkpoint.restore(None).initialize_or_restore() - self.evaluate(second_dense.bias.assign( - constant_op.constant([1., 2., 3., 4., 5.]))) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = checkpoint.save(checkpoint_prefix) - self.evaluate(second_dense.bias.assign( - constant_op.constant([5., 6., 7., 8., 9.]))) - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias)) - - deferred_sequential = sequential.Sequential() - deferred_sequential_checkpoint = trackable_utils.Checkpoint( - model=deferred_sequential) - status = deferred_sequential_checkpoint.restore(save_path) - deferred_sequential.add(core.Dense(4)) - deferred_second_dense = core.Dense(5) - deferred_sequential.add(deferred_second_dense) - deferred_sequential(constant_op.constant([[1.]])) - status.run_restore_ops() - self.assertAllEqual([1., 2., 3., 4., 5.], - self.evaluate(deferred_second_dense.bias)) - - @test_util.run_in_graph_and_eager_modes - def test_initialize_if_not_restoring(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001) - root = trackable_utils.Checkpoint( - model=model) # Do not save the optimizer with the checkpoint. 
- optimizer_checkpoint = trackable_utils.Checkpoint( - optimizer=optimizer) - - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - input_value = constant_op.constant([[3.]]) - def train_fn(): - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - # TODO(tanzheny): Add hyper variables to .variables(), and set them with - # set_weights etc. - variables_not_in_the_variables_property = [ - obj for obj in optimizer._hyper.values() - if isinstance(obj, variables_lib.Variable)] - self.evaluate([v.initializer for v - in optimizer.variables() - + variables_not_in_the_variables_property]) - train_fn() - model_save_path = root.save(file_prefix=checkpoint_prefix) - self.evaluate(optimizer.beta_1.assign(42.)) - optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) - del train_fn - - # Restore into a graph with the optimizer - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - status = root.restore(save_path=model_save_path) - input_value = constant_op.constant([[3.]]) - def train_fn1(): - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not context.executing_eagerly(): - train_fn1 = functools.partial(self.evaluate, train_fn1()) - status.initialize_or_restore() - train_fn1() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - del train_fn1 - - # Make sure initialization doesn't clobber later restores - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001, beta_1=1.0) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - opt_root = trackable_utils.Checkpoint( - optimizer=optimizer) - status = root.restore(save_path=model_save_path) - init_only_optimizer_status = opt_root.restore(save_path=None) - optimizer_status = opt_root.restore(save_path=optimizer_save_path) - input_value = constant_op.constant([[3.]]) - def train_fn2(): - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not context.executing_eagerly(): - train_fn2 = functools.partial(self.evaluate, train_fn2()) - optimizer_status.run_restore_ops() - status.initialize_or_restore() - init_only_optimizer_status.initialize_or_restore() - train_fn2() - self.assertEqual(42., self.evaluate(optimizer.beta_1)) - @test_util.run_in_graph_and_eager_modes def test_restore_after_adding_empty_trackable_data_structure(self): model = NonLayerTrackable() @@ -1439,75 +796,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): self.assertAllClose(self.evaluate(load_checkpoint.b), {"a": 2, "b": 3}) -class _ManualScope(tracking.AutoTrackable): - - def __call__(self): - with variable_scope.variable_scope("ManualScope") as vs: - self.variable_scope = vs - with 
trackable_utils.capture_dependencies(template=self): - return self._build() - - def _build(self): - return variable_scope.get_variable(name="in_manual_scope", shape=[]) - - class TemplateTests(parameterized.TestCase, test.TestCase): - @test_util.run_in_graph_and_eager_modes - def test_trackable_save_restore(self): - - def _templated(): - v = variable_scope.get_variable( - "v", shape=[1], initializer=init_ops.zeros_initializer(), - use_resource=True) - v2 = variable_scope.get_variable( - "v2", shape=[1], initializer=init_ops.zeros_initializer(), - use_resource=True) - manual = _ManualScope() - return v, v + 1., v2, manual, manual() - - save_template = template.make_template("s1", _templated) - v1_save, _, v2_save, manual_scope, manual_scope_v = save_template() - six.assertCountEqual( - self, - [id(v1_save), id(v2_save), id(manual_scope), - id(manual_scope_v), id(save_template)], - map(id, trackable_utils.list_objects(save_template))) - manual_dep, = manual_scope._checkpoint_dependencies - self.assertEqual("in_manual_scope", manual_dep.name) - self.assertIs(manual_scope_v, manual_dep.ref) - optimizer = adam.Adam(0.0) - save_root = trackable_utils.Checkpoint( - my_template=save_template, optimizer=optimizer) - optimizer.minimize(v1_save.read_value, - var_list=[v1_save]) - self.evaluate([v.initializer for v in save_template.variables]) - optimizer_variables = optimizer.variables() + list( - optimizer._hyper.values()) - self.evaluate([v.initializer for v in optimizer_variables]) - self.evaluate(v1_save.assign([12.])) - self.evaluate(v2_save.assign([14.])) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = save_root.save(checkpoint_prefix) - - load_template = template.make_template("s2", _templated) - load_optimizer = adam.Adam(0.0) - load_root = trackable_utils.Checkpoint( - my_template=load_template, optimizer=load_optimizer) - status = load_root.restore(save_path) - var, var_plus_one, var2, _, _ = load_template() - load_optimizer.minimize(var.read_value, var_list=[var]) - self.assertLen(load_template._checkpoint_dependencies, 3) - self.assertEqual("v", load_template._checkpoint_dependencies[0].name) - self.assertEqual("v2", load_template._checkpoint_dependencies[1].name) - self.assertEqual("ManualScope", - load_template._checkpoint_dependencies[2].name) - status.assert_consumed().run_restore_ops() - self.assertAllEqual([12.], self.evaluate(var)) - self.assertAllEqual([13.], self.evaluate(var_plus_one)) - self.assertAllEqual([14.], self.evaluate(var2)) - @test_util.run_in_graph_and_eager_modes def test_trackable_save_restore_nested(self): @@ -1554,157 +844,6 @@ class TemplateTests(parameterized.TestCase, test.TestCase): self.assertAllEqual([25.], self.evaluate(v3)) -class CheckpointCompatibilityTests(test.TestCase): - - def _initialized_model(self): - input_value = constant_op.constant([[3.]]) - model = MyModel() - optimizer = adam.Adam(0.001) - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - with backprop.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. 
- self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, slot_name="m").assign([2.])) - self.evaluate(optimizer.beta_1.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, slot_name="m") - .assign([102.])) - self.evaluate(root_trackable.optimizer.beta_1.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, slot_name="m"))) - self.assertAllEqual(3., - self.evaluate(root_trackable.optimizer.beta_1)) - - def _write_name_based_checkpoint(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph) as session: - root = self._initialized_model() - name_saver = saver_lib.Saver() - return name_saver.save( - sess=session, - save_path=checkpoint_prefix, - global_step=root.optimizer.iterations) - - @test_util.run_in_graph_and_eager_modes - def testLoadFromNameBasedSaver(self): - """Save a name-based checkpoint, load it using the object-based API.""" - with test_util.device(use_gpu=True): - save_path = self._write_name_based_checkpoint() - root = self._initialized_model() - self._set_sentinels(root) - with self.assertRaises(AssertionError): - self._check_sentinels(root) - object_saver = trackable_utils.TrackableSaver( - graph_view.ObjectGraphView(root)) - self._set_sentinels(root) - status = object_saver.restore(save_path) - if context.executing_eagerly(): - self._check_sentinels(root) - if context.executing_eagerly(): - status.assert_consumed() - status.assert_existing_objects_matched() - status.assert_nontrivial_match() - else: - # When graph building, we haven't read any keys, so we don't know - # whether the restore will be complete. - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_consumed() - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_existing_objects_matched() - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_nontrivial_match() - status.run_restore_ops() - self._check_sentinels(root) - self._set_sentinels(root) - status = object_saver.restore(save_path) - status.initialize_or_restore() - status.assert_nontrivial_match() - self._check_sentinels(root) - # Check that there is no error when keys are missing from the name-based - # checkpoint. 
- root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.]) - status = object_saver.restore(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - - def testSaveGraphLoadEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed() - self._check_sentinels(root) - - def testSaveEagerLoadGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.eager_mode(): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed().run_restore_ops() - self._check_sentinels(root) - - def testIgnoreSaveCounter(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with self.cached_session() as session: - # Create and save a model using Saver() before using a Checkpoint. This - # generates a snapshot without the Checkpoint's `save_counter`. - model = sequential.Sequential() - model.add(core.Flatten(input_shape=(1,))) - model.add(core.Dense(1)) - name_saver = saver_lib.Saver(model.trainable_variables) - save_path = name_saver.save( - sess=session, save_path=checkpoint_prefix, global_step=1) - # Checkpoint.restore must successfully load that checkpoint. - ckpt = trackable_utils.Checkpoint(model=model) - status = ckpt.restore(save_path) - status.assert_existing_objects_matched() - # It should, however, refuse to load a checkpoint where an unrelated - # `save_counter` variable is missing. - model.layers[1].var = variables_lib.Variable(0., name="save_counter") - status = ckpt.restore(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - - if __name__ == "__main__": ops.enable_eager_execution() test.main() From a2fb617adb26cda76925b5009bd8cd861e2e4943 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Mon, 15 Jun 2020 09:42:58 -0700 Subject: [PATCH 0169/1390] Internal visibility change PiperOrigin-RevId: 316483590 Change-Id: Ia62c9b89846b6165be67c6cc2a3ce9f17e1b3eb3 --- tensorflow/BUILD | 6 ++++++ tensorflow/python/ops/structured/BUILD | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index efbdf89ecea..ce759634232 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -531,6 +531,7 @@ package_group( # Packages that use composite tensors or dispatch. # TODO(b/154762408) Remove this package group once it's no longer needed. +# If this is modified, then copy.bara.sky must also be modified. package_group(name = "composite_tensor_whitelist") # Packages that use private types symbols, until they are exported. @@ -540,6 +541,11 @@ package_group( packages = ["//learning/deepmind/tensorflow/replicator/..."], ) +# Packages that use StructuredTensors. +# TODO(b/159007891) Remove this package once StructuredTensor is exported. 
+# If this is modified, then copy.bara.sky must also be modified. +package_group(name = "structured_tensor_whitelist") + filegroup( name = "intel_binary_blob", data = if_mkl_ml( diff --git a/tensorflow/python/ops/structured/BUILD b/tensorflow/python/ops/structured/BUILD index 64b7bd7f1d5..33834f0e914 100644 --- a/tensorflow/python/ops/structured/BUILD +++ b/tensorflow/python/ops/structured/BUILD @@ -4,9 +4,8 @@ load("//tensorflow:tensorflow.bzl", "py_test") package( default_visibility = [ - "//learning/tfx/autotfx:__subpackages__", - "//research/graph/convolutions/model/autotfx:__subpackages__", "//tensorflow:internal", + "//tensorflow:structured_tensor_whitelist", ], licenses = ["notice"], # Apache 2.0 ) From 05bec441e33a291e71f21de18d430feca28f00f9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 09:57:26 -0700 Subject: [PATCH 0170/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/f47a7766287a PiperOrigin-RevId: 316486707 Change-Id: I37ac1f5c8ad95e7a4c2cf5976e361fb06272b08b --- third_party/mlir/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 4c6717ea024..5f3f0a4b99b 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2126,6 +2126,7 @@ cc_library( ":Analysis", ":ControlFlowInterfaces", ":IR", + ":LinalgOps", ":LoopLikeInterface", ":Pass", ":SCFDialect", From a208b5cd2900eacae5aaf66d06e34fb30ed2ea43 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 15 Jun 2020 09:57:28 -0700 Subject: [PATCH 0171/1390] Allow ZeroDivisionTest to run eagerly PiperOrigin-RevId: 316486715 Change-Id: I5a705b27562c57760ed8efeae52c67a91539ee7c --- tensorflow/python/kernel_tests/zero_division_test.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py index 7f2d100f1e3..f62ac9f7f26 100644 --- a/tensorflow/python/kernel_tests/zero_division_test.py +++ b/tensorflow/python/kernel_tests/zero_division_test.py @@ -20,26 +20,25 @@ from __future__ import print_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.platform import test class ZeroDivisionTest(test.TestCase): - @test_util.run_deprecated_v1 def testZeros(self): with test_util.use_gpu(): for dtype in dtypes.uint8, dtypes.int16, dtypes.int32, dtypes.int64: zero = constant_op.constant(0, dtype=dtype) one = constant_op.constant(1, dtype=dtype) - bads = [one // zero] + bads = [lambda x, y: x // y] if dtype in (dtypes.int32, dtypes.int64): - bads.append(one % zero) + bads.append(lambda x, y: x % y) for bad in bads: try: - result = self.evaluate(bad) - except errors_impl.OpError as e: + result = self.evaluate(bad(one, zero)) + except (errors.OpError, errors.InvalidArgumentError) as e: # Ideally, we'd get a nice exception. In theory, this should only # happen on CPU, but 32 bit integer GPU division is actually on # CPU due to a placer bug. From b09d410f3beb692c700e1951fb91f6235452940f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 15 Jun 2020 09:58:30 -0700 Subject: [PATCH 0172/1390] Add transforms to convert between functional and region based If. 
- Add this pair of transforms in TF -> TFLite conversion pass manager - Add inliner hook to the TF dialect to allow inlining of call's within the regions of the IfRegion ops' - Add a end-to-end test case using 2 If's that demonstrate inlining and followed by constant sinking and constant folding. PiperOrigin-RevId: 316486959 Change-Id: I467b9f3fbc3eafc2cb37e46f61ff6ccfa34afcd5 --- .../mlir/lite/tests/end2end/if_op.pbtxt | 421 ++++++++++++++++++ .../compiler/mlir/lite/tf_tfl_passes.cc | 6 + tensorflow/compiler/mlir/tensorflow/BUILD | 3 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 9 + .../functional-control-flow-to-regions.mlir | 113 +++++ .../region-control-flow-to-functional.mlir | 188 ++++++++ .../functional_control_flow_to_regions.cc | 117 +++++ .../mlir/tensorflow/transforms/passes.h | 14 +- .../region_control_flow_to_functional.cc | 340 ++++++++++++++ 9 files changed, 1209 insertions(+), 2 deletions(-) create mode 100644 tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt new file mode 100644 index 00000000000..f482e3db6b9 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt @@ -0,0 +1,421 @@ +# RUN: tf_tfl_translate -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=4:4 -tf-output-arrays=StatefulIf,StatelessIf %s -o - --output-mlir | FileCheck %s +node { + name: "tf.Less" + op: "Less" + input: "a" + input: "b" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "my_equal" + op: "Equal" + input: "a" + input: "b" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "cst0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 4 + } + } + float_val: 1.0 + float_val: 2.0 + float_val: 3.0 + float_val: 4.0 + } + } + } +} +node { + name: "cst1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 4 + } + } + float_val: 5.0 + float_val: 6.0 + float_val: 7.0 + float_val: 8.0 + } + } + } +} +node { + name: "StatefulIf" + op: "If" + input: "tf.Less" + input: "a" + input: "b" + input: "cst0" + input: "cst1" + attr { + key: "Tcond" + value { + type: DT_BOOL + } + } + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "else_branch" + value { + func { + name: "cond_false" + } + } + } + attr { + key: "then_branch" + value { + func { + name: "cond_true" + } + } + } + experimental_debug_info { + } +} +node { + name: "StatelessIf" + op: "StatelessIf" + input: "my_equal" + input: "a" + input: "b" + attr { + key: "Tcond" + value { + type: DT_BOOL + } + } + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + 
attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "else_branch" + value { + func { + name: "cond_false_1" + } + } + } + attr { + key: "then_branch" + value { + func { + name: "cond_true_1" + } + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "StatefulIf" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "main1" + op: "_Retval" + input: "StatelessIf" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "a" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "b" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +library { + function { + signature { + name: "cond_true" + input_arg { + name: "cond_true_arg0" + type: DT_FLOAT + } + input_arg { + name: "cond_true_arg1" + type: DT_FLOAT + } + input_arg { + name: "cond_true_arg2" + type: DT_FLOAT + } + input_arg { + name: "cond_true_arg3" + type: DT_FLOAT + } + output_arg { + name: "cond_true_ret" + type: DT_FLOAT + } + } + node_def { + name: "tf.Add" + op: "Add" + input: "cond_true_arg2" + input: "cond_true_arg3" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Add" + } + } + ret { + key: "cond_true_ret" + value: "tf.Add:z:0" + } + } + function { + signature { + name: "cond_false" + input_arg { + name: "cond_false_arg0" + type: DT_FLOAT + } + input_arg { + name: "cond_false_arg1" + type: DT_FLOAT + } + input_arg { + name: "cond_false_arg2" + type: DT_FLOAT + } + input_arg { + name: "cond_false_arg3" + type: DT_FLOAT + } + output_arg { + name: "cond_false_ret" + type: DT_FLOAT + } + } + node_def { + name: "tf.Mul" + op: "Mul" + input: "cond_false_arg0" + input: "cond_false_arg3" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Mul" + } + } + ret { + key: "cond_false_ret" + value: "tf.Mul:z:0" + } + } + function { + signature { + name: "cond_true_1" + input_arg { + name: "cond_true_arg0" + type: DT_FLOAT + } + input_arg { + name: "cond_true_arg1" + type: DT_FLOAT + } + output_arg { + name: "cond_true_ret" + type: DT_FLOAT + } + } + node_def { + name: "tf.Sub" + op: "Sub" + input: "cond_true_arg0" + input: "cond_true_arg1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Sub" + } + } + ret { + key: "cond_true_ret" + value: "tf.Sub:z:0" + } + } + function { + signature { + name: "cond_false_1" + input_arg { + name: "cond_false_arg0" + type: DT_FLOAT + } + input_arg { + name: "cond_false_arg1" + type: DT_FLOAT + } + output_arg { + name: "cond_false_ret" + type: DT_FLOAT + } + } + node_def { + name: "tf.Div" + op: "Div" + input: "cond_false_arg0" + input: "cond_false_arg1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + original_node_names: "tf.Div" + } + } + ret { + key: "cond_false_ret" + value: "tf.Div:z:0" + } + } +} +versions { + producer: 115 + min_consumer: 12 +} + +# CHECK: func @StatefulIf_else +# CHECK-NEXT: constant dense<[5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]> +# CHECK-NEXT: tfl.mul +# CHECK: func @StatefulIf_then +# CHECK-NEXT: constant dense<[6.000000e+00, 8.000000e+00, 1.000000e+01, 1.200000e+01]> +# CHECK-NEXT: return +# CHECK: func 
@StatelessIf_else +# CHECK-NEXT: tfl.div +# CHECK: func @StatelessIf_then +# CHECK-NEXT: tfl.sub +# CHECK: "tf.If"{{.+}}else_branch = @StatelessIf_else{{.+}}then_branch = @StatelessIf_then +# CHECK: "tf.If"{{.+}}else_branch = @StatefulIf_else{{.+}}then_branch = @StatefulIf_then + diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 49fe0eb7100..06fe8684ce4 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project @@ -74,6 +75,8 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, pass_config.quant_specs.serialized_quant_stats)); } + pass_manager->addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); + // The conversion pipeline has to follow the following orders: // 1) Saved model related optimization like decompose resource ops // 2) Convert composite functions like lstm/rnns, along with proper function @@ -111,6 +114,9 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, // Add a shape inference pass to optimize away the unnecessary casts. pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); } + + pass_manager->addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); + // Legalize while early to allow further constant folding. // TODO(jpienaar): This may not actually matter as we do canonicalization // after the legalize below, for now it needs to be below the above passes diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index b6ff0f581d3..9e5688cd230 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -416,6 +416,7 @@ cc_library( "transforms/fold_switch.cc", "transforms/freeze_global_tensors.cc", "transforms/functional_control_flow_to_cfg.cc", + "transforms/functional_control_flow_to_regions.cc", "transforms/fused_kernel_matcher.cc", "transforms/generated_canonicalize.inc", "transforms/generated_optimize.inc", @@ -430,6 +431,7 @@ cc_library( "transforms/parallel_execute_to_islands.cc", "transforms/promote_resources_to_args.cc", "transforms/readonly_references_to_resources.cc", + "transforms/region_control_flow_to_functional.cc", "transforms/replicate_invariant_op_hoisting.cc", "transforms/replicate_to_island.cc", "transforms/resource_device_inference.cc", @@ -487,6 +489,7 @@ cc_library( ":translate_utils", ":unroll_batch_matmul_pass", ":xla_sharding_util", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite:validators", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 512011c3a0f..f4e5dc05eb0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -4066,6 +4066,15 @@ struct TFInlinerInterface : public DialectInlinerInterface { // Analysis Hooks //===--------------------------------------------------------------------===// + // Defines the legality of inlinining 'src' region into the 
'dest' region + // attached to a TF operation + bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &valueMapping) const final { + // Allow inlining in regions attached to region based control flow + // operations only if the src region is a single block region + return isa(dest->getParentOp()) && llvm::hasSingleElement(*src); + } + // Defines the legality of inlining TF operations. bool isLegalToInline(Operation *, Region *, BlockAndValueMapping &) const final { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir new file mode 100644 index 00000000000..a7e9b22d72b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -0,0 +1,113 @@ +// RUN: tf-opt %s -tf-functional-control-flow-to-regions -split-input-file | FileCheck %s --dump-input=fail + +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> + +// CHECK-LABEL: func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) +func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.If"(%arg0, %arg1) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor, tensor<*xf32>) -> tensor<*xf32> + + // CHECK: "tf.IfRegion" + // CHECK: [[Result0:%.*]] = call @testIf1Then + // CHECK: "tf.Yield"([[Result0]]) + // CHECK: [[Result1:%.*]] = call @testIf1Else + // CHECK: "tf.Yield"([[Result1]]) + return %0 : tensor<*xf32> +} + +// ----- + +// With mismatching input types + +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> + +// CHECK-LABEL: func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) +func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.If"(%arg0, %arg1) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor, tensor<2xf32>) -> tensor<2xf32> + + // CHECK: "tf.IfRegion" + // CHECK: "tf.Cast" + // CHECK: [[Result0:%.*]] = call @testIf1Then + // CHECK: "tf.Yield"([[Result0]]) + // CHECK: "tf.Cast" + // CHECK: [[Result1:%.*]] = call @testIf1Else + // CHECK: "tf.Yield"([[Result1]]) + return %0 : tensor<2xf32> +} + +// ----- + +// No inputs, some outputs +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then() -> tensor<*xf32> +func @testIf1Else() -> tensor<*xf32> + +// CHECK-LABEL: func @testIfNoInputs(%arg0: tensor) +func @testIfNoInputs(%arg0: tensor) -> tensor<2xf32> { + %0 = "tf.If"(%arg0) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor) -> tensor<2xf32> + + // CHECK: "tf.IfRegion" + // CHECK: [[Result0:%.*]] = call @testIf1Then + // CHECK: "tf.Yield"([[Result0]]) + // CHECK: [[Result1:%.*]] = call @testIf1Else + // CHECK: "tf.Yield"([[Result1]]) + return %0 : tensor<2xf32> +} + +// ----- + +// No outputs, some inputs +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then(tensor<*xf32>) -> () +func @testIf1Else(tensor<*xf32>) -> () + +// CHECK-LABEL: func @testIfNoResult(%arg0: tensor, %arg1: tensor<2xf32>) +func @testIfNoResult(%arg0: tensor, %arg1: tensor<2xf32>) -> () { + "tf.If"(%arg0, %arg1) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless 
= false + } : (tensor, tensor<2xf32>) -> () + + // CHECK: "tf.IfRegion" + // CHECK: "tf.Cast" + // CHECK: call @testIf1Then + // CHECK: "tf.Yield"() + // CHECK: "tf.Cast" + // CHECK: call @testIf1Else + // CHECK: "tf.Yield"() + return +} + +// ----- +// No outputs, No inputs +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then() -> () +func @testIf1Else() -> () + +// CHECK-LABEL: func @testIfNoInputAndNoResult(%arg0: tensor) +func @testIfNoInputAndNoResult(%arg0: tensor) -> () { + "tf.If"(%arg0) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor) -> () + + // CHECK: "tf.IfRegion" + // CHECK: call @testIf1Then + // CHECK: "tf.Yield"() + // CHECK: call @testIf1Else + // CHECK: "tf.Yield"() + return +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir new file mode 100644 index 00000000000..5ea863852ad --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -0,0 +1,188 @@ +// RUN: tf-opt %s -tf-region-control-flow-to-functional -split-input-file +//| FileCheck %s --dump-input=fail + +// CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Abs" +func @testSimple(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "tf.If"{{.+}}else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + %0 = "tf.IfRegion"(%arg0) ({ + %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }) { is_stateless = true } : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Use if condition inside the regions +// CHECK: func @tf.IfRegion_else(%arg0: tensor, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>, %arg3: tensor<2xf32>) -> tensor<2xf32> +// CHECK-NEXT: "tf.Select"(%arg0, %arg2, %arg3) +// CHECK: func @tf.IfRegion_then(%arg0: tensor, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>, %arg3: tensor<2xf32>) -> tensor<2xf32> +// CHECK-NEXT: "tf.Select"(%arg0, %arg1, %arg2) +func @testIfCondition(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.Add"(%arg1, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + %1 = "tf.Mul"(%arg1, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + %2 = "tf.Div"(%arg1, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + + // CHECK: "tf.If"{{.+}}else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + %3 = "tf.IfRegion"(%arg0) ({ + %4 = "tf.Select"(%arg0, %0, %1) : (tensor, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%4) : (tensor<2xf32>) -> () + }, { + %5 = "tf.Select"(%arg0, %1, %2): (tensor, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%5) : (tensor<2xf32>) -> () + }) { is_stateless = true} : (tensor) -> tensor<2xf32> + return %3 : tensor<2xf32> +} + +// ----- + +// Constant sinking + +// CHECK: func @tf.IfRegion_else() -> tensor<2xf32> +// CHECK-NEXT: constant dense<1.0 +// CHECK: func @tf.IfRegion_then() -> tensor<2xf32> +// CHECK-NEXT: constant dense<0.0 +func @testIfConstant(%arg0: tensor) -> tensor<2xf32> { + %cst_zero = constant dense<0.0> : tensor<2xf32> + // CHECK: "tf.If"(%arg0) {else_branch = 
@tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + %0 = "tf.IfRegion"(%arg0) ({ + "tf.Yield"(%cst_zero) : (tensor<2xf32>) -> () + }, { + %cst_one = constant dense<1.0> : tensor<2xf32> + "tf.Yield"(%cst_one) : (tensor<2xf32>) -> () + }) { is_stateless = true} : (tensor) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +// Nested IfRegions +// CHECK: func @tf.IfRegion1_else +// CHECK-NEXT: "tf.Acos" +// CHECK-NEXT: "tf.Abs" + +// CHECK: func @tf.IfRegion1_then +// CHECK-NEXT: "tf.LogicalNot" +// CHECK-NEXT: "tf.Asin" +// CHECK-NEXT: "tf.If"({{.+}}) {else_branch = @tf.IfRegion_else, {{.+}} then_branch = @tf.IfRegion_then} + +// CHECK: func @tf.IfRegion_else +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then +// CHECK-NEXT: "tf.Abs" + +func @testNested(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "tf.If"({{.+}}) {else_branch = @tf.IfRegion1_else, {{.+}} then_branch = @tf.IfRegion1_then} + %0 = "tf.IfRegion"(%arg0) ({ + // Outer Then + %cond = "tf.LogicalNot"(%arg0) : (tensor) -> tensor + %asin = "tf.Asin"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + + // nested IfRegion + %1 = "tf.IfRegion"(%cond) ({ + %2 = "tf.Abs"(%asin) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }, { + %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }) { is_stateless = true } : (tensor) -> tensor<*xf32> + + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + // Outer Else + %acos = "tf.Acos"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + %3 = "tf.Abs"(%acos) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%3) : (tensor<*xf32>) -> () + }) { is_stateless = true } : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Match existing function->Region pattern (simple) +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "tf.If"({{.+}}) {else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + %0 = "tf.IfRegion"(%arg0) ( { + %1 = call @testIf1Then(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + %1 = call @testIf1Else(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }) {is_stateless = false} : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Match existing function->Region pattern (with casts) + +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> +func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: "tf.If"({{.+}}) {else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + %0 = "tf.IfRegion"(%arg0) ( { + %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor<*xf32> + %2 = call @testIf1Then(%1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }, { + %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor<*xf32> + %2 = call @testIf1Else(%1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }) {is_stateless = false} : (tensor) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +// No inputs, some outputs +// CHECK: func @tf.IfRegion_else() -> tensor<2xf32> +// CHECK-NEXT: constant dense<1.000000e+00> +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then() -> tensor<2xf32> +// CHECK-NEXT: constant dense<0.000000e+00> +// CHECK-NEXT: "tf.Abs" +func 
@testSimple(%arg0: tensor) -> tensor<2xf32> { + // CHECK: "tf.If"{{.+}}else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + %0 = "tf.IfRegion"(%arg0) ({ + %cst_zero = constant dense<0.0> : tensor<2xf32> + %1 = "tf.Abs"(%cst_zero) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%1) : (tensor<2xf32>) -> () + }, { + %cst_one = constant dense<1.0> : tensor<2xf32> + %2 = "tf.Neg"(%cst_one) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%2) : (tensor<2xf32>) -> () + }) { is_stateless = true } : (tensor) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +// No outputs, some inputs +// +// CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then(%arg0: tensor<*xf32>) +// CHECK-NEXT: "tf.Abs" +func @printer(tensor<*xf32>) -> () +func @testNoOutputs(%arg0: tensor, %arg1: tensor<*xf32>) -> () { + // CHECK: "tf.If"{{.+}}else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + "tf.IfRegion"(%arg0) ({ + %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + call @printer(%1) : (tensor<*xf32>) -> () + "tf.Yield"() : () -> () + }, { + %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + call @printer(%2) : (tensor<*xf32>) -> () + "tf.Yield"() : () -> () + }) { is_stateless = false } : (tensor) -> () + return +} + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc new file mode 100644 index 00000000000..5ab0eda08c6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -0,0 +1,117 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This transformation pass transforms functional control flow operations in the +// TensorFlow dialect to their region based counterparts, i.e., +// tf.If -> tf.IfRegion + +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { + +namespace { + +struct FunctionalControlFlowToRegions + : public PassWrapper> { + void runOnOperation() override; +}; + +// Create a call to function `fn` with arguments `args` and return the CallOp. +// The arguments are cast to the required type before the call. +CallOp CreateCall(Location loc, Operation::operand_range args, FuncOp fn, + OpBuilder* builder) { + FunctionType fn_type = fn.getType(); + llvm::SmallVector operands; + int num_operands = fn_type.getNumInputs(); + operands.reserve(num_operands); + for (const auto& ArgAndType : zip(args, fn_type.getInputs())) { + Value arg = std::get<0>(ArgAndType); + Type expected_type = std::get<1>(ArgAndType); + if (arg.getType() != expected_type) { + arg = builder->create(loc, expected_type, arg, + /*Truncate=*/builder->getBoolAttr(false)); + } + operands.push_back(arg); + } + return builder->create(loc, fn, operands); +} + +// Transform a functional IfOp to a region based IfRegionOp. +LogicalResult ConvertIfOp(IfOp if_op) { + auto if_region = OpBuilder(if_op).create( + if_op.getLoc(), if_op.getResultTypes(), if_op.cond(), + if_op.is_stateless()); + + // Insert call to the given function into the 'region'. 
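+  // For example (illustrative sketch, not exact IR), with then_branch = @f
+  // the generated region body is roughly:
+  //   %0 = "tf.Cast"(%input) ...   // emitted only when an input type differs
+  //   %1 = call @f(%0)
+  //   "tf.Yield"(%1)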
+ auto create_region_with_call = [&if_op](FlatSymbolRefAttr symbol, + Region& region) { + OpBuilder builder(region); + builder.createBlock(®ion); + auto func = if_op.getParentOfType().lookupSymbol( + symbol.getValue()); + auto call = CreateCall(if_op.getLoc(), if_op.input(), func, &builder); + builder.create(if_op.getLoc(), call.getResults()); + }; + + create_region_with_call(if_op.then_branchAttr(), if_region.then_branch()); + create_region_with_call(if_op.else_branchAttr(), if_region.else_branch()); + + if_op.replaceAllUsesWith(if_region.getResults()); + if_op.erase(); + return success(); +} + +void FunctionalControlFlowToRegions::runOnOperation() { + ModuleOp module = getOperation(); + auto result = module.walk([](Operation* op) { + if (IfOp if_op = llvm::dyn_cast(op)) { + if (failed(ConvertIfOp(if_op))) { + if_op.emitOpError() << " failed to convert to region form"; + return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }); + if (result.wasInterrupted()) return signalPassFailure(); +} +} // namespace + +std::unique_ptr> +CreateTFFunctionalControlFlowToRegions() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-functional-control-flow-to-regions", + "Transform functional control flow Ops to Region based counterparts"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 8f18c904420..7158d0f6be0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -32,10 +32,20 @@ std::unique_ptr> CreateFunctionalToExecutorDialectConversionPass(); namespace TF { -// Transforms functional control flow operations in the standard TensorFlow -// dialect to MLIR Control Flow Graph (CFG) form. +// Transforms functional control flow operations in the TensorFlow dialect to +// MLIR Control Flow Graph (CFG) form. std::unique_ptr> CreateTFFunctionalControlFlowToCFG(); +// Transforms functional control flow operations in the TensorFlow dialect to +// their region based counterparts. +std::unique_ptr> +CreateTFFunctionalControlFlowToRegions(); + +// Transforms region bases control flow operations in the TensorFlow dialect to +// their functional counterparts. +std::unique_ptr> +CreateTFRegionControlFlowToFunctional(); + // Materialize the MlirPassthroughOp by replacing it with the MLIR module // attached as an attribute. std::unique_ptr> CreateMaterializePassthroughOpPass(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc new file mode 100644 index 00000000000..ca0467942ca --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -0,0 +1,340 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This transformation pass transforms region bases control flow operations in +// the TensorFlow dialect to their functional counterparts, i.e., +// tf.IfRegion -> tf.If + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { + +namespace { + +struct RegionControlFlowToFunctional + : public PassWrapper> { + void runOnOperation() override; + + private: + LogicalResult ConvertIfOp(IfRegionOp if_region); + + // Get unique name by using the loc to name mapping. + std::string GetName(Operation* op, StringRef suffix); + + tensorflow::OpOrArgLocNameMapper mapper; + llvm::SmallVector worklist; +}; + +std::string RegionControlFlowToFunctional::GetName(Operation* op, + StringRef suffix) { + return (mapper.GetUniqueName(op) + suffix).str(); +} + +// Returns all the external values referenced from the given set of regions. If +// the external value is a constant, sink it into the region instead (and do not +// add it to the returned vector). +llvm::SmallVector CollectExternValues(ArrayRef regions) { + llvm::SetVector extern_values_set; + + for (auto region : regions) { + llvm::SetVector region_extern_values; + getUsedValuesDefinedAbove(*region, region_extern_values); + + // Sink down constants into the functions. + for (auto extern_value : region_extern_values) { + if (!matchPattern(extern_value, m_Constant())) { + extern_values_set.insert(extern_value); + continue; + } + // Add constant at start of region. + auto const_builder = OpBuilder::atBlockBegin(®ion->front()); + auto const_value = const_builder.clone(*extern_value.getDefiningOp()); + replaceAllUsesInRegionWith(extern_value, const_value->getResult(0), + *region); + } + } + + return {extern_values_set.begin(), extern_values_set.end()}; +} + +// Extracts the contents of a region with a single block into a new function. +// `extern_values` is the set of external values that the region refers to. +// +// Any inputs to the terminator of the region are converted to return values of +// the function. If any of these values is not exact type as the function's +// return type, appropriate cast operations will be inserted +void ExtractSingleBlockRegion(Region& region, FunctionType type, StringRef name, + llvm::SmallVectorImpl& extern_values, + llvm::SmallVectorImpl& worklist) { + ModuleOp module = region.getParentOfType(); + auto builder = OpBuilder::atBlockBegin(module.getBody()); + auto loc = region.getParentOp()->getLoc(); + + // Create new function and extract region body into the function. 
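+  // Sketch of the result (assuming a single captured value %v : T and one
+  // result): a private function of the form
+  //   func @<name>(%arg0: T) -> <result type> {
+  //     ...moved region body, with uses of %v replaced by %arg0...
+  //     return ...  // rewritten from "tf.Yield", casting if types differ
+  //   }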
+ auto outlined_func = + builder.create(loc, name, type, ArrayRef{}); + + outlined_func.getBody().takeBody(region); + Region& func_region = outlined_func.getBody(); + Block& first_block = func_region.front(); + + // Replace all external uses with function arguments. + for (auto it : llvm::enumerate(extern_values)) { + Value arg = first_block.addArgument(it.value().getType()); + replaceAllUsesInRegionWith(it.value(), arg, func_region); + } + + // Replace the existing terminator with a return. + Operation* terminator = outlined_func.getBody().front().getTerminator(); + builder.setInsertionPoint(terminator); + + SmallVector return_values; + return_values.reserve(terminator->getNumOperands()); + for (auto it : llvm::enumerate(type.getResults())) { + Value ret_val = terminator->getOperand(it.index()); + // Add a cast operation if types do not match. + if (ret_val.getType() != it.value()) { + ret_val = + builder.create(terminator->getLoc(), it.value(), ret_val); + } + return_values.push_back(ret_val); + } + builder.create(terminator->getLoc(), return_values); + terminator->erase(); + outlined_func.setVisibility(FuncOp::Visibility::Private); + + // Add the outlined function to the worklist in case its body has + // IfRegion ops that need to converted. + worklist.push_back(outlined_func); +} + +// Returns call for region with single call whose result feeds into the +// terminator of the region. Returns none if the region doesn't contain just +// call and non-truncting casts ops. +llvm::Optional IsSingleCallRegion(Region& region) { + if (!llvm::hasSingleElement(region)) return llvm::None; + + Block& block = region.front(); + auto it = block.rbegin(); + YieldOp yield = dyn_cast(*it++); + + if (it == block.rend()) return llvm::None; + + // Check if there is a Call before the Yield. + CallOp call = dyn_cast(*it++); + if (!call) return llvm::None; + + // There can only be non-truncating cast op's prior to the call. + for (; it != block.rend(); ++it) { + CastOp cast = dyn_cast(*it); + if (!cast || cast.Truncate()) return llvm::None; + } + + // All results of the call should feed into the yield. + if (call.getNumResults() != yield.getNumOperands()) return llvm::None; + + for (auto res_it : llvm::zip(call.getResults(), yield.getOperands())) + if (std::get<0>(res_it) != std::get<1>(res_it)) return llvm::None; + + return call; +} + +// Returns whether the arguments of the given call are same as the given list of +// arguments (after looking through cast ops). +bool MatchCallArgs(CallOp call, llvm::SmallVectorImpl& args) { + if (call.getNumOperands() != args.size()) return false; + + for (auto it : llvm::enumerate(args)) { + Value arg = call.getOperand(it.index()); + if (auto cast = dyn_cast_or_null(arg.getDefiningOp())) + arg = cast.getOperand(); + + if (arg != it.value()) return false; + } + return true; +} + +// Summary information for trivially transforming region based op's to +// functional ops. A trivial transformation can be done when the regions are +// just calls to functions, in which case no outlining is needed. +struct TrivialTransformInfo { + // Can the op be transformed trivially? + bool can_transform = false; + + // List of callee names (one for each region). + llvm::SmallVector callee_names; + + // List of arguments used in these call (each call uses the same arguments + // potentially through casts). 
+ llvm::SmallVector call_args; +}; + +// Analyzes the given set of regions (attached to the same parent op) to check +// if the parent op be transformed to functional form trivially (i.e., reusing +// existing functions and without outlining). This is possible when all the +// regions are single call regions and the all the calls have the same +// arguments. +// +// If this trivial transformation is possible, return the relevant information +// needed for the transformation (in `TrivialTransformInfo`), else indicate that +// a trivial transformation is not possible by setting `can_transform` false. +TrivialTransformInfo AnalyzeForTrivialTransform(ArrayRef regions) { + const TrivialTransformInfo cannot_transform; + + if (regions.empty()) return cannot_transform; + + llvm::SmallVector calls; + calls.reserve(regions.size()); + + // Verify each region is a single call and collect these calls. + for (Region* region : regions) { + auto call = IsSingleCallRegion(*region); + if (!call.hasValue()) return cannot_transform; + calls.push_back(call.getValue()); + } + + llvm::SmallVector callees; + callees.reserve(regions.size()); + + CallOp call0 = calls[0]; + int num_args = call0.getNumOperands(); + + // Collect arguments of the first call. + llvm::SmallVector call0_args; + call0_args.reserve(num_args); + for (Value arg : call0.getArgOperands()) { + if (auto cast = dyn_cast_or_null(arg.getDefiningOp())) + arg = cast.getOperand(); + call0_args.push_back(arg); + } + + // Match arguments of rest of the calls with those of the first call. + for (auto call : calls) { + if (call != call0 && !MatchCallArgs(call, call0_args)) + return cannot_transform; + callees.push_back(call.getCallee()); + } + + return {true, callees, call0_args}; +} + +// Transform IfRegionOp to IfOp. +LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { + const TrivialTransformInfo tti = AnalyzeForTrivialTransform( + {&if_region.then_branch(), &if_region.else_branch()}); + + std::string then_name, else_name; + llvm::SmallVector extern_values; + + if (tti.can_transform) { + // We can transform to functional form trivially without outlining. + then_name = tti.callee_names[0].str(); + else_name = tti.callee_names[1].str(); + extern_values = tti.call_args; + } else { + // Collect external values that are used within the else and then bodies. + extern_values = CollectExternValues( + {&if_region.then_branch(), &if_region.else_branch()}); + + // These external values need to be added as inputs to the generated If. The + // order is determined by the order of these values the `extern_vales`. + + // Build the type for the outlined function. + llvm::SmallVector input_types; + input_types.reserve(extern_values.size()); + for (auto input : extern_values) input_types.push_back(input.getType()); + + FunctionType func_type = FunctionType::get( + input_types, if_region.getResultTypes(), if_region.getContext()); + + // Create 2 new functions with the input signature matching this order, + // and outline the `then` and `else` regions by moving the bodies of these + // regions into these functions. Replace tf.yield with a regular return. 
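+    // Net effect (illustrative sketch): a region based op such as
+    //   %r = "tf.IfRegion"(%cond) ({... "tf.Yield"(%t)}, {... "tf.Yield"(%e)})
+    // is rewritten further below into
+    //   %r = "tf.If"(%cond, <captured values>)
+    //          {then_branch = @<name>_then, else_branch = @<name>_else, ...}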
+ then_name = GetName(if_region, "_then"); + ExtractSingleBlockRegion(if_region.then_branch(), func_type, then_name, + extern_values, worklist); + + else_name = GetName(if_region, "_else"); + ExtractSingleBlockRegion(if_region.else_branch(), func_type, else_name, + extern_values, worklist); + } + + // Once we have the `then` and `else` functions ready (either outlined or + // existing ones), replace the region based op with a functional control flow + // op. + OpBuilder builder(if_region); + auto if_op = builder.create( + if_region.getLoc(), if_region.getResultTypes(), if_region.cond(), + extern_values, then_name, else_name, if_region.is_stateless()); + if_region.replaceAllUsesWith(if_op.getResults()); + if_region.erase(); + return success(); +} + +void RegionControlFlowToFunctional::runOnOperation() { + ModuleOp module = getOperation(); + + // Seed worklist with all functions in the module. + worklist = llvm::to_vector<4>(module.getOps()); + + while (!worklist.empty()) { + FuncOp function = worklist.pop_back_val(); + + auto result = function.walk([&](Operation* op) { + if (IfRegionOp if_region = llvm::dyn_cast(op)) { + if (failed(ConvertIfOp(if_region))) { + if_region.emitOpError() << " failed to convert to functional form"; + return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) return signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> +CreateTFRegionControlFlowToFunctional() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-region-control-flow-to-functional", + "Transform region bases control flow Ops to functional counterparts"); + +} // namespace TF +} // namespace mlir From cf83ab15b0b38c82bd136537de95cb44bcb2624e Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Mon, 15 Jun 2020 09:59:35 -0700 Subject: [PATCH 0173/1390] Add support for f16 generated tanh kernel. PiperOrigin-RevId: 316487197 Change-Id: Id7daff4dc6264071c9371e9eb31c1f57ac044389 --- tensorflow/core/kernels/cubin_headers/BUILD | 1 + tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc | 4 +--- tensorflow/core/kernels/cwise_op_tanh.cc | 4 +--- .../kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc | 12 ++++++++++++ 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD index a7f810eeded..49ab1b8a911 100644 --- a/tensorflow/core/kernels/cubin_headers/BUILD +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -32,6 +32,7 @@ gen_kernel_library( name = "tanh", tile_size = "256", types = [ + "f16", "f32", "f64", ], diff --git a/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc index f3fb33a8635..59754a7260d 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc @@ -20,9 +20,7 @@ limitations under the License. 
namespace tensorflow { namespace functor { -#if MLIR_GENERATED_GPU_KERNELS_ENABLED -DEFINE_UNARY(tanh, Eigen::half); -#else +#ifndef MLIR_GENERATED_GPU_KERNELS_ENABLED DEFINE_UNARY3(tanh, Eigen::half, float, double); #endif DEFINE_SIMPLE_BINARY3(tanh_grad, Eigen::half, float, double); diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index 83152019608..1b6da56e537 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -21,9 +21,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double, complex64, complex128); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#if MLIR_GENERATED_GPU_KERNELS_ENABLED -REGISTER(UnaryOp, GPU, "Tanh", functor::tanh, Eigen::half); -#else +#ifndef MLIR_GENERATED_GPU_KERNELS_ENABLED REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double); #endif #endif diff --git a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc index ab4337c058f..40dd7c7e49e 100644 --- a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/cubin_headers/tanh_f16_kernel.h" #include "tensorflow/core/kernels/cubin_headers/tanh_f32_kernel.h" #include "tensorflow/core/kernels/cubin_headers/tanh_f64_kernel.h" #include "tensorflow/core/lib/core/errors.h" @@ -102,6 +103,14 @@ class MlirGenerateTanhOp : public OpKernel { std::mutex mu_; }; +class MlirGenerateTanhF16Op : public MlirGenerateTanhOp { + public: + explicit MlirGenerateTanhF16Op(OpKernelConstruction* ctx) + : MlirGenerateTanhOp(ctx) { + cubin_data_ = kTanhF16Kernel; + } +}; + class MlirGenerateTanhF32Op : public MlirGenerateTanhOp { public: explicit MlirGenerateTanhF32Op(OpKernelConstruction* ctx) @@ -119,6 +128,9 @@ class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { }; } // namespace +REGISTER_KERNEL_BUILDER( + Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), + MlirGenerateTanhF16Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), MlirGenerateTanhF32Op); From 4d1593ce36f4b555062a8107471c2651bd7d2d34 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 15 Jun 2020 10:01:00 -0700 Subject: [PATCH 0174/1390] internal change only PiperOrigin-RevId: 316487501 Change-Id: Icdc8cd56a73e200fc4309e3103a40f7ef977d073 --- .../profiler/convert/xplane_to_op_stats.cc | 8 ++--- .../convert/xplane_to_op_stats_test.cc | 36 ------------------- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index 9d3aca9d831..eb2e13dbb4a 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -96,8 +96,7 @@ void SetRunEnvironment(int32 accelerator_count, RunEnvironment* env) { } void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events, - OpMetricsDb* op_metrics_db, StepEvents* step_events, - TfFunctionDb* tf_function_db) { + OpMetricsDb* op_metrics_db, StepEvents* step_events) { absl::flat_hash_map tf_ops = CollectTfOpsFromHostThreadsXPlane(*host_plane); OpMetricsDbCombiner combiner(op_metrics_db); @@ -108,8 +107,6 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events, CombineStepEvents(ConvertHostThreadsXLineToStepEvents( line, use_device_step_events, *step_events), step_events); - CombineTfFunctionDb(ConvertHostThreadsXLineToTfFunctionDb(line), - tf_function_db); }); } @@ -166,8 +163,7 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) { bool has_device = !device_planes.empty(); if (host_plane) { ProcessHostPlane(host_plane, has_device, - op_stats.mutable_host_op_metrics_db(), &step_events, - op_stats.mutable_tf_function_db()); + op_stats.mutable_host_op_metrics_db(), &step_events); } StepEvents nonoverlapped_step_events = ToNonOverlappedStepEvents(step_events); *op_stats.mutable_step_db() = diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index 1812a5592bc..138bcee72be 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -157,42 +157,6 @@ TEST(ConvertXPlaneToOpStats, GpuStepDbTest) { EXPECT_EQ(precision_stats.compute_32bit_ps(), 40); } -TEST(ConcertXPlaneToOpStats, TfFunctionTest) { - XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); - host_plane_builder.ReserveLines(1); - std::string kFunctionName = "increment"; - - auto main_thread = host_plane_builder.GetOrCreateLine(0); - CreateTfFunctionCallEvent(&host_plane_builder, &main_thread, kFunctionName, - 10, 100, "traced-nonXla", 1); - CreateTfFunctionCallEvent(&host_plane_builder, &main_thread, kFunctionName, - 150, 20, "notTraced-nonXla", 1); - CreateTfFunctionCallEvent(&host_plane_builder, &main_thread, kFunctionName, - 200, 80, "traced-nonXla", 2); - - OpStats op_stats = ConvertXSpaceToOpStats(space); - const TfFunctionDb& tf_function_db = op_stats.tf_function_db(); - - EXPECT_EQ(tf_function_db.tf_functions().size(), 1); - EXPECT_EQ(tf_function_db.tf_functions().count(kFunctionName), 1); - const TfFunction& tf_function = - tf_function_db.tf_functions().at(kFunctionName); - EXPECT_EQ(tf_function.total_tracing_count(), 2); - EXPECT_EQ(tf_function.compiler(), OTHER_COMPILER); - const auto& metrics = tf_function.metrics(); - EXPECT_EQ(metrics.size(), 2); - EXPECT_EQ(metrics.count(TRACED_MODE), 1); - EXPECT_EQ(metrics.count(NOT_TRACED_MODE), 1); - const auto& traced_mode = metrics.at(TRACED_MODE); - 
EXPECT_EQ(traced_mode.count(), 2); - EXPECT_EQ(traced_mode.self_time_ps(), 180); - const auto& not_traced_mode = metrics.at(NOT_TRACED_MODE); - EXPECT_EQ(not_traced_mode.count(), 1); - EXPECT_EQ(not_traced_mode.self_time_ps(), 20); -} - TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) { XSpace space; static constexpr char kError[] = "host: error"; From 8d32eb3bd10aceea68118556e500e87f5565a983 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Mon, 15 Jun 2020 10:12:43 -0700 Subject: [PATCH 0175/1390] Handle tf.Case in tf-tensor-list-ops-decomposition pass. PiperOrigin-RevId: 316490068 Change-Id: I8f9502c3b8361e767b6333428cffa68fe3d8a3ad --- .../tests/tensor_list_ops_decomposition.mlir | 62 ++++++++++++ .../tensor_list_ops_decomposition.cc | 97 +++++++++++-------- 2 files changed, 119 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 49365db57f6..c453a3815f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -248,6 +248,68 @@ func @if_else(%arg0: tensor>>) -> tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + %case_op = "tf.Case"(%arg0, %tl) {branches = [@branch_0, @branch_1, @branch_2]} + : (tensor, tensor>>) -> tensor>> + // CHECK: "tf.Slice" + %pop:2 = "tf.TensorListPopBack"(%case_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NOT: tf.TensorListPopBack + // CHECK: return + return +} +// CHECK: func @branch_0(%[[TARG0:.*]]: tensor<10xf32>, %[[TARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @branch_0(%arg0: tensor>>) -> tensor>> { + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TARG1]], %[[CONST1]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + // CHECK: return %[[UPDATE]], %[[ADD]] + return %push : tensor>> +} +// CHECK: func @branch_1(%[[EARG0:.*]]: tensor<10xf32>, %[[EARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @branch_1(%arg0: tensor>>) -> tensor>> { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NOT: "tf.TensorListPopBack" + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + // CHECK-NOT: "tf.TensorListPopBack" + 
%pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: return %[[COPY]], %[[SUB]] + return %pop#0 : tensor>> +} +// CHECK: func @branch_2(%[[EARG0:.*]]: tensor<10xf32>, %[[EARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @branch_2(%arg0: tensor>>) -> tensor>> { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NOT: "tf.TensorListPopBack" + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + // CHECK-NOT: "tf.TensorListPopBack" + %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: return %[[COPY]], %[[SUB]] + return %pop#0 : tensor>> +} +// ----- + // Tests PartitionedCall/StatefulPartitionedCall. // CHECK-LABEL: func @main diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index b2203c890e3..9733bfe2290 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -216,59 +216,62 @@ LogicalResult HandleWhileOp( return success(); } -LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, - llvm::SmallDenseMap* buffer_to_size, - llvm::StringMap* - decomposed_partitioned_call_callees) { +template +LogicalResult HandleCaseOrIfOp( + CaseOrIfOp op, ArrayRef branches, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::StringMap* + decomposed_partitioned_call_callees) { // Rewrite the branches. 
- auto then_branch = module.lookupSymbol(if_op.then_branch()); - auto else_branch = module.lookupSymbol(if_op.else_branch()); - llvm::SmallDenseMap then_map; - llvm::SmallDenseMap else_map; + SmallVector, 2> branch_maps; + branch_maps.resize(branches.size()); auto find_arg_buffer_type = [&](int64_t index) -> llvm::Optional { - auto it = buffer_to_size->find(if_op.getOperand(index + 1)); + auto it = buffer_to_size->find(op.getOperand(index + 1)); if (it == buffer_to_size->end()) return llvm::None; return it->getFirst().getType(); }; auto arg_buffer_size_is_fixed = [&](int64_t index) { - return (*buffer_to_size)[if_op.getOperand(index + 1)].fixed; + return (*buffer_to_size)[op.getOperand(index + 1)].fixed; }; - OpBuilder builder(if_op); - ModifyFunctionSignature(then_branch, cutil::GetSizeType(builder), &then_map, - find_arg_buffer_type, arg_buffer_size_is_fixed); - ModifyFunctionSignature(else_branch, cutil::GetSizeType(builder), &else_map, - find_arg_buffer_type, arg_buffer_size_is_fixed); - const bool arg_no_changed = then_map.empty(); - if (failed(DecomposeTensorListOpsInternal( - &then_branch.front(), module, &then_map, - decomposed_partitioned_call_callees)) || - failed(DecomposeTensorListOpsInternal( - &else_branch.front(), module, &else_map, - decomposed_partitioned_call_callees))) { - return failure(); + OpBuilder builder(op); + for (const auto& pair : llvm::zip(branches, branch_maps)) { + FuncOp branch = std::get<0>(pair); + llvm::SmallDenseMap& branch_map = std::get<1>(pair); + ModifyFunctionSignature(branch, cutil::GetSizeType(builder), &branch_map, + find_arg_buffer_type, arg_buffer_size_is_fixed); + + if (failed(DecomposeTensorListOpsInternal( + &branch.front(), module, &branch_map, + decomposed_partitioned_call_callees))) + return failure(); } + + const bool arg_no_changed = branch_maps.front().empty(); auto output_buffer_to_size = - AddTensorListSizesToReturn(then_branch, then_map); - AddTensorListSizesToReturn(else_branch, else_map); + AddTensorListSizesToReturn(branches.front(), branch_maps.front()); + for (const auto& pair : llvm::drop_begin(llvm::zip(branches, branch_maps), 1)) + AddTensorListSizesToReturn(std::get<0>(pair), std::get<1>(pair)); + if (output_buffer_to_size.empty() && arg_no_changed) return success(); - // Recreate the If op. - auto new_if_operands = llvm::to_vector<8>(if_op.getOperands()); - for (int64_t i = 1; i < if_op.getNumOperands(); ++i) { - auto it = buffer_to_size->find(if_op.getOperand(i)); + + // Recreate the op. 
+ auto new_operands = llvm::to_vector<8>(op.getOperands()); + for (int64_t i = 1; i < op.getNumOperands(); ++i) { + auto it = buffer_to_size->find(op.getOperand(i)); if (it == buffer_to_size->end()) continue; - new_if_operands.push_back(it->getSecond().size); + new_operands.push_back(it->getSecond().size); } - auto new_if = OpBuilder(if_op).create( - if_op.getLoc(), then_branch.getType().getResults(), new_if_operands, - if_op.getAttrs()); + FuncOp first_branch = branches.front(); + auto new_op = OpBuilder(op).create( + op.getLoc(), first_branch.getType().getResults(), new_operands, + op.getAttrs()); for (const auto& entry : output_buffer_to_size) { - (*buffer_to_size)[new_if.getResult(std::get<0>(entry))] = { - new_if.getResult(std::get<1>(entry)), std::get<2>(entry)}; + (*buffer_to_size)[new_op.getResult(std::get<0>(entry))] = { + new_op.getResult(std::get<1>(entry)), std::get<2>(entry)}; } - if_op.replaceAllUsesWith( - new_if.getResults().take_front(if_op.getNumResults())); - if_op.erase(); + op.replaceAllUsesWith(new_op.getResults().take_front(op.getNumResults())); + op.erase(); return success(); } @@ -710,8 +713,22 @@ LogicalResult DecomposeTensorListOpsInternal( return failure(); } } else if (auto if_op = llvm::dyn_cast(&op)) { - if (failed(HandleIfOp(if_op, module, buffer_to_size, - decomposed_partitioned_call_callees))) { + auto then_branch = module.lookupSymbol(if_op.then_branch()); + auto else_branch = module.lookupSymbol(if_op.else_branch()); + + if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch}, module, + buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto case_op = llvm::dyn_cast(&op)) { + SmallVector branches; + for (auto branch_symbol : case_op.branches()) { + branches.push_back(module.lookupSymbol( + branch_symbol.cast())); + } + if (failed(HandleCaseOrIfOp(case_op, branches, module, buffer_to_size, + decomposed_partitioned_call_callees))) { return failure(); } } else if (auto pcall = llvm::dyn_cast(&op)) { From 54f1e52b95bb5f4c5a204b736a9a815ace108e68 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 10:12:45 -0700 Subject: [PATCH 0176/1390] fix deprecated message. PiperOrigin-RevId: 316490075 Change-Id: I68d52d80ccf9c2f77f870dd7f4f1d146e1e44a46 --- tensorflow/python/keras/saving/saving_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py index 9a407f64faa..3c9c33531bf 100644 --- a/tensorflow/python/keras/saving/saving_utils.py +++ b/tensorflow/python/keras/saving/saving_utils.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import copy import os import six @@ -31,6 +30,7 @@ from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest +from tensorflow.python.util.compat import collections_abc def extract_model_metrics(model): @@ -79,7 +79,8 @@ def model_input_signature(model, keep_original_batch_size=False): return None input_specs = _enforce_names_consistency(input_specs) # Return a list with a single element as the model's input signature. 
- if isinstance(input_specs, collections.Sequence) and len(input_specs) == 1: + if isinstance(input_specs, + collections_abc.Sequence) and len(input_specs) == 1: # Note that the isinstance check filters out single-element dictionaries, # which should also be wrapped as a single-element list. return input_specs From e3b8a8e7207a4bc16287af77a5e9fd0b0df86db2 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 15 Jun 2020 10:39:49 -0700 Subject: [PATCH 0177/1390] [tf.data service] Increase default number of uncompress threads to 4. A single thread may not be able to uncompress data as quickly as it is requested. PiperOrigin-RevId: 316496226 Change-Id: I25842acc4485c509987654bcec8f38f4b4a067b7 --- .../data/experimental/ops/data_service_ops.py | 3 +-- .../kernel_tests/data_service_ops_test.py | 21 +++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index dd81614fa45..39790d843ba 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -241,8 +241,7 @@ def _distribute(processing_mode, # TODO(b/157105111): Make this an autotuned parallel map when we have a way # to limit memory usage. dataset = dataset.map( - lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec), - num_parallel_calls=4) + lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec)) # Disable autosharding for shared jobs. if job_name: diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index 440a4f46a20..d316009ce0c 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -201,18 +201,13 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): self._new_worker = server_lib.WorkerServer( port=port, master_address=self._master._address, protocol=PROTOCOL) - # There may have been some elements prefetched from the first worker - # before it was stopped. - while True: - val = next(iterator).numpy() - if val == 0: - break - # The dataset starts over now that we read from the new worker. - # TODO(b/157086991): Iterate until end of sequence when we support - # detecting lost workers. - for i in range(1, num_elements // 2): + for i in range(num_elements): val = next(iterator).numpy() + if val == midpoint and i != midpoint: + # There may have been one last element prefetched from the first worker + # before it was stopped. 
+ val = next(iterator).numpy() self.assertEqual(i, val) @combinations.generate(test_base.eager_only_combinations()) @@ -296,7 +291,7 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testSharedJobNameRepeat(self): - num_elements = 100 + num_elements = 10 num_repetitions = 3 master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) @@ -307,9 +302,9 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): results = [] iter1 = iter(ds1) iter2 = iter(ds2) - for _ in range(((num_elements * num_repetitions) // 3)): + for _ in range(((num_elements * num_repetitions) // 2) - 1): results.append(next(iter1).numpy()) - for _ in range(((num_elements * num_repetitions) // 3)): + for _ in range(((num_elements * num_repetitions) // 2) - 1): results.append(next(iter2).numpy()) for elem in iter1: results.append(elem.numpy()) From a2dc78458bd1267625b16469730b8b743ac1514c Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 15 Jun 2020 10:40:48 -0700 Subject: [PATCH 0178/1390] Fork tracking/util_with_v1_optimizers_test to keras/tests. PiperOrigin-RevId: 316496459 Change-Id: I9563ba20c0d3cd72bc0651e49ab9a48e0b355d34 --- tensorflow/python/keras/tests/BUILD | 31 + .../tracking_util_with_v1_optimizers_test.py | 703 ++++++++++++++++++ tensorflow/python/training/tracking/BUILD | 15 - .../tracking/util_with_v1_optimizers_test.py | 664 ----------------- 4 files changed, 734 insertions(+), 679 deletions(-) create mode 100644 tensorflow/python/keras/tests/tracking_util_with_v1_optimizers_test.py diff --git a/tensorflow/python/keras/tests/BUILD b/tensorflow/python/keras/tests/BUILD index ad52d33abc6..d03b1bd1ee8 100644 --- a/tensorflow/python/keras/tests/BUILD +++ b/tensorflow/python/keras/tests/BUILD @@ -435,6 +435,37 @@ tf_py_test( ], ) +tf_py_test( + name = "tracking_util_with_v1_optimizers_test", + srcs = ["tracking_util_with_v1_optimizers_test.py"], + tags = [ + "notsan", # b/74395663 + ], + deps = [ + "//tensorflow/python:checkpoint_management", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:extra_py_tests_deps", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:saver", + "//tensorflow/python:state_ops", + "//tensorflow/python:training_lib", + "//tensorflow/python:training_util", + "//tensorflow/python/distribute:mirrored_strategy", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:test", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + "//tensorflow/python/training/tracking", + "//tensorflow/python/training/tracking:graph_view", + "//tensorflow/python/training/tracking:util", + ], +) + py_library( name = "get_config_samples", srcs = ["get_config_samples.py"], diff --git a/tensorflow/python/keras/tests/tracking_util_with_v1_optimizers_test.py b/tensorflow/python/keras/tests/tracking_util_with_v1_optimizers_test.py new file mode 100644 index 00000000000..b6711ea6fe5 --- /dev/null +++ b/tensorflow/python/keras/tests/tracking_util_with_v1_optimizers_test.py @@ -0,0 +1,703 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for object-based saving which use tf.train.* optimizers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +import six + +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.eager import def_function +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import adam +from tensorflow.python.training import checkpoint_management +from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training import training_util +from tensorflow.python.training.tracking import graph_view +from tensorflow.python.training.tracking import tracking +from tensorflow.python.training.tracking import util as trackable_utils + + +class NonLayerTrackable(tracking.AutoTrackable): + + def __init__(self): + super(NonLayerTrackable, self).__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[]) + + +# pylint: disable=not-callable +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. + self._non_layer = NonLayerTrackable() + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class CheckpointingTests(test.TestCase): + + @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + def testNamingWithOptimizer(self): + input_value = constant_op.constant([[3.]]) + model = MyModel() + # A nuisance Model using the same optimizer. Its slot variables should not + # go in the checkpoint, since it is never depended on. 
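The comment just above (which continues with `other_model` below) relies on the core rule of object-based checkpointing: only objects reachable from the `Checkpoint`'s attributes receive keys in the saved file. A hedged standalone sketch of that reachability rule, using `tf.Module` stand-ins and a hypothetical /tmp prefix rather than the test's `MyModel` fixture:

import tensorflow as tf

tracked = tf.Module()
tracked.v = tf.Variable(1.0)

orphan = tf.Module()            # never attached to the checkpoint below
orphan.v = tf.Variable(2.0)

ckpt = tf.train.Checkpoint(model=tracked)
path = ckpt.save("/tmp/reachability_demo_ckpt")
print([name for name, _ in tf.train.list_variables(path)])
# Includes "model/v/.ATTRIBUTES/VARIABLE_VALUE" plus bookkeeping entries;
# nothing from `orphan` is written.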
+ other_model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = training_util.get_or_create_global_step() + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + if context.executing_eagerly(): + optimizer.minimize( + lambda: model(input_value), + global_step=optimizer_step) + optimizer.minimize( + lambda: other_model(input_value), + global_step=optimizer_step) + else: + train_op = optimizer.minimize( + model(input_value), global_step=optimizer_step) + optimizer.minimize( + other_model(input_value), + global_step=optimizer_step) + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + named_variables, serialized_graph, _ = graph_view.ObjectGraphView( + root_trackable).serialize_object_graph() + expected_checkpoint_names = ( + # Created in the root node, so no prefix. + "optimizer_step", + "model/_second/kernel", + "model/_named_dense/kernel", + "model/_named_dense/bias", + # non-Layer dependency of the model + "model/_non_layer/a_variable", + # The optimizer creates two non-slot variables + "optimizer/beta1_power", + "optimizer/beta2_power", + # Slot variables + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", + ) + suffix = "/.ATTRIBUTES/VARIABLE_VALUE" + expected_checkpoint_names = [ + name + suffix for name in expected_checkpoint_names] + named_variables = {v.name: v for v in named_variables} + six.assertCountEqual(self, expected_checkpoint_names, + named_variables.keys()) + # Check that we've mapped to the right variable objects (not exhaustive) + self.assertEqual( + "global_step", + named_variables["optimizer_step" + suffix].full_name) + self.assertEqual( + "my_model/dense_1/kernel", + named_variables["model/_second/kernel" + suffix].full_name) + self.assertEqual( + "my_model/dense/kernel", + named_variables["model/_named_dense/kernel" + suffix].full_name) + self.assertEqual( + "beta1_power", + named_variables["optimizer/beta1_power" + suffix].full_name) + self.assertEqual( + "beta2_power", + named_variables["optimizer/beta2_power" + suffix].full_name) + # Spot check the generated protocol buffers. + self.assertEqual("optimizer", + serialized_graph.nodes[0].children[1].local_name) + optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[ + 1].node_id] + self.assertEqual("beta1_power", + optimizer_node.children[0].local_name) + self.assertEqual("beta1_power", + serialized_graph.nodes[optimizer_node.children[0].node_id] + .attributes[0].full_name) + self.assertEqual( + "my_model/dense/kernel", + serialized_graph.nodes[optimizer_node.slot_variables[0] + .original_variable_node_id] + .attributes[0].full_name) + # We strip off the :0 suffix, as variable.name-based saving does. 
+ self.assertEqual( + "my_model/dense/kernel/Adam", + serialized_graph.nodes[optimizer_node.slot_variables[0] + .slot_variable_node_id] + .attributes[0].full_name) + self.assertEqual( + "my_model/dense/kernel/Adam:0", + optimizer.get_slot( + var=model._named_dense.kernel, + name="m").name) + self.assertEqual( + "model/_named_dense/kernel" + suffix, + serialized_graph.nodes[ + optimizer_node.slot_variables[0] + .original_variable_node_id].attributes[0].checkpoint_key) + self.assertEqual("m", optimizer_node.slot_variables[0].slot_name) + self.assertEqual( + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix, + serialized_graph.nodes[ + optimizer_node.slot_variables[0] + .slot_variable_node_id].attributes[0].checkpoint_key) + + @test_util.run_in_graph_and_eager_modes + def testSaveRestore(self): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model) + input_value = constant_op.constant([[3.]]) + if context.executing_eagerly(): + optimizer.minimize( + lambda: model(input_value)) + else: + train_op = optimizer.minimize(model(input_value)) + # TODO(allenl): Make initialization more pleasant when graph building. + root_trackable.save_counter # pylint: disable=pointless-statement + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.])) + m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") + self.evaluate(state_ops.assign(m_bias_slot, [1.5])) + save_path = root_trackable.save(file_prefix=prefix) + self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.])) + self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) + optimizer_variables = self.evaluate(optimizer.variables()) + self.evaluate(state_ops.assign(m_bias_slot, [-2.])) + # Immediate restoration + status = root_trackable.restore(save_path=save_path).assert_consumed() + status.run_restore_ops() + self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) + self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) + self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) + if not context.executing_eagerly(): + return # Restore-on-create is only supported when executing eagerly + on_create_model = MyModel() + on_create_optimizer = adam.AdamOptimizer( + 0.001, + # Preserve beta1_power and beta2_power when applying gradients so we can + # test that they've been restored correctly. + beta1=1.0, + beta2=1.0) + on_create_root = trackable_utils.Checkpoint( + optimizer=on_create_optimizer, model=on_create_model) + # Deferred restoration + status = on_create_root.restore(save_path=save_path) + status.assert_nontrivial_match() + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + on_create_model(constant_op.constant([[3.]])) # create variables + self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) + self.assertAllEqual([42.], + self.evaluate( + on_create_model._named_dense.variables[1])) + on_create_m_bias_slot = on_create_optimizer.get_slot( + on_create_model._named_dense.variables[1], "m") + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + # Optimizer slot variables are created when the original variable is + # restored. 
+ self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) + self.assertAllEqual(optimizer_variables[2:], + self.evaluate(on_create_optimizer.variables())) + dummy_var = resource_variable_ops.ResourceVariable([1.]) + on_create_optimizer.minimize(loss=dummy_var.read_value) + status.assert_existing_objects_matched() + status.assert_consumed() + beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators() + self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power)) + self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power)) + + # TODO(allenl): Debug garbage created by this test in python3. + def testDeferredRestorationUsageEager(self): + """An idiomatic eager execution example.""" + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + optimizer_step=training_util.get_or_create_global_step()) + root.restore(checkpoint_management.latest_checkpoint( + checkpoint_directory)) + for _ in range(num_training_steps): + # TODO(allenl): Use a Dataset and serialize/checkpoint it. + input_value = constant_op.constant([[3.]]) + optimizer.minimize( + lambda: model(input_value), # pylint: disable=cell-var-from-loop + global_step=root.optimizer_step) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy()) + + def testEagerDistributionStrategy(self): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + def _train_fn(optimizer, model): + input_value = constant_op.constant([[3.]]) + optimizer.minimize( + functools.partial(model, input_value), + global_step=root.optimizer_step) + + strategy = mirrored_strategy.MirroredStrategy() + with strategy.scope(): + for training_continuation in range(3): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, + model=model, + optimizer_step=training_util.get_or_create_global_step()) + root.restore( + checkpoint_management.latest_checkpoint(checkpoint_directory)) + + for _ in range(num_training_steps): + strategy.extended.call_for_each_replica( + functools.partial(_train_fn, optimizer, model)) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy()) + + def testGraphDistributionStrategy(self): + self.skipTest("b/121381184") + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + def _train_fn(optimizer, model): + input_value = constant_op.constant([[3.]]) + return optimizer.minimize( + functools.partial(model, input_value), + global_step=root.optimizer_step) + + for training_continuation in range(3): + with ops.Graph().as_default(): + strategy = mirrored_strategy.MirroredStrategy() + with strategy.scope(): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + optimizer_step=training_util.get_or_create_global_step()) + status = root.restore(checkpoint_management.latest_checkpoint( + checkpoint_directory)) + train_op = strategy.extended.call_for_each_replica( + functools.partial(_train_fn, optimizer, 
model)) + with self.session() as session: + if training_continuation > 0: + status.assert_consumed() + status.initialize_or_restore() + for _ in range(num_training_steps): + session.run(train_op) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy()) + + def testUsageGraph(self): + """Expected usage when graph building.""" + with context.graph_mode(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with ops.Graph().as_default(): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.CheckpointV1( + optimizer=optimizer, model=model, + global_step=training_util.get_or_create_global_step()) + input_value = constant_op.constant([[3.]]) + train_op = optimizer.minimize( + model(input_value), + global_step=root.global_step) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + with self.session(graph=ops.get_default_graph()) as session: + status = root.restore(save_path=checkpoint_path) + status.initialize_or_restore(session=session) + if checkpoint_path is None: + self.assertEqual(0, training_continuation) + with self.assertRaises(AssertionError): + status.assert_consumed() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + else: + status.assert_consumed() + status.assert_existing_objects_matched() + for _ in range(num_training_steps): + session.run(train_op) + root.save(file_prefix=checkpoint_prefix, session=session) + self.assertEqual((training_continuation + 1) * num_training_steps, + session.run(root.global_step)) + self.assertEqual(training_continuation + 1, + session.run(root.save_counter)) + + @test_util.run_in_graph_and_eager_modes + def testAgnosticUsage(self): + """Graph/eager agnostic usage.""" + # Does create garbage when executing eagerly due to ops.Graph() creation. + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + for training_continuation in range(3): + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + global_step=training_util.get_or_create_global_step()) + manager = checkpoint_management.CheckpointManager( + root, checkpoint_directory, max_to_keep=1) + status = root.restore(save_path=manager.latest_checkpoint) + input_value = constant_op.constant([[3.]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + manager.save() + self.assertEqual((training_continuation + 1) * num_training_steps, + self.evaluate(root.global_step)) + self.assertEqual(training_continuation + 1, + self.evaluate(root.save_counter)) + + # pylint: disable=cell-var-from-loop + @test_util.run_in_graph_and_eager_modes + def testWithDefun(self): + num_training_steps = 2 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with test_util.device(use_gpu=True): + model = MyModel() + # Don't actually train so we can test variable values + optimizer = adam.AdamOptimizer(0.) 
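testAgnosticUsage above exercises the public restore-train-save loop built from `tf.train.Checkpoint` and `tf.train.CheckpointManager`. A minimal standalone sketch of that loop, with a bare step counter and a hypothetical /tmp directory standing in for the test's model and optimizer:

import tensorflow as tf

step = tf.Variable(0, dtype=tf.int64)
ckpt = tf.train.Checkpoint(step=step)
manager = tf.train.CheckpointManager(ckpt, "/tmp/agnostic_demo", max_to_keep=1)

ckpt.restore(manager.latest_checkpoint)   # a no-op on the very first run
for _ in range(10):
  step.assign_add(1)
manager.save()                            # checkpoints beyond max_to_keep are pruned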
+ root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + global_step=training_util.get_or_create_global_step()) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + status = root.restore(save_path=checkpoint_path) + def train_fn(): + @def_function.function + def _call_model(x): + return model(x) + with backprop.GradientTape() as tape: + loss = _call_model(constant_op.constant([[3.]])) + gradients = tape.gradient(loss, model.variables) + return optimizer.apply_gradients(zip(gradients, model.variables), + global_step=root.global_step) + if not context.executing_eagerly(): + train_fn = functools.partial( + self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + if training_continuation > 0: + status.assert_consumed() + self.assertAllClose([[42.]], self.evaluate(model.variables[0])) + else: + self.evaluate(model.variables[0].assign([[42.]])) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual((training_continuation + 1) * num_training_steps, + self.evaluate(root.global_step)) + self.assertEqual(training_continuation + 1, + self.evaluate(root.save_counter)) + # pylint: enable=cell-var-from-loop + + def _get_checkpoint_name(self, name): + root = tracking.AutoTrackable() + trackable_utils.add_variable( + root, name=name, shape=[1, 2], dtype=dtypes.float64) + (named_variable,), _, _ = trackable_utils._serialize_object_graph( + root, saveables_cache=None) + with ops.name_scope("root/" + named_variable.name): + pass # Make sure we can use this as an op name if we prefix it. + return named_variable.name + + def testAnonymousVarsInInit(self): + + class Model(training.Model): + + def __init__(self): + super(Model, self).__init__() + self.w = resource_variable_ops.ResourceVariable(0.0) + self.b = resource_variable_ops.ResourceVariable(0.0) + self.vars = [self.w, self.b] + + def call(self, x): + return x * self.w + self.b + + with context.eager_mode(): + model = Model() + optimizer = adam.AdamOptimizer(learning_rate=0.05) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + checkpoint = trackable_utils.Checkpoint( + model=model, optimizer=optimizer) + for _ in range(2): + checkpoint.save(checkpoint_prefix) + with backprop.GradientTape() as tape: + loss = (constant_op.constant(1.) + - model(constant_op.constant(1.))) ** 2 + grad = tape.gradient(loss, model.vars) + optimizer.apply_gradients( + [(g, v) for g, v in zip(grad, model.vars)]) + + @test_util.run_in_graph_and_eager_modes + def test_initialize_if_not_restoring(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + model=model, # Do not save the optimizer with the checkpoint. 
+ global_step=training_util.get_or_create_global_step()) + optimizer_checkpoint = trackable_utils.Checkpoint( + optimizer=optimizer) + + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) + status = root.restore(save_path=checkpoint_path) + input_value = constant_op.constant([[3.]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + self.evaluate([v.initializer for v in optimizer.variables()]) + train_fn() + model_save_path = root.save(file_prefix=checkpoint_prefix) + self.evaluate(optimizer.variables()[0].assign(42.)) + optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) + + # Restore into a graph with the optimizer + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + global_step=training_util.get_or_create_global_step()) + status = root.restore(save_path=model_save_path) + input_value = constant_op.constant([[3.]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + train_fn() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + + # Make sure initialization doesn't clobber later restores + with test_util.device(use_gpu=True): + model = MyModel() + optimizer = adam.AdamOptimizer(0.001, beta1=1.0) + root = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, + global_step=training_util.get_or_create_global_step()) + opt_root = trackable_utils.Checkpoint( + optimizer=optimizer) + status = root.restore(save_path=model_save_path) + init_only_optimizer_status = opt_root.restore(save_path=None) + optimizer_status = opt_root.restore(save_path=optimizer_save_path) + input_value = constant_op.constant([[3.]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step) + if not context.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + optimizer_status.run_restore_ops() + status.initialize_or_restore() + init_only_optimizer_status.initialize_or_restore() + train_fn() + self.assertEqual(42., self.evaluate(optimizer.variables()[0])) + + +class CheckpointCompatibilityTests(test.TestCase): + + def _initialized_model(self): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = training_util.get_or_create_global_step() + root_trackable = trackable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + train_op = optimizer.minimize( + functools.partial(model, input_value), + global_step=optimizer_step) + self.evaluate(trackable_utils.gather_initializers( + root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. 
+ self.evaluate(model._named_dense.bias.assign([1.])) + self.evaluate(optimizer.get_slot( + var=model._named_dense.bias, name="m").assign([2.])) + beta1_power, _ = optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(3.)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m") + .assign([102.])) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(103.)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.], self.evaluate(root_trackable.model._named_dense.bias)) + self.assertAllEqual([2.], self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m"))) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.assertAllEqual(3., self.evaluate(beta1_power)) + + def _write_name_based_checkpoint(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph) as session: + root = self._initialized_model() + name_saver = saver_lib.Saver() + return name_saver.save( + sess=session, save_path=checkpoint_prefix, + global_step=root.optimizer_step) + + @test_util.run_in_graph_and_eager_modes + def testLoadFromNameBasedSaver(self): + """Save a name-based checkpoint, load it using the object-based API.""" + with test_util.device(use_gpu=True): + save_path = self._write_name_based_checkpoint() + root = self._initialized_model() + self._set_sentinels(root) + with self.assertRaises(AssertionError): + self._check_sentinels(root) + object_saver = trackable_utils.TrackableSaver( + graph_view.ObjectGraphView(root)) + self._set_sentinels(root) + status = object_saver.restore(save_path) + if context.executing_eagerly(): + self._check_sentinels(root) + if context.executing_eagerly(): + status.assert_consumed() + status.assert_existing_objects_matched() + status.assert_nontrivial_match() + else: + # When graph building, we haven't read any keys, so we don't know + # whether the restore will be complete. + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_consumed() + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_existing_objects_matched() + with self.assertRaisesRegexp(AssertionError, "not restored"): + status.assert_nontrivial_match() + status.run_restore_ops() + self._check_sentinels(root) + self._set_sentinels(root) + status = object_saver.restore(save_path) + status.initialize_or_restore() + self._check_sentinels(root) + # Check that there is no error when keys are missing from the name-based + # checkpoint. 
+ root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.]) + status = object_saver.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + + def testSaveGraphLoadEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed() + self._check_sentinels(root) + + def testSaveEagerLoadGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.eager_mode(): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.session( + graph=save_graph): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed().run_restore_ops() + self._check_sentinels(root) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/training/tracking/BUILD b/tensorflow/python/training/tracking/BUILD index 36ca3cf4b66..88dfd8eba55 100644 --- a/tensorflow/python/training/tracking/BUILD +++ b/tensorflow/python/training/tracking/BUILD @@ -223,34 +223,19 @@ tf_py_test( "notsan", # b/74395663 ], deps = [ - ":base", - ":graph_view", ":tracking", ":util", - "//tensorflow/python:checkpoint_management", - "//tensorflow/python:constant_op", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", "//tensorflow/python:init_ops", - "//tensorflow/python:pywrap_tensorflow", "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:saver", "//tensorflow/python:session", "//tensorflow/python:state_ops", "//tensorflow/python:template", "//tensorflow/python:training", - "//tensorflow/python:training_util", "//tensorflow/python:variable_scope", - "//tensorflow/python/distribute:mirrored_strategy", - "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", - "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", - "//tensorflow/python/keras:engine", - "//tensorflow/python/keras/layers", - "@absl_py//absl/testing:parameterized", "@six_archive//:six", ], ) diff --git a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py index d4857677046..a5af8e1f876 100644 --- a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py +++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py @@ -17,486 +17,27 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import functools import os import six from tensorflow.python.client import session as session_lib -from tensorflow.python.distribute import mirrored_strategy -from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.eager import def_function from tensorflow.python.eager import test -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes from 
tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import training -from tensorflow.python.keras.layers import core from tensorflow.python.ops import init_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import template from tensorflow.python.ops import variable_scope from tensorflow.python.training import adam -from tensorflow.python.training import checkpoint_management -from tensorflow.python.training import saver as saver_lib -from tensorflow.python.training import training_util -from tensorflow.python.training.tracking import graph_view from tensorflow.python.training.tracking import tracking from tensorflow.python.training.tracking import util as trackable_utils -class NonLayerTrackable(tracking.AutoTrackable): - - def __init__(self): - super(NonLayerTrackable, self).__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) - - -# pylint: disable=not-callable -class MyModel(training.Model): - """A concrete Model for testing.""" - - def __init__(self): - super(MyModel, self).__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. - self._non_layer = NonLayerTrackable() - - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret - - class CheckpointingTests(test.TestCase): - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) - def testNamingWithOptimizer(self): - input_value = constant_op.constant([[3.]]) - model = MyModel() - # A nuisance Model using the same optimizer. Its slot variables should not - # go in the checkpoint, since it is never depended on. - other_model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - optimizer_step = training_util.get_or_create_global_step() - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - if context.executing_eagerly(): - optimizer.minimize( - lambda: model(input_value), - global_step=optimizer_step) - optimizer.minimize( - lambda: other_model(input_value), - global_step=optimizer_step) - else: - train_op = optimizer.minimize( - model(input_value), global_step=optimizer_step) - optimizer.minimize( - other_model(input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - named_variables, serialized_graph, _ = graph_view.ObjectGraphView( - root_trackable).serialize_object_graph() - expected_checkpoint_names = ( - # Created in the root node, so no prefix. 
- "optimizer_step", - "model/_second/kernel", - "model/_named_dense/kernel", - "model/_named_dense/bias", - # non-Layer dependency of the model - "model/_non_layer/a_variable", - # The optimizer creates two non-slot variables - "optimizer/beta1_power", - "optimizer/beta2_power", - # Slot variables - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", - ) - suffix = "/.ATTRIBUTES/VARIABLE_VALUE" - expected_checkpoint_names = [ - name + suffix for name in expected_checkpoint_names] - named_variables = {v.name: v for v in named_variables} - six.assertCountEqual(self, expected_checkpoint_names, - named_variables.keys()) - # Check that we've mapped to the right variable objects (not exhaustive) - self.assertEqual( - "global_step", - named_variables["optimizer_step" + suffix].full_name) - self.assertEqual( - "my_model/dense_1/kernel", - named_variables["model/_second/kernel" + suffix].full_name) - self.assertEqual( - "my_model/dense/kernel", - named_variables["model/_named_dense/kernel" + suffix].full_name) - self.assertEqual( - "beta1_power", - named_variables["optimizer/beta1_power" + suffix].full_name) - self.assertEqual( - "beta2_power", - named_variables["optimizer/beta2_power" + suffix].full_name) - # Spot check the generated protocol buffers. - self.assertEqual("optimizer", - serialized_graph.nodes[0].children[1].local_name) - optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[ - 1].node_id] - self.assertEqual("beta1_power", - optimizer_node.children[0].local_name) - self.assertEqual("beta1_power", - serialized_graph.nodes[optimizer_node.children[0].node_id] - .attributes[0].full_name) - self.assertEqual( - "my_model/dense/kernel", - serialized_graph.nodes[optimizer_node.slot_variables[0] - .original_variable_node_id] - .attributes[0].full_name) - # We strip off the :0 suffix, as variable.name-based saving does. - self.assertEqual( - "my_model/dense/kernel/Adam", - serialized_graph.nodes[optimizer_node.slot_variables[0] - .slot_variable_node_id] - .attributes[0].full_name) - self.assertEqual( - "my_model/dense/kernel/Adam:0", - optimizer.get_slot( - var=model._named_dense.kernel, - name="m").name) - self.assertEqual( - "model/_named_dense/kernel" + suffix, - serialized_graph.nodes[ - optimizer_node.slot_variables[0] - .original_variable_node_id].attributes[0].checkpoint_key) - self.assertEqual("m", optimizer_node.slot_variables[0].slot_name) - self.assertEqual( - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix, - serialized_graph.nodes[ - optimizer_node.slot_variables[0] - .slot_variable_node_id].attributes[0].checkpoint_key) - - @test_util.run_in_graph_and_eager_modes - def testSaveRestore(self): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model) - input_value = constant_op.constant([[3.]]) - if context.executing_eagerly(): - optimizer.minimize( - lambda: model(input_value)) - else: - train_op = optimizer.minimize(model(input_value)) - # TODO(allenl): Make initialization more pleasant when graph building. 
- root_trackable.save_counter # pylint: disable=pointless-statement - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.])) - m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") - self.evaluate(state_ops.assign(m_bias_slot, [1.5])) - save_path = root_trackable.save(file_prefix=prefix) - self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.])) - self.evaluate(state_ops.assign(root_trackable.save_counter, 3)) - optimizer_variables = self.evaluate(optimizer.variables()) - self.evaluate(state_ops.assign(m_bias_slot, [-2.])) - # Immediate restoration - status = root_trackable.restore(save_path=save_path).assert_consumed() - status.run_restore_ops() - self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) - self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) - self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) - if not context.executing_eagerly(): - return # Restore-on-create is only supported when executing eagerly - on_create_model = MyModel() - on_create_optimizer = adam.AdamOptimizer( - 0.001, - # Preserve beta1_power and beta2_power when applying gradients so we can - # test that they've been restored correctly. - beta1=1.0, - beta2=1.0) - on_create_root = trackable_utils.Checkpoint( - optimizer=on_create_optimizer, model=on_create_model) - # Deferred restoration - status = on_create_root.restore(save_path=save_path) - status.assert_nontrivial_match() - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - on_create_model(constant_op.constant([[3.]])) # create variables - self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) - self.assertAllEqual([42.], - self.evaluate( - on_create_model._named_dense.variables[1])) - on_create_m_bias_slot = on_create_optimizer.get_slot( - on_create_model._named_dense.variables[1], "m") - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - # Optimizer slot variables are created when the original variable is - # restored. - self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) - self.assertAllEqual(optimizer_variables[2:], - self.evaluate(on_create_optimizer.variables())) - dummy_var = resource_variable_ops.ResourceVariable([1.]) - on_create_optimizer.minimize(loss=dummy_var.read_value) - status.assert_existing_objects_matched() - status.assert_consumed() - beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators() - self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power)) - self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power)) - - # TODO(allenl): Debug garbage created by this test in python3. - def testDeferredRestorationUsageEager(self): - """An idiomatic eager execution example.""" - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - optimizer_step=training_util.get_or_create_global_step()) - root.restore(checkpoint_management.latest_checkpoint( - checkpoint_directory)) - for _ in range(num_training_steps): - # TODO(allenl): Use a Dataset and serialize/checkpoint it. 
- input_value = constant_op.constant([[3.]]) - optimizer.minimize( - lambda: model(input_value), # pylint: disable=cell-var-from-loop - global_step=root.optimizer_step) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testEagerDistributionStrategy(self): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - def _train_fn(optimizer, model): - input_value = constant_op.constant([[3.]]) - optimizer.minimize( - functools.partial(model, input_value), - global_step=root.optimizer_step) - - strategy = mirrored_strategy.MirroredStrategy() - with strategy.scope(): - for training_continuation in range(3): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, - model=model, - optimizer_step=training_util.get_or_create_global_step()) - root.restore( - checkpoint_management.latest_checkpoint(checkpoint_directory)) - - for _ in range(num_training_steps): - strategy.extended.call_for_each_replica( - functools.partial(_train_fn, optimizer, model)) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testGraphDistributionStrategy(self): - self.skipTest("b/121381184") - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - def _train_fn(optimizer, model): - input_value = constant_op.constant([[3.]]) - return optimizer.minimize( - functools.partial(model, input_value), - global_step=root.optimizer_step) - - for training_continuation in range(3): - with ops.Graph().as_default(): - strategy = mirrored_strategy.MirroredStrategy() - with strategy.scope(): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - optimizer_step=training_util.get_or_create_global_step()) - status = root.restore(checkpoint_management.latest_checkpoint( - checkpoint_directory)) - train_op = strategy.extended.call_for_each_replica( - functools.partial(_train_fn, optimizer, model)) - with self.session() as session: - if training_continuation > 0: - status.assert_consumed() - status.initialize_or_restore() - for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testUsageGraph(self): - """Expected usage when graph building.""" - with context.graph_mode(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with ops.Graph().as_default(): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.CheckpointV1( - optimizer=optimizer, model=model, - global_step=training_util.get_or_create_global_step()) - input_value = constant_op.constant([[3.]]) - train_op = optimizer.minimize( - model(input_value), - global_step=root.global_step) - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - with self.session(graph=ops.get_default_graph()) as session: - status = root.restore(save_path=checkpoint_path) - status.initialize_or_restore(session=session) - if checkpoint_path is None: - self.assertEqual(0, 
training_continuation) - with self.assertRaises(AssertionError): - status.assert_consumed() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - else: - status.assert_consumed() - status.assert_existing_objects_matched() - for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix, session=session) - self.assertEqual((training_continuation + 1) * num_training_steps, - session.run(root.global_step)) - self.assertEqual(training_continuation + 1, - session.run(root.save_counter)) - - @test_util.run_in_graph_and_eager_modes - def testAgnosticUsage(self): - """Graph/eager agnostic usage.""" - # Does create garbage when executing eagerly due to ops.Graph() creation. - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - for training_continuation in range(3): - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - global_step=training_util.get_or_create_global_step()) - manager = checkpoint_management.CheckpointManager( - root, checkpoint_directory, max_to_keep=1) - status = root.restore(save_path=manager.latest_checkpoint) - input_value = constant_op.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - manager.save() - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.global_step)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - - # pylint: disable=cell-var-from-loop - @test_util.run_in_graph_and_eager_modes - def testWithDefun(self): - num_training_steps = 2 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with test_util.device(use_gpu=True): - model = MyModel() - # Don't actually train so we can test variable values - optimizer = adam.AdamOptimizer(0.) 
- root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - global_step=training_util.get_or_create_global_step()) - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - def train_fn(): - @def_function.function - def _call_model(x): - return model(x) - with backprop.GradientTape() as tape: - loss = _call_model(constant_op.constant([[3.]])) - gradients = tape.gradient(loss, model.variables) - return optimizer.apply_gradients(zip(gradients, model.variables), - global_step=root.global_step) - if not context.executing_eagerly(): - train_fn = functools.partial( - self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - if training_continuation > 0: - status.assert_consumed() - self.assertAllClose([[42.]], self.evaluate(model.variables[0])) - else: - self.evaluate(model.variables[0].assign([[42.]])) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.global_step)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - # pylint: enable=cell-var-from-loop - - def _get_checkpoint_name(self, name): - root = tracking.AutoTrackable() - trackable_utils.add_variable( - root, name=name, shape=[1, 2], dtype=dtypes.float64) - (named_variable,), _, _ = trackable_utils._serialize_object_graph( - root, saveables_cache=None) - with ops.name_scope("root/" + named_variable.name): - pass # Make sure we can use this as an op name if we prefix it. - return named_variable.name - - def testAnonymousVarsInInit(self): - - class Model(training.Model): - - def __init__(self): - super(Model, self).__init__() - self.w = resource_variable_ops.ResourceVariable(0.0) - self.b = resource_variable_ops.ResourceVariable(0.0) - self.vars = [self.w, self.b] - - def call(self, x): - return x * self.w + self.b - - with context.eager_mode(): - model = Model() - optimizer = adam.AdamOptimizer(learning_rate=0.05) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - checkpoint = trackable_utils.Checkpoint( - model=model, optimizer=optimizer) - for _ in range(2): - checkpoint.save(checkpoint_prefix) - with backprop.GradientTape() as tape: - loss = (constant_op.constant(1.) - - model(constant_op.constant(1.))) ** 2 - grad = tape.gradient(loss, model.vars) - optimizer.apply_gradients( - [(g, v) for g, v in zip(grad, model.vars)]) - @test_util.run_in_graph_and_eager_modes def testDeferredSlotRestoration(self): checkpoint_directory = self.get_temp_dir() @@ -652,84 +193,6 @@ class CheckpointingTests(test.TestCase): beta1_power, _ = optimizer._get_beta_accumulators() self.assertAllEqual(3., self.evaluate(beta1_power)) - @test_util.run_in_graph_and_eager_modes - def test_initialize_if_not_restoring(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - model=model, # Do not save the optimizer with the checkpoint. 
- global_step=training_util.get_or_create_global_step()) - optimizer_checkpoint = trackable_utils.Checkpoint( - optimizer=optimizer) - - checkpoint_path = checkpoint_management.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - input_value = constant_op.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - self.evaluate([v.initializer for v in optimizer.variables()]) - train_fn() - model_save_path = root.save(file_prefix=checkpoint_prefix) - self.evaluate(optimizer.variables()[0].assign(42.)) - optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) - - # Restore into a graph with the optimizer - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - global_step=training_util.get_or_create_global_step()) - status = root.restore(save_path=model_save_path) - input_value = constant_op.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - train_fn() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - - # Make sure initialization doesn't clobber later restores - with test_util.device(use_gpu=True): - model = MyModel() - optimizer = adam.AdamOptimizer(0.001, beta1=1.0) - root = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, - global_step=training_util.get_or_create_global_step()) - opt_root = trackable_utils.Checkpoint( - optimizer=optimizer) - status = root.restore(save_path=model_save_path) - init_only_optimizer_status = opt_root.restore(save_path=None) - optimizer_status = opt_root.restore(save_path=optimizer_save_path) - input_value = constant_op.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not context.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - optimizer_status.run_restore_ops() - status.initialize_or_restore() - init_only_optimizer_status.initialize_or_restore() - train_fn() - self.assertEqual(42., self.evaluate(optimizer.variables()[0])) - class _ManualScope(tracking.AutoTrackable): @@ -797,132 +260,5 @@ class TemplateTests(test.TestCase): self.assertAllEqual([14.], self.evaluate(var2)) -class CheckpointCompatibilityTests(test.TestCase): - - def _initialized_model(self): - input_value = constant_op.constant([[3.]]) - model = MyModel() - optimizer = adam.AdamOptimizer(0.001) - optimizer_step = training_util.get_or_create_global_step() - root_trackable = trackable_utils.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - train_op = optimizer.minimize( - functools.partial(model, input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. 
- self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, name="m").assign([2.])) - beta1_power, _ = optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m") - .assign([102.])) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m"))) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.assertAllEqual(3., self.evaluate(beta1_power)) - - def _write_name_based_checkpoint(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph) as session: - root = self._initialized_model() - name_saver = saver_lib.Saver() - return name_saver.save( - sess=session, save_path=checkpoint_prefix, - global_step=root.optimizer_step) - - @test_util.run_in_graph_and_eager_modes - def testLoadFromNameBasedSaver(self): - """Save a name-based checkpoint, load it using the object-based API.""" - with test_util.device(use_gpu=True): - save_path = self._write_name_based_checkpoint() - root = self._initialized_model() - self._set_sentinels(root) - with self.assertRaises(AssertionError): - self._check_sentinels(root) - object_saver = trackable_utils.TrackableSaver( - graph_view.ObjectGraphView(root)) - self._set_sentinels(root) - status = object_saver.restore(save_path) - if context.executing_eagerly(): - self._check_sentinels(root) - if context.executing_eagerly(): - status.assert_consumed() - status.assert_existing_objects_matched() - status.assert_nontrivial_match() - else: - # When graph building, we haven't read any keys, so we don't know - # whether the restore will be complete. - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_consumed() - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_existing_objects_matched() - with self.assertRaisesRegexp(AssertionError, "not restored"): - status.assert_nontrivial_match() - status.run_restore_ops() - self._check_sentinels(root) - self._set_sentinels(root) - status = object_saver.restore(save_path) - status.initialize_or_restore() - self._check_sentinels(root) - # Check that there is no error when keys are missing from the name-based - # checkpoint. 
- root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.]) - status = object_saver.restore(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - - def testSaveGraphLoadEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed() - self._check_sentinels(root) - - def testSaveEagerLoadGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.eager_mode(): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.graph_mode(): - save_graph = ops.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed().run_restore_ops() - self._check_sentinels(root) - - if __name__ == "__main__": test.main() From 8d5d9a50d3c8422d26cce81000310ec941b56030 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 10:47:19 -0700 Subject: [PATCH 0179/1390] More compatibility fixes for typing.Generic: * types.new_class is required in some distributions * avoid calling isinstance on some function objects in python 3.6 Required for #40132. PiperOrigin-RevId: 316497932 Change-Id: I3441e8b099d2b10c965d45cd362a4859c7c29bb9 --- tensorflow/python/framework/test_util.py | 2 -- tensorflow/python/util/tf_should_use.py | 18 +++++------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 572e8aac987..2967bb3de84 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -732,8 +732,6 @@ def assert_no_new_tensors(f): """Finds existing Tensors, runs the test, checks for new Tensors.""" def _is_tensorflow_object(obj): - if not hasattr(obj, "__class__"): - return False try: return isinstance(obj, (ops.Tensor, variables.Variable, diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 41c3220f5ca..1671b078fa3 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -21,12 +21,15 @@ import copy import sys import textwrap import traceback -import types + +import six # pylint: disable=unused-import + from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator +# pylint: enable=g-bad-import-order,g-import-not-at-top class _TFShouldUseHelper(object): @@ -151,18 +154,7 @@ def _get_wrapper(x, tf_should_use_helper): tx = copy.deepcopy(type_x) # Prefer using __orig_bases__, which preserve generic type arguments. bases = getattr(tx, '__orig_bases__', tx.__bases__) - - # Use types.new_class when available, which is preferred over plain type in - # some distributions. 
- if sys.version_info >= (3, 5): - def set_body(ns): - ns.update(tx.__dict__) - return ns - - copy_tx = types.new_class(tx.__name__, bases, exec_body=set_body) - else: - copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) - + copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) copy_tx.__init__ = _new__init__ copy_tx.__getattribute__ = _new__getattribute__ copy_tx.mark_used = _new_mark_used From 80b3b4fa9f8ab6565a91273dc38aeabda0258bb6 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Mon, 15 Jun 2020 10:47:31 -0700 Subject: [PATCH 0180/1390] [XLA] LHLO-to-Ploop: explicitly construct ValueRange when building scf::ParallelOp The builder API is about to change in LLVM, which would lead to ambiguity in overload resolution in these cases. Proactively fix the issue by constructing ValueRange of steps explicitly before it is passed to the builder function. PiperOrigin-RevId: 316497983 Change-Id: Ibb5dfe006071ab9513634c9613f43ce60a0efd22 --- .../mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index 734a75a4307..b3112d49103 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -261,7 +261,7 @@ class ReduceOpConverter : public OpConversionPattern { rewriter->setInsertionPointToStart(outer.getBody()); } scf::ParallelOp inner = rewriter->create( - loc, reduce_lower, reduce_upper, reduce_step, init_value); + loc, reduce_lower, reduce_upper, reduce_step, ValueRange(init_value)); Value reduction_result = *inner.getResults().begin(); SmallVector out_indices; @@ -406,7 +406,7 @@ class ReduceWindowOpConverter rewriter->create(loc, window_dim.getSExtValue())); } auto window_loop = rewriter->create( - loc, window_lower, window_upper, window_step, init_value); + loc, window_lower, window_upper, window_step, ValueRange(init_value)); Value reduction_result = *window_loop.getResults().begin(); auto output_ivs = output_loop.getInductionVars(); From 67487368bbc3f35a87314bb54ec5e12a6e3d7f93 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 15 Jun 2020 10:48:56 -0700 Subject: [PATCH 0181/1390] Preserve FunctionDef.arg_attr in GrapplerFunctionItem. PiperOrigin-RevId: 316498288 Change-Id: I6c3288c725bb281cca17256146c9ec3fd8cec5f0 --- .../core/framework/graph_to_functiondef.cc | 4 ++- tensorflow/core/grappler/utils/functions.cc | 27 ++++++++++++++++--- tensorflow/core/grappler/utils/functions.h | 5 ++++ .../core/grappler/utils/functions_test.cc | 17 ++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc index bbd70151849..e825aa722b5 100644 --- a/tensorflow/core/framework/graph_to_functiondef.cc +++ b/tensorflow/core/framework/graph_to_functiondef.cc @@ -434,9 +434,11 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, // _Arg/Placeholder nodes. if (absl::StartsWith(attr.first, "_")) { arg_attrs.mutable_attr()->insert(attr); - } else if (attr.first == "shape") { + } else if (attr.first == "shape" && argdef->type() != DT_RESOURCE) { // Preserve known shapes by moving them to the _output_shapes list. // The _Arg shape function knows how to extract them from there. 
+ // Don't preserve the shape of a resource arg node, which is a scalar + // resource handle. AttrValue value; *(value.mutable_list()->add_shape()) = attr.second.shape(); arg_attrs.mutable_attr()->insert({"_output_shapes", value}); diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 780e3c7e3f2..a83fb824cc3 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -38,12 +38,14 @@ namespace grappler { GrapplerFunctionItem::GrapplerFunctionItem( string func_name, string description, AttrSlice func_attr, + std::vector arg_attr, std::vector input_args, std::vector output_args, std::vector control_outputs, const int graph_def_version, const bool is_stateful, GraphDef&& function_body) : description_(std::move(description)), func_attr_(func_attr), + arg_attr_(std::move(arg_attr)), input_args_(std::move(input_args)), output_args_(std::move(output_args)), control_outputs_(std::move(control_outputs)), @@ -108,6 +110,11 @@ const std::size_t GrapplerFunctionItem::control_output_size() const { const AttrSlice& GrapplerFunctionItem::func_attr() const { return func_attr_; } +const std::vector& +GrapplerFunctionItem::arg_attr() const { + return arg_attr_; +} + const GraphDef& GrapplerFunctionItem::function_body() const { return graph; } GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; } @@ -278,12 +285,17 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func, control_outputs.push_back({control_ret.first, control_ret.second}); } + std::vector arg_attr(inputs.size(), nullptr); + for (const auto& attr : func.arg_attr()) { + arg_attr.at(attr.first) = &attr.second; + } + *item = GrapplerFunctionItem( /*func_name=*/signature.name(), /*description=*/signature.description(), - /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs), - std::move(outputs), std::move(control_outputs), graph_def_version, - signature.is_stateful(), std::move(function_body)); + /*func_attr=*/AttrSlice(&func.attr()), std::move(arg_attr), + std::move(inputs), std::move(outputs), std::move(control_outputs), + graph_def_version, signature.is_stateful(), std::move(function_body)); return Status::OK(); } @@ -330,6 +342,7 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_index, } item->input_args_.erase(item->input_args_.begin() + input_index); + item->arg_attr_.erase(item->arg_attr_.begin() + input_index); return Status::OK(); } @@ -566,6 +579,14 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item, (*func->mutable_attr())[attr_name] = attr_value; } + // Copy function arg attributes. + for (int i = 0; i < item.arg_attr().size(); ++i) { + const auto* attr = item.arg_attr().at(i); + if (attr != nullptr) { + (*func->mutable_arg_attr())[i] = *attr; + } + } + // Copy function body nodes to the FunctionDef and update input format for (const NodeDef& func_node : item.function_body().node()) { // Skip original `_Arg` and `_Retval` nodes. 
If node was converted to some diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h index b03b89af2ab..2f1fd5d2ed6 100644 --- a/tensorflow/core/grappler/utils/functions.h +++ b/tensorflow/core/grappler/utils/functions.h @@ -76,6 +76,7 @@ class GrapplerFunctionItem : public GrapplerItem { const std::size_t control_output_size() const; const AttrSlice& func_attr() const; + const std::vector& arg_attr() const; const GraphDef& function_body() const; GraphDef& mutable_function_body(); @@ -95,6 +96,7 @@ class GrapplerFunctionItem : public GrapplerItem { GrapplerFunctionItem(string func_name, string description, AttrSlice func_attr, + std::vector arg_attr, std::vector input_args, std::vector output_args, std::vector control_outputs, @@ -105,6 +107,9 @@ class GrapplerFunctionItem : public GrapplerItem { AttrSlice func_attr_; // Attributes specific to function definition that // produced this item (FuncDef.attr field). + // Attributes of function arguments + std::vector arg_attr_; + std::vector input_args_; std::vector output_args_; std::vector control_outputs_; diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc index 8cc938ec845..66320d60f27 100644 --- a/tensorflow/core/grappler/utils/functions_test.cc +++ b/tensorflow/core/grappler/utils/functions_test.cc @@ -523,6 +523,14 @@ TEST_F(FunctionsTest, MakeFunctionDef) { {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}}, }); + // Add an attribute to _Arg 0; + const uint32 arg_index = 0; + const std::pair arg_attr_key_and_value = {"_arg_attr", "abc"}; + FunctionDef::ArgAttrs arg_attr; + (*arg_attr.mutable_attr())[arg_attr_key_and_value.first].set_s( + arg_attr_key_and_value.second); + (*func.mutable_arg_attr())[arg_index] = arg_attr; + protobuf::Map func_instantiation_attr; func_instantiation_attr["T"].set_type(DT_FLOAT); FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary()); @@ -541,6 +549,15 @@ TEST_F(FunctionsTest, MakeFunctionDef) { EXPECT_EQ("y", specialized.signature().output_arg(0).name()); EXPECT_EQ(DT_FLOAT, specialized.signature().output_arg(0).type()); + EXPECT_EQ(specialized.arg_attr().size(), 1); + EXPECT_EQ(specialized.arg_attr().at(arg_index).attr().size(), 1); + EXPECT_EQ(specialized.arg_attr() + .at(arg_index) + .attr() + .at(arg_attr_key_and_value.first) + .s(), + arg_attr_key_and_value.second); + // Function body specialized for instantiation types. int count = 0; for (const NodeDef &node : specialized.node_def()) { From b7d66ef92658ffcf7fffa7d606293bfc314d82d2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 10:50:51 -0700 Subject: [PATCH 0182/1390] Add unit test that runs TPU Embedding layouter with MLIR bridge. 
PiperOrigin-RevId: 316498711 Change-Id: I90365d953f7469464cd81a306ddaf50db580ba93 --- tensorflow/python/tpu/tpu_embedding_v2_test.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py index a8b21480919..0c257010f6a 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2_test.py +++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py @@ -34,6 +34,7 @@ from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver from tensorflow.python.eager import backprop from tensorflow.python.eager import def_function from tensorflow.python.eager import remote +from tensorflow.python.framework import config from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -719,7 +720,11 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): self.assertAllClose(golden, weights0) - def test_enqueue_with_outside_compilation(self): + @parameterized.parameters([True, False]) + def test_enqueue_with_outside_compilation(self, use_mlir): + if use_mlir: + config.enable_mlir_bridge() + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) @@ -749,7 +754,11 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): self.assertAllClose(activations_oc0, activations0) - def test_enqueue_with_outside_compilation_in_control_flow(self): + @parameterized.parameters(True, False) + def test_enqueue_with_outside_compilation_in_control_flow(self, use_mlir): + if use_mlir: + config.enable_mlir_bridge() + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) From ba658404f279df1b2c86ade146fbdccffcf68f8f Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 15 Jun 2020 11:10:33 -0700 Subject: [PATCH 0183/1390] [TF/XLA] Only force retracing for non-unique XLA context ID for TPUReplicatedContext Fixes https://github.com/tensorflow/tensorflow/issues/39872 PiperOrigin-RevId: 316503485 Change-Id: Ice63983fcdf2fdedca60a9054f3b76ac60e1ff15 --- .../python/eager/def_function_xla_jit_test.py | 19 ------------------- tensorflow/python/eager/function.py | 7 +++---- tensorflow/python/ops/control_flow_ops.py | 5 ----- tensorflow/python/tpu/tpu.py | 6 ------ 4 files changed, 3 insertions(+), 34 deletions(-) diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index 78d44a81b0b..b63a3b434d4 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -29,7 +29,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import tensor_array_ops from tensorflow.python.platform import test @@ -386,24 +385,6 @@ class DefFunctionTest(test.TestCase): f64_input = constant_op.constant([1.1, 2.2, 3.3], dtype=dtypes.float64) self.assertAllClose([1.1, 3.3, 6.6], f(f64_input)) - def testNoExcessiveRetracing(self): - inner_retracings = 0 - - 
@def_function.function(experimental_compile=True) - def inner(a, b): - nonlocal inner_retracings - inner_retracings += 1 - return a * b + a - - def outer(a, b): - return inner(a, b) - - func_input = random_ops.random_normal([10, 10]) - for _ in range(2): - def_function.function(outer)(func_input, func_input) - - self.assertEqual(inner_retracings, 1) - if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index c02318cb814..a40eaf886b3 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -2981,10 +2981,9 @@ class Function(object): if not executing_eagerly: # We want to force function retracing for each different # XLAControlFlowContext, so add `xla_context_id` to the cache key. - xla_context = _enclosing_xla_context() - if xla_context is not None and \ - xla_context.RequiresUniqueFunctionRetracing(): - xla_context_id = id(xla_context) + tpu_context = _enclosing_xla_context() + if tpu_context is not None: + xla_context_id = id(tpu_context) with ops.init_scope(): # The graph, or whether we're executing eagerly, should be a part of the diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 748f842a9e0..3398308d42e 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -3682,11 +3682,6 @@ class XLAControlFlowContext(ControlFlowContext): def AddValue(self, x): return x - def RequiresUniqueFunctionRetracing(self): - """Returns whether the tf.function should be retraced if the context changes. - """ - return False - def from_control_flow_context_def(context_def, import_scope=None): """Deserializes `context_def` into the appropriate ControlFlowContext. diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index ce3aaa8a058..28eba69b7da 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -639,12 +639,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): def GetControlPivot(self): return self._pivot - def RequiresUniqueFunctionRetracing(self): - # More context: b/158152827. TPU stack uses the TPUReplicateContext to - # create replicated variable handles and cluster TPU computations, thus we - # always retrace a tf.function when the wrapped TPUReplicateContext changes. - return True - class OutsideCompilationV2Context(control_flow_ops.ControlFlowContext): """The context for outside compilation in Tensorflow 2.0. From d1a34523f44b853998bc2740d5b59a472a12eb86 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 11:21:03 -0700 Subject: [PATCH 0184/1390] Added OpenCL versions 2.1/2.2/3.0. 
PiperOrigin-RevId: 316505978 Change-Id: I5e35dc8e625aef2feb8f59b10a6a60d175a08314 --- tensorflow/lite/delegates/gpu/cl/cl_device.cc | 18 +++++++++++++++++- tensorflow/lite/delegates/gpu/cl/cl_device.h | 10 +++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.cc b/tensorflow/lite/delegates/gpu/cl/cl_device.cc index 13e299b181b..aea81d5e659 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.cc +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.cc @@ -105,8 +105,18 @@ OpenCLVersion ParseCLVersion(const std::string& version) { } else { return OpenCLVersion::CL_1_0; } + } else if (major == 2) { + if (minor == 2) { + return OpenCLVersion::CL_2_2; + } else if (minor == 1) { + return OpenCLVersion::CL_2_1; + } else { + return OpenCLVersion::CL_2_0; + } + } else if (major == 3) { + return OpenCLVersion::CL_3_0; } else { - return OpenCLVersion::CL_2_0; + return OpenCLVersion::CL_1_0; } } @@ -227,6 +237,12 @@ std::string OpenCLVersionToString(OpenCLVersion version) { return "1.2"; case OpenCLVersion::CL_2_0: return "2.0"; + case OpenCLVersion::CL_2_1: + return "2.1"; + case OpenCLVersion::CL_2_2: + return "2.2"; + case OpenCLVersion::CL_3_0: + return "3.0"; } } diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.h b/tensorflow/lite/delegates/gpu/cl/cl_device.h index 4fd683b78ff..1df16aa3bad 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.h @@ -31,7 +31,15 @@ namespace cl { enum class Vendor { QUALCOMM, MALI, POWERVR, NVIDIA, AMD, INTEL, UNKNOWN }; std::string VendorToString(Vendor v); -enum class OpenCLVersion { CL_1_0, CL_1_1, CL_1_2, CL_2_0 }; +enum class OpenCLVersion { + CL_1_0, + CL_1_1, + CL_1_2, + CL_2_0, + CL_2_1, + CL_2_2, + CL_3_0 +}; std::string OpenCLVersionToString(OpenCLVersion version); // for use only in cl_device.cc, but putted here to make tests From 555be8943e70a775f4617a342587857904a9a7c1 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 15 Jun 2020 11:27:37 -0700 Subject: [PATCH 0185/1390] [XLA] [NFC] Reduce duplication between Executable subclasses Factor out the code to mark owning arguments as to-be-released. PiperOrigin-RevId: 316507574 Change-Id: I22fce6e69d0933baa25db09e25bda4037beceb17 --- .../compiler/xla/service/cpu/cpu_executable.cc | 11 +---------- tensorflow/compiler/xla/service/executable.cc | 12 ++++++++++++ tensorflow/compiler/xla/service/executable.h | 9 +++++++++ .../compiler/xla/service/gpu/gpu_executable.cc | 10 +--------- .../xla/service/interpreter/executable_base.cc | 9 +-------- 5 files changed, 24 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index e0c8adcbbe1..4552d7b5ba9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -350,16 +350,7 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( std::move(buffers)), hlo_execution_profile}); - // TODO(cheshire): Duplication with other executables. 
- for (ExecutionInput& argument : arguments) { - for (auto& index_buffer : *argument.MutableBuffers()) { - absl::optional maybe_owning_buffer = - index_buffer.second.Release(); - if (maybe_owning_buffer) { - result.AddToBeReleased(std::move(*maybe_owning_buffer)); - } - } - } + MarkToBeReleasedArguments(absl::MakeSpan(arguments), result); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 4f210442005..ebf7cc440dd 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -258,4 +258,16 @@ StatusOr Executable::ExecuteAsyncOnStreamWrapper( int64 Executable::SizeOfGeneratedCodeInBytes() const { return -1; } +void Executable::MarkToBeReleasedArguments(absl::Span arguments, + ExecutionOutput& result) { + for (ExecutionInput& argument : arguments) { + for (auto& index_buffer : *argument.MutableBuffers()) { + if (absl::optional maybe_owning_buffer = + index_buffer.second.Release()) { + result.AddToBeReleased(std::move(*maybe_owning_buffer)); + } + } + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 49614c1af00..2c979662d24 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -331,6 +331,15 @@ class Executable { bool dumping_snapshot() const { return hlo_proto_ != nullptr; } HloProto const* hlo_proto() const { return hlo_proto_.get(); } + // Gather unused but donated buffers, return them to the caller of this API. + // We don't free buffers inside this function since the caller could have + // different preferences for buffer deallocation. For example, in TensorFlow, + // buffers are mostly efficiently deallocated as soon as a program has been + // launched. However, in XRT, the buffers are expected to be deallocated after + // the program has finished since XRT doesn't support async deallocation. + void MarkToBeReleasedArguments(absl::Span arguments, + ExecutionOutput& result); + protected: // HloModule this was compiled from. BufferAssignment keeps pointers to // HloInstructions owned by the HloModule so we need to keep the HloModule diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index c8b11cab31a..520bbedbaeb 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -541,15 +541,7 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( buffer_allocations.TearDown(buffers_in_result, assignment_.get())); // Free allocations for arguments. 
- for (ExecutionInput& argument : arguments) { - for (auto& index_buffer : *argument.MutableBuffers()) { - if (absl::optional owning = - index_buffer.second.Release()) { - result.AddToBeReleased(std::move(*owning)); - } - } - } - + MarkToBeReleasedArguments(absl::MakeSpan(arguments), result); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc index 5850cbf005b..4b020ea2d32 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -122,14 +122,7 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( const double nanoseconds = (end_micros - start_micros) * 1000.0; profile->set_compute_time_ns(std::max(nanoseconds, 1.0)); } - for (auto& argument : arguments) { - for (auto& index_buffer : *argument.MutableBuffers()) { - auto maybe_owning_buffer = index_buffer.second.Release(); - if (maybe_owning_buffer) { - result.AddToBeReleased(std::move(*maybe_owning_buffer)); - } - } - } + MarkToBeReleasedArguments(absl::MakeSpan(arguments), result); return std::move(result); } From 49ba207bdaf810293efeba4d0ce1b1bce56e5804 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 15 Jun 2020 11:40:07 -0700 Subject: [PATCH 0186/1390] [TF/XLA] Remove the wrapping function XlaTensor::RefCountIsOne PiperOrigin-RevId: 316510507 Change-Id: I7bbee7208348a1ae43b11671ecd95fe1e104f280 --- tensorflow/compiler/jit/xla_tensor.cc | 4 ---- tensorflow/compiler/jit/xla_tensor.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index e56d2714b0a..6bad1b703b6 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -27,10 +27,6 @@ namespace tensorflow { return xla_tensor; } -/*static*/ bool XlaTensor::RefCountIsOne(const Tensor& tensor) { - return tensor.RefCountIsOne(); -} - /*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor( const Tensor& tensor) { const XlaTensor* xla_tensor = FromTensor(&tensor); diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 7f7d97e3b3f..a6de405ec9e 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -39,8 +39,6 @@ class XlaTensor { // fails. static XlaTensor* FromTensor(const Tensor* tensor); - static bool RefCountIsOne(const Tensor& tensor); - // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in // which case the returned value is shaped_buffer()->root_buffer(), or a // normal Tensor in which case the returned value is From 1569e9d09748b452941fc54f3095cd2ec7c309a3 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Mon, 15 Jun 2020 11:45:04 -0700 Subject: [PATCH 0187/1390] Re-enable Windows build after updating RBE images with estimator nightly pip fix. PiperOrigin-RevId: 316511566 Change-Id: Ie6e26911d080301439bc59dc7cd03b2e33ed3d45 --- tensorflow/core/platform/BUILD | 1 - tensorflow/python/distribute/BUILD | 1 - 2 files changed, 2 deletions(-) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 30734a840d1..70bb8a89417 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -386,7 +386,6 @@ py_test( name = "ram_file_system_test", srcs = ["ram_file_system_test.py"], python_version = "PY3", - tags = ["no_windows"], # TODO(b/156428279): reenable this test once the image is updated. 
deps = [ "//tensorflow:tensorflow_py", ], diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 3d5ae4f4215..77ef98d1cb7 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1695,7 +1695,6 @@ cuda_py_test( python_version = "PY3", tags = [ "multi_and_single_gpu", - "no_windows", # TODO(b/156428279): reenable this test once the image is updated. ], # b/141096229: Non-atomic AssignAdd xla_enable_strict_auto_jit = False, From 91cd70ddc1dd49f1d316cfd6ae0bba6413039d2d Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Mon, 15 Jun 2020 11:45:34 -0700 Subject: [PATCH 0188/1390] Update post_training_quantization.md: Update TFLiteConverter signature, remove usage of deprecated tf.lite.constants. PiperOrigin-RevId: 316511691 Change-Id: Ic7cae9d887af7b8641e71c050e29e38ab36f6a60 --- .../performance/post_training_quantization.md | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index af7d9dbf02d..1a579430656 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -15,11 +15,11 @@ summary table of the choices and the benefits they provide: | Technique | Benefits | Hardware | | -------------------- | ------------------------- | ---------------- | -| Dynamic range | 4x smaller, 2-3x speedup | CPU | +| Dynamic range | 4x smaller, 2x-3x speedup | CPU | : quantization : : : | Full integer | 4x smaller, 3x+ speedup | CPU, Edge TPU, | : quantization : : Microcontrollers : -| Float16 quantization | 2x smaller, potential GPU | CPU, GPU | +| Float16 quantization | 2x smaller, GPU | CPU, GPU | : : acceleration : : The following decision tree can help determine which post-training quantization @@ -34,7 +34,7 @@ weights from floating point to integer, which has 8-bits of precision:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 tflite_quant_model = converter.convert()
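# Optional sanity check -- an illustrative sketch, not required for conversion:
# run the quantized flatbuffer with the TFLite interpreter on random data.
# The float32 input dtype is an assumption about the model being converted.
import numpy as np
interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
dummy = np.random.random_sample(tuple(input_details["shape"])).astype(np.float32)
interpreter.set_tensor(input_details["index"], dummy)
interpreter.invoke()
print(interpreter.get_tensor(interpreter.get_output_details()[0]["index"]))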
 </pre>
@@ -48,16 +48,7 @@ activations based on their range to 8-bits and perform computations with 8-bit weights and activations. This optimization provides latencies close to fully fixed-point inference. However, the outputs are still stored using floating point so that the speedup with dynamic-range ops is less than a full fixed-point -computation. Dynamic-range ops are available for the most compute-intensive -operators in a network: - -* `tf.keras.layers.Dense` -* `tf.keras.layers.Conv2D` -* `tf.keras.layers.LSTM` -* `tf.nn.embedding_lookup` -* `tf.compat.v1.nn.rnn_cell.BasicRNNCell` -* `tf.compat.v1.nn.bidirectional_dynamic_rnn` -* `tf.compat.v1.nn.dynamic_rnn` +computation. ### Full integer quantization @@ -77,7 +68,7 @@ the following steps:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
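    # Sketch of how the generator body typically continues: yield one sample per
    # step, wrapped in a list, as the converter expects. "calibration_data" is a
    # placeholder for a few hundred preprocessed float32 numpy arrays (batch dim
    # included), indexed here by the loop counter "_"; substitute your own source.
    yield [calibration_data[_]]
converter.representative_dataset = representative_dataset_gen
tflite_quant_model = converter.convert()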
@@ -105,7 +96,7 @@ the following steps:
 
 <pre>
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
@@ -129,9 +120,9 @@ quantization of weights, use the following steps:
 
 <pre>
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
+converter.target_spec.supported_types = [tf.float16]
 tflite_quant_model = converter.convert()
 </pre>
From a975ef0f5c81d913d7bb1a3a2a2e84fe1f255a71 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 15 Jun 2020 12:02:13 -0700 Subject: [PATCH 0189/1390] [XLA] Support aliasing in XLA:Python bindings PiperOrigin-RevId: 316515355 Change-Id: I9dae67c98209188ddacbae3b83ccfabceec9ea23 --- tensorflow/compiler/xla/pjrt/pjrt_client.cc | 54 ++++++++++++++++--- .../compiler/xla/python/xla_client_test.py | 5 -- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index 56370fa23a9..c1b433845b2 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -202,16 +202,58 @@ StatusOr PjRtClient::GetDefaultDeviceAssignment( StatusOr> PjRtClient::GetParametersThatMustBeDonated( const LocalExecutable& executable, bool tuple_inputs) const { - // TODO(b/149489114) support buffer donation on CPU/GPU when XLA supports it. + HloComputation* computation = + executable.executable()->module().entry_computation(); + int number_of_parameters = [&]() -> int { + if (tuple_inputs) { + CHECK_EQ(computation->num_parameters(), 1); + const Shape& input_tuple_shape = + computation->parameter_instruction(0)->shape(); + CHECK(input_tuple_shape.IsTuple()); + return input_tuple_shape.tuple_shapes_size(); + } else { + return computation->num_parameters(); + } + }(); + // If any buffer in a parameter is aliased we will donate the entire input + // parameter. + absl::flat_hash_set parameters_to_donate; const HloInputOutputAliasConfig& config = executable.executable()->module().input_output_alias_config(); TF_RETURN_IF_ERROR(config.ForEachAliasWithStatus( - [](const ShapeIndex& output_index, - const HloInputOutputAliasConfig::Alias& alias) { - return InvalidArgument( - "Buffer aliasing is not supported by XLA for non-TPU backends."); + [&](const ShapeIndex& output_index, + const HloInputOutputAliasConfig::Alias& alias) { + if (tuple_inputs) { + if (alias.parameter_number != 0) { + return InvalidArgument( + "Unexpected parameter number %d in alias config with tupled " + "inputs", + alias.parameter_number); + } + const ShapeIndex& index = alias.parameter_index; + if (!index.empty()) { + int this_parameter = index.data()[0]; + if (this_parameter >= number_of_parameters) { + return InvalidArgument( + "Unexpected parameter index %s in alias config with tupled " + "inputs and %d parameters", + index.ToString(), number_of_parameters); + } + parameters_to_donate.insert(this_parameter); + } + } else { + int this_parameter = alias.parameter_number; + if (this_parameter >= number_of_parameters) { + return InvalidArgument( + "Unexpected parameter number %d in alias config without tupled " + "inputs and %d parameters", + this_parameter, number_of_parameters); + } + parameters_to_donate.insert(this_parameter); + } + return Status::OK(); })); - return absl::flat_hash_set(); + return parameters_to_donate; } namespace { diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 0fc0bcae954..6a316044734 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -1909,11 +1909,6 @@ def TestFactory(xla_backend, cloud_tpu=False): out = ops.Add(p1, p2) c.setup_alias([], 0, []) c = c.build(out) - if self.backend.platform != "tpu": - with self.assertRaisesRegex( - RuntimeError, "Buffer aliasing is not supported " - "by XLA for non-TPU backends"): - self.backend.compile(c) 
tests.append(AliasTest) From 32c25efaa929b07054cde6a0b7cf6638cd11ba79 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Mon, 15 Jun 2020 12:28:00 -0700 Subject: [PATCH 0190/1390] [MLIR/GPU] Add fix_signature knob to turn off FixKernelFunctionSignatures. PiperOrigin-RevId: 316520783 Change-Id: I56de509059bff1e3eb87f5a618e68362ee7c6e66 --- .../mlir/tools/kernel_gen/cubin_creator.cc | 10 ++++++--- .../xla/service/mlir_gpu/kernel_lowering.cc | 22 +++++++++---------- .../xla/service/mlir_gpu/kernel_lowering.h | 11 +++++++--- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index d1e5c09cf59..30b60e8079f 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -237,9 +237,13 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); - TF_RETURN_IF_ERROR( - xla::mlir_gpu::LowerLHLOToGPU(module.get(), tile_sizes, unroll_factors, - /*collapseParallelLoops=*/false)); + { + xla::mlir_gpu::LowerLHLOToGPUOptions options; + options.tile_sizes = tile_sizes; + options.unroll_factors = unroll_factors; + options.collapse_parallel_loops = false; + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); + } TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); // TODO(b/156985522): Figure out why we get a segfault when generating Tanh // with 'same_shape' containing {0, 1}. We would also get the crash if we diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 4645b084eb6..f1e01bba27e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -387,10 +387,7 @@ struct ParallelLoopCollapsingToFirstDim }; } // namespace -Status LowerLHLOToGPU(mlir::ModuleOp module, - llvm::ArrayRef tile_sizes, - llvm::ArrayRef unroll_factors, - bool collapseParallelLoops) { +Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { mlir::PassManager pm(module.getContext()); applyPassManagerCLOptions(pm); @@ -399,14 +396,15 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, // needed. llvm::SmallVector tiling_for_unrolling; llvm::SmallVector as_int64; - if (!unroll_factors.empty()) { - tiling_for_unrolling.reserve(tile_sizes.size()); - for (auto pair : llvm::zip(tile_sizes, unroll_factors)) { + if (!options.unroll_factors.empty()) { + tiling_for_unrolling.reserve(options.tile_sizes.size()); + for (auto pair : llvm::zip(options.tile_sizes, options.unroll_factors)) { tiling_for_unrolling.push_back(std::get<0>(pair) * std::get<1>(pair)); as_int64.push_back(std::get<1>(pair)); } } else { - tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); + tiling_for_unrolling.append(options.tile_sizes.begin(), + options.tile_sizes.end()); } // Legalize from HLO to LHLO. @@ -441,11 +439,11 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, pm.addPass(absl::make_unique()); // Remove now unused temporary buffers. pm.addPass(absl::make_unique()); - if (!unroll_factors.empty()) { + if (!options.unroll_factors.empty()) { pm.addPass(::mlir::createParallelLoopTilingPass(as_int64)); } // Project all loop dimensions to X if necessary. 
- if (collapseParallelLoops) { + if (options.collapse_parallel_loops) { pm.addPass(absl::make_unique()); } // Some basic cleanup. @@ -464,7 +462,9 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's // signature - pm.addPass(absl::make_unique()); + if (options.fix_signature) { + pm.addPass(absl::make_unique()); + } if (failed(pm.run(module))) { return InternalError("Lowering to GPU kernels failed."); } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index ab045808477..7b5d5c35c05 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -23,10 +23,15 @@ limitations under the License. namespace xla { namespace mlir_gpu { +struct LowerLHLOToGPUOptions { + llvm::ArrayRef tile_sizes = {16, 64}; + llvm::ArrayRef unroll_factors = {}; + bool collapse_parallel_loops = true; + bool fix_signature = true; +}; + Status LowerLHLOToGPU(mlir::ModuleOp module, - llvm::ArrayRef tile_sizes = {16, 64}, - llvm::ArrayRef unroll_factors = {}, - bool collapseParallelLoops = true); + LowerLHLOToGPUOptions options = {}); Status LowerKernelBodiesToNVVM(mlir::ModuleOp module); From 09cf51da59d77ae14c44116473e247cd88347ac7 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 15 Jun 2020 12:35:00 -0700 Subject: [PATCH 0191/1390] Fix switch statement for CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING. PiperOrigin-RevId: 316522188 Change-Id: I6557506f90d4bf6a91e97b65abd494d58769e3a3 --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 2dbd2c58ebd..be18c989861 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -254,10 +254,8 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo( // Based on cudnn.h, the following is not implemented. // case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD: case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED: + case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING: return algo; - // Produces incorrect results for some shapes. Disabled for now, see - // NVIDIA bug 2072856. TODO(csigg): Only disable for subset of shapes. - // case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING: default: LOG(FATAL) << "Unsupported Cudnn convolution backward algorithm for filter: " From 08cbfe4090c322267c69dc67b6e070bc718914cb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 12:37:32 -0700 Subject: [PATCH 0192/1390] Fixed error from printing out of IndexedSlices objects. 
PiperOrigin-RevId: 316522660 Change-Id: Ib7cc467ec2e9f2d465aa51db441574b9aca4f36e --- tensorflow/python/framework/indexed_slices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/indexed_slices.py b/tensorflow/python/framework/indexed_slices.py index a2746d22650..6ddf9410fd7 100644 --- a/tensorflow/python/framework/indexed_slices.py +++ b/tensorflow/python/framework/indexed_slices.py @@ -147,7 +147,7 @@ class IndexedSlices(internal.NativeObject, composite_tensor.CompositeTensor): return "IndexedSlices(indices=%s, values=%s%s)" % ( self._indices, self._values, (", dense_shape=%s" % - self._dense_shape) if self._dense_shape is not None else "") + (self._dense_shape,)) if self._dense_shape is not None else "") def __neg__(self): return IndexedSlices(-self.values, self.indices, self.dense_shape) From 51373058dec434e10113da90e0a620e29715b36e Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 15 Jun 2020 12:43:34 -0700 Subject: [PATCH 0193/1390] [XLA] Do not needlessly store wrapped Tensor and ScopedShapedBuffer inside XlaTensor on a heap Use absl::optional instead of std::unique_ptr to store them inside the class instead. PiperOrigin-RevId: 316523861 Change-Id: I8f54f64e5661a877b7c9807465983d8132920474 --- tensorflow/compiler/jit/xla_tensor.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index a6de405ec9e..dc358760534 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -55,7 +55,7 @@ class XlaTensor { // manage the memory for these tensors a ShapedBuffer may be required. // Return true if this XlaTensor contains a ShapedBuffer. - bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; } + bool has_shaped_buffer() const { return shaped_buffer_.has_value(); } // Return the contained ShapedBuffer. // REQUIRES: has_shaped_buffer() const xla::ShapedBuffer& shaped_buffer() const { @@ -68,8 +68,7 @@ class XlaTensor { } // Mutates the XlaTensor to set the ShapedBuffer. void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { - shaped_buffer_ = - absl::make_unique(std::move(shaped_buffer)); + shaped_buffer_ = std::move(shaped_buffer); } // Some tensors on the device may have known values on the host. We use these @@ -77,14 +76,12 @@ class XlaTensor { // host value already. // Return true if this XlaTensor contains a host tensor. - bool has_host_tensor() const { return host_tensor_ != nullptr; } + bool has_host_tensor() const { return host_tensor_.has_value(); } // Return the contained host tensor. // REQUIRES: has_host_tensor() const Tensor& host_tensor() const { return *host_tensor_; } // Sets the contained host tensor. - void set_host_tensor(const Tensor& tensor) { - host_tensor_.reset(new Tensor(tensor)); - } + void set_host_tensor(const Tensor& tensor) { host_tensor_.emplace(tensor); } // Adds synchronization events to 'stream' that wait for this tensor to be // defined on 'stream'. Does nothing if the tensor is already defined on that @@ -111,9 +108,9 @@ class XlaTensor { private: // The optional contained ShapedBuffer. - std::unique_ptr shaped_buffer_; + absl::optional shaped_buffer_; // An optional host tensor value. - std::unique_ptr host_tensor_; + absl::optional host_tensor_; // An optional event that is triggered when the tensor's content has been // defined. If this event is nullptr, it is assumed that the tensor's content // is always defined. 
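The `IndexedSlices.__repr__` fix in patch 0192 above wraps `dense_shape` in a one-element tuple before `%`-formatting, the usual idiom for formatting a value that the `%` operator might otherwise try to unpack. A standalone illustration (the values below are made up, not taken from TensorFlow):

value = (2, 3)  # stands in for any object the % operator would treat as an argument tuple
# "dense_shape=%s" % value raises TypeError ("not all arguments converted during
# string formatting") because the two tuple items are matched against one %s.
print("dense_shape=%s" % (value,))  # wrapping in a 1-tuple prints: dense_shape=(2, 3)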
From 540852285d931813f599fd66811d8d3413b0da4c Mon Sep 17 00:00:00 2001 From: Geoffrey Martin-Noble Date: Mon, 15 Jun 2020 12:51:16 -0700 Subject: [PATCH 0194/1390] Remove forwarding aliases from LLVM Bazel BUILD file All uses have been migrated, so these are no longer necessary. PiperOrigin-RevId: 316525401 Change-Id: I2f5c886f8580bcc7fe936e4f4ea7c440d6c20635 --- .../xla/service/gpu/llvm_gpu_backend/BUILD | 2 +- third_party/llvm/llvm.autogenerated.BUILD | 1012 +---------------- 2 files changed, 48 insertions(+), 966 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index 5a3dc91d48f..c3ef02a04f2 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -39,6 +39,7 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@llvm-project//llvm:AMDGPUCodeGen", "@llvm-project//llvm:Analysis", "@llvm-project//llvm:BitReader", "@llvm-project//llvm:BitWriter", @@ -52,7 +53,6 @@ cc_library( "@llvm-project//llvm:Scalar", "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", - "@llvm-project//llvm:amdgpu_code_gen", ], ) diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 2857de01ecc..88f007dff1d 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -411,10 +411,10 @@ cc_binary( linkopts = llvm_linkopts, stamp = 0, deps = [ + ":Support", + ":TableGen", ":config", - ":support", - ":tablegen", - ":utils_tablegen", + ":tblgen", ], ) @@ -428,7 +428,7 @@ cc_binary( copts = llvm_copts, linkopts = llvm_linkopts, stamp = 0, - deps = [":support"], + deps = [":Support"], ) llvm_target_list = [ @@ -606,24 +606,18 @@ gentbl( ]), ) -[[ - [gentbl( - name = target["name"] + "CommonTableGen", - tbl_outs = target["tbl_outs"], - tblgen = ":llvm-tblgen", - td_file = "lib/Target/" + target["dir_name"] + "/" + target["short_name"] + ".td", - td_srcs = [ - ":common_target_td_sources", - ] + glob([ - "lib/Target/" + target["dir_name"] + "/*.td", - ]), - deps = target.get("tbl_deps", []), - )], - [alias( - name = target["lower_name"] + "_target_gen", - actual = target["name"] + "CommonTableGen", - )], -] for target in llvm_target_list] +[gentbl( + name = target["name"] + "CommonTableGen", + tbl_outs = target["tbl_outs"], + tblgen = ":llvm-tblgen", + td_file = "lib/Target/" + target["dir_name"] + "/" + target["short_name"] + ".td", + td_srcs = [ + ":common_target_td_sources", + ] + glob([ + "lib/Target/" + target["dir_name"] + "/*.td", + ]), + deps = target.get("tbl_deps", []), +) for target in llvm_target_list] # This target is used to provide *.def files to x86_code_gen. # Files with '.def' extension are not allowed in 'srcs' of 'cc_library' rule. 
@@ -660,7 +654,7 @@ cc_binary( copts = llvm_copts, linkopts = llvm_linkopts, deps = [ - ":support", + ":Support", ], ) @@ -698,11 +692,6 @@ cc_library( ], ) -alias( - name = "aarch64_asm_parser", - actual = ":AArch64AsmParser", -) - cc_library( name = "AArch64CodeGen", srcs = glob([ @@ -737,11 +726,6 @@ cc_library( ], ) -alias( - name = "aarch64_code_gen", - actual = ":AArch64CodeGen", -) - cc_library( name = "AArch64Desc", srcs = glob([ @@ -757,12 +741,12 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/AArch64"], deps = [ + ":AArch64CommonTableGen", ":AArch64Info", ":AArch64Utils", ":BinaryFormat", ":MC", ":Support", - ":aarch64_target_gen", ":attributes_gen", ":config", ":intrinsic_enums_gen", @@ -770,11 +754,6 @@ cc_library( ], ) -alias( - name = "aarch64_desc", - actual = ":AArch64Desc", -) - cc_library( name = "AArch64Disassembler", srcs = glob([ @@ -800,11 +779,6 @@ cc_library( ], ) -alias( - name = "aarch64_disassembler", - actual = ":AArch64Disassembler", -) - cc_library( name = "AArch64Info", srcs = glob([ @@ -823,18 +797,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/AArch64"], deps = [ + ":CodeGen", ":Support", - ":code_gen", + ":Target", ":config", - ":target", ], ) -alias( - name = "aarch64_info", - actual = ":AArch64Info", -) - cc_library( name = "AArch64Utils", srcs = glob([ @@ -851,18 +820,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/AArch64"], deps = [ + ":AArch64CommonTableGen", + ":MC", ":Support", - ":aarch64_target_gen", ":config", - ":mc", ], ) -alias( - name = "aarch64_utils", - actual = ":AArch64Utils", -) - cc_library( name = "AMDGPUAsmParser", srcs = glob([ @@ -888,11 +852,6 @@ cc_library( ], ) -alias( - name = "amdgpu_asm_parser", - actual = ":AMDGPUAsmParser", -) - cc_library( name = "AMDGPUCodeGen", srcs = glob([ @@ -930,11 +889,6 @@ cc_library( ], ) -alias( - name = "amdgpu_code_gen", - actual = ":AMDGPUCodeGen", -) - cc_library( name = "AMDGPUDesc", srcs = glob([ @@ -960,11 +914,6 @@ cc_library( ], ) -alias( - name = "amdgpu_desc", - actual = ":AMDGPUDesc", -) - cc_library( name = "AMDGPUDisassembler", srcs = glob([ @@ -990,11 +939,6 @@ cc_library( ], ) -alias( - name = "amdgpu_disassembler", - actual = ":AMDGPUDisassembler", -) - cc_library( name = "AMDGPUInfo", srcs = glob([ @@ -1010,19 +954,14 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/AMDGPU"], deps = [ + ":AMDGPUCommonTableGen", + ":Core", ":Support", - ":amdgpu_target_gen", ":config", - ":core", ":r600_target_gen", ], ) -alias( - name = "amdgpu_info", - actual = ":AMDGPUInfo", -) - cc_library( name = "AMDGPUUtils", srcs = glob([ @@ -1038,21 +977,16 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/AMDGPU"], deps = [ + ":AMDGPUCommonTableGen", ":BinaryFormat", ":Core", ":MC", ":Support", - ":amdgpu_target_gen", ":config", ":r600_target_gen", ], ) -alias( - name = "amdgpu_utils", - actual = ":AMDGPUUtils", -) - cc_library( name = "ARCCodeGen", srcs = glob([ @@ -1083,11 +1017,6 @@ cc_library( ], ) -alias( - name = "arc_code_gen", - actual = ":ARCCodeGen", -) - cc_library( name = "ARCDesc", srcs = glob([ @@ -1110,11 +1039,6 @@ cc_library( ], ) -alias( - name = "arc_desc", - actual = ":ARCDesc", -) - cc_library( name = "ARCDisassembler", srcs = glob([ @@ -1137,11 +1061,6 @@ cc_library( ], ) -alias( - name = "arc_disassembler", - actual = ":ARCDisassembler", -) - cc_library( name = "ARCInfo", srcs = glob([ @@ -1162,11 +1081,6 @@ 
cc_library( ], ) -alias( - name = "arc_info", - actual = ":ARCInfo", -) - cc_library( name = "ARMAsmParser", srcs = glob([ @@ -1192,11 +1106,6 @@ cc_library( ], ) -alias( - name = "arm_asm_parser", - actual = ":ARMAsmParser", -) - cc_library( name = "ARMCodeGen", srcs = glob([ @@ -1231,11 +1140,6 @@ cc_library( ], ) -alias( - name = "arm_code_gen", - actual = ":ARMCodeGen", -) - cc_library( name = "ARMDesc", srcs = glob([ @@ -1253,13 +1157,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/ARM"], deps = [ + ":ARMCommonTableGen", ":ARMInfo", ":ARMUtils", ":BinaryFormat", ":MC", ":MCDisassembler", ":Support", - ":arm_target_gen", ":attributes_gen", ":config", ":intrinsic_enums_gen", @@ -1267,11 +1171,6 @@ cc_library( ], ) -alias( - name = "arm_desc", - actual = ":ARMDesc", -) - cc_library( name = "ARMDisassembler", srcs = glob([ @@ -1296,11 +1195,6 @@ cc_library( ], ) -alias( - name = "arm_disassembler", - actual = ":ARMDisassembler", -) - cc_library( name = "ARMInfo", srcs = glob([ @@ -1317,18 +1211,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/ARM"], deps = [ + ":ARMCommonTableGen", ":Support", - ":arm_target_gen", + ":Target", ":config", - ":target", ], ) -alias( - name = "arm_info", - actual = ":ARMInfo", -) - cc_library( name = "ARMUtils", srcs = glob([ @@ -1345,18 +1234,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/ARM"], deps = [ + ":ARMCommonTableGen", + ":MC", ":Support", - ":arm_target_gen", ":config", - ":mc", ], ) -alias( - name = "arm_utils", - actual = ":ARMUtils", -) - cc_library( name = "AVRAsmParser", srcs = glob([ @@ -1381,11 +1265,6 @@ cc_library( ], ) -alias( - name = "avr_asm_parser", - actual = ":AVRAsmParser", -) - cc_library( name = "AVRCodeGen", srcs = glob([ @@ -1414,11 +1293,6 @@ cc_library( ], ) -alias( - name = "avr_code_gen", - actual = ":AVRCodeGen", -) - cc_library( name = "AVRDesc", srcs = glob([ @@ -1441,11 +1315,6 @@ cc_library( ], ) -alias( - name = "avr_desc", - actual = ":AVRDesc", -) - cc_library( name = "AVRDisassembler", srcs = glob([ @@ -1468,11 +1337,6 @@ cc_library( ], ) -alias( - name = "avr_disassembler", - actual = ":AVRDisassembler", -) - cc_library( name = "AVRInfo", srcs = glob([ @@ -1493,11 +1357,6 @@ cc_library( ], ) -alias( - name = "avr_info", - actual = ":AVRInfo", -) - cc_library( name = "AggressiveInstCombine", srcs = glob([ @@ -1521,11 +1380,6 @@ cc_library( ], ) -alias( - name = "aggressive_inst_combine", - actual = ":AggressiveInstCombine", -) - cc_library( name = "Analysis", srcs = glob([ @@ -1552,11 +1406,6 @@ cc_library( ], ) -alias( - name = "analysis", - actual = ":Analysis", -) - cc_library( name = "AsmParser", srcs = glob([ @@ -1579,11 +1428,6 @@ cc_library( ], ) -alias( - name = "asm_parser", - actual = ":AsmParser", -) - cc_library( name = "AsmPrinter", srcs = glob([ @@ -1616,11 +1460,6 @@ cc_library( ], ) -alias( - name = "asm_printer", - actual = ":AsmPrinter", -) - cc_library( name = "BPFAsmParser", srcs = glob([ @@ -1645,11 +1484,6 @@ cc_library( ], ) -alias( - name = "bpf_asm_parser", - actual = ":BPFAsmParser", -) - cc_library( name = "BPFCodeGen", srcs = glob([ @@ -1678,11 +1512,6 @@ cc_library( ], ) -alias( - name = "bpf_code_gen", - actual = ":BPFCodeGen", -) - cc_library( name = "BPFDesc", srcs = glob([ @@ -1705,11 +1534,6 @@ cc_library( ], ) -alias( - name = "bpf_desc", - actual = ":BPFDesc", -) - cc_library( name = "BPFDisassembler", srcs = glob([ @@ -1732,11 +1556,6 @@ cc_library( ], ) -alias( - name 
= "bpf_disassembler", - actual = ":BPFDisassembler", -) - cc_library( name = "BPFInfo", srcs = glob([ @@ -1757,11 +1576,6 @@ cc_library( ], ) -alias( - name = "bpf_info", - actual = ":BPFInfo", -) - cc_library( name = "BinaryFormat", srcs = glob([ @@ -1784,11 +1598,6 @@ cc_library( ], ) -alias( - name = "binary_format", - actual = ":BinaryFormat", -) - cc_library( name = "BitReader", srcs = glob([ @@ -1812,11 +1621,6 @@ cc_library( ], ) -alias( - name = "bit_reader", - actual = ":BitReader", -) - cc_library( name = "BitWriter", srcs = glob([ @@ -1844,11 +1648,6 @@ cc_library( ], ) -alias( - name = "bit_writer", - actual = ":BitWriter", -) - cc_library( name = "BitstreamReader", srcs = glob([ @@ -1869,11 +1668,6 @@ cc_library( ], ) -alias( - name = "bitstream_reader", - actual = ":BitstreamReader", -) - cc_library( name = "CFGuard", srcs = glob([ @@ -1895,11 +1689,6 @@ cc_library( ], ) -alias( - name = "cf_guard", - actual = ":CFGuard", -) - cc_library( name = "CodeGen", srcs = glob([ @@ -1920,6 +1709,7 @@ cc_library( ":BitReader", ":BitWriter", ":Core", + ":Instrumentation", ":MC", ":ProfileData", ":Scalar", @@ -1927,15 +1717,9 @@ cc_library( ":Target", ":TransformUtils", ":config", - ":instrumentation", ], ) -alias( - name = "code_gen", - actual = ":CodeGen", -) - cc_library( name = "Core", srcs = glob([ @@ -1983,11 +1767,6 @@ cc_library( ], ) -alias( - name = "core", - actual = ":Core", -) - cc_library( name = "Coroutines", srcs = glob([ @@ -2013,11 +1792,6 @@ cc_library( ], ) -alias( - name = "coroutines", - actual = ":Coroutines", -) - cc_library( name = "Coverage", srcs = glob([ @@ -2041,11 +1815,6 @@ cc_library( ], ) -alias( - name = "coverage", - actual = ":Coverage", -) - cc_library( name = "DWARFLinker", srcs = glob([ @@ -2071,11 +1840,6 @@ cc_library( ], ) -alias( - name = "dwarf_linker", - actual = ":DWARFLinker", -) - cc_library( name = "DebugInfoCodeView", srcs = glob([ @@ -2091,18 +1855,13 @@ cc_library( ]), copts = llvm_copts, deps = [ + ":BinaryFormat", ":DebugInfoMSF", ":Support", - ":binary_format", ":config", ], ) -alias( - name = "debug_info_code_view", - actual = ":DebugInfoCodeView", -) - cc_library( name = "DebugInfoDWARF", srcs = glob([ @@ -2126,11 +1885,6 @@ cc_library( ], ) -alias( - name = "debug_info_dwarf", - actual = ":DebugInfoDWARF", -) - cc_library( name = "DebugInfoGSYM", srcs = glob([ @@ -2154,11 +1908,6 @@ cc_library( ], ) -alias( - name = "debug_info_gsym", - actual = ":DebugInfoGSYM", -) - cc_library( name = "DebugInfoMSF", srcs = glob([ @@ -2179,11 +1928,6 @@ cc_library( ], ) -alias( - name = "debug_info_msf", - actual = ":DebugInfoMSF", -) - cc_library( name = "DebugInfoPDB", srcs = glob([ @@ -2208,11 +1952,6 @@ cc_library( ], ) -alias( - name = "debug_info_pdb", - actual = ":DebugInfoPDB", -) - cc_library( name = "Demangle", srcs = glob([ @@ -2230,11 +1969,6 @@ cc_library( deps = [":config"], ) -alias( - name = "demangle", - actual = ":Demangle", -) - cc_library( name = "DlltoolDriver", srcs = glob([ @@ -2257,11 +1991,6 @@ cc_library( ], ) -alias( - name = "dlltool_driver", - actual = ":DlltoolDriver", -) - cc_library( name = "ExecutionEngine", srcs = glob([ @@ -2287,11 +2016,6 @@ cc_library( ], ) -alias( - name = "execution_engine", - actual = ":ExecutionEngine", -) - cc_library( name = "Extensions", srcs = glob([ @@ -2309,11 +2033,6 @@ cc_library( deps = [":config"], ) -alias( - name = "extensions", - actual = ":Extensions", -) - cc_library( name = "FrontendOpenMP", srcs = glob([ @@ -2336,11 +2055,6 @@ cc_library( ], ) -alias( - name = 
"frontend_open_mp", - actual = ":FrontendOpenMP", -) - cc_library( name = "FuzzMutate", srcs = glob([ @@ -2367,11 +2081,6 @@ cc_library( ], ) -alias( - name = "fuzz_mutate", - actual = ":FuzzMutate", -) - cc_library( name = "GlobalISel", srcs = glob([ @@ -2399,11 +2108,6 @@ cc_library( ], ) -alias( - name = "global_i_sel", - actual = ":GlobalISel", -) - cc_library( name = "HexagonAsmParser", srcs = glob([ @@ -2428,11 +2132,6 @@ cc_library( ], ) -alias( - name = "hexagon_asm_parser", - actual = ":HexagonAsmParser", -) - cc_library( name = "HexagonCodeGen", srcs = glob([ @@ -2466,11 +2165,6 @@ cc_library( ], ) -alias( - name = "hexagon_code_gen", - actual = ":HexagonCodeGen", -) - cc_library( name = "HexagonDesc", srcs = glob([ @@ -2493,11 +2187,6 @@ cc_library( ], ) -alias( - name = "hexagon_desc", - actual = ":HexagonDesc", -) - cc_library( name = "HexagonDisassembler", srcs = glob([ @@ -2522,11 +2211,6 @@ cc_library( ], ) -alias( - name = "hexagon_disassembler", - actual = ":HexagonDisassembler", -) - cc_library( name = "HexagonInfo", srcs = glob([ @@ -2547,11 +2231,6 @@ cc_library( ], ) -alias( - name = "hexagon_info", - actual = ":HexagonInfo", -) - cc_library( name = "IPO", srcs = glob([ @@ -2590,11 +2269,6 @@ cc_library( ], ) -alias( - name = "ipo", - actual = ":IPO", -) - cc_library( name = "IRReader", srcs = glob([ @@ -2618,11 +2292,6 @@ cc_library( ], ) -alias( - name = "ir_reader", - actual = ":IRReader", -) - cc_library( name = "InstCombine", srcs = glob([ @@ -2640,18 +2309,13 @@ cc_library( deps = [ ":Analysis", ":Core", + ":InstCombineTableGen", ":Support", ":TransformUtils", ":config", - ":instcombine_transforms_gen", ], ) -alias( - name = "inst_combine", - actual = ":InstCombine", -) - cc_library( name = "Instrumentation", srcs = glob([ @@ -2681,11 +2345,6 @@ cc_library( ], ) -alias( - name = "instrumentation", - actual = ":Instrumentation", -) - cc_library( name = "Interpreter", srcs = glob([ @@ -2709,11 +2368,6 @@ cc_library( ], ) -alias( - name = "interpreter", - actual = ":Interpreter", -) - cc_library( name = "JITLink", srcs = glob([ @@ -2736,11 +2390,6 @@ cc_library( ], ) -alias( - name = "jit_link", - actual = ":JITLink", -) - cc_library( name = "LTO", srcs = glob([ @@ -2780,11 +2429,6 @@ cc_library( ], ) -alias( - name = "lto", - actual = ":LTO", -) - cc_library( name = "LanaiAsmParser", srcs = glob([ @@ -2809,11 +2453,6 @@ cc_library( ], ) -alias( - name = "lanai_asm_parser", - actual = ":LanaiAsmParser", -) - cc_library( name = "LanaiCodeGen", srcs = glob([ @@ -2845,11 +2484,6 @@ cc_library( ], ) -alias( - name = "lanai_code_gen", - actual = ":LanaiCodeGen", -) - cc_library( name = "LanaiDesc", srcs = glob([ @@ -2873,11 +2507,6 @@ cc_library( ], ) -alias( - name = "lanai_desc", - actual = ":LanaiDesc", -) - cc_library( name = "LanaiDisassembler", srcs = glob([ @@ -2902,11 +2531,6 @@ cc_library( ], ) -alias( - name = "lanai_disassembler", - actual = ":LanaiDisassembler", -) - cc_library( name = "LanaiInfo", srcs = glob([ @@ -2927,11 +2551,6 @@ cc_library( ], ) -alias( - name = "lanai_info", - actual = ":LanaiInfo", -) - cc_library( name = "LibDriver", srcs = glob([ @@ -2956,11 +2575,6 @@ cc_library( ], ) -alias( - name = "lib_driver", - actual = ":LibDriver", -) - cc_library( name = "LineEditor", srcs = glob([ @@ -2981,11 +2595,6 @@ cc_library( ], ) -alias( - name = "line_editor", - actual = ":LineEditor", -) - cc_library( name = "Linker", srcs = glob([ @@ -3008,11 +2617,6 @@ cc_library( ], ) -alias( - name = "linker", - actual = ":Linker", -) - cc_library( name = 
"MC", srcs = glob([ @@ -3031,17 +2635,10 @@ cc_library( ":BinaryFormat", ":DebugInfoCodeView", ":Support", - ":binary_format", ":config", - ":debug_info_code_view", ], ) -alias( - name = "mc", - actual = ":MC", -) - cc_library( name = "MCA", srcs = glob([ @@ -3063,11 +2660,6 @@ cc_library( ], ) -alias( - name = "mca", - actual = ":MCA", -) - cc_library( name = "MCDisassembler", srcs = glob([ @@ -3089,11 +2681,6 @@ cc_library( ], ) -alias( - name = "mc_disassembler", - actual = ":MCDisassembler", -) - cc_library( name = "MCJIT", srcs = glob([ @@ -3119,11 +2706,6 @@ cc_library( ], ) -alias( - name = "mcjit", - actual = ":MCJIT", -) - cc_library( name = "MCParser", srcs = glob([ @@ -3145,11 +2727,6 @@ cc_library( ], ) -alias( - name = "mc_parser", - actual = ":MCParser", -) - cc_library( name = "MIRParser", srcs = glob([ @@ -3176,11 +2753,6 @@ cc_library( ], ) -alias( - name = "mir_parser", - actual = ":MIRParser", -) - cc_library( name = "MLPolicies", srcs = glob([ @@ -3202,11 +2774,6 @@ cc_library( ], ) -alias( - name = "ml_policies", - actual = ":MLPolicies", -) - cc_library( name = "MSP430AsmParser", srcs = glob([ @@ -3231,11 +2798,6 @@ cc_library( ], ) -alias( - name = "msp430_asm_parser", - actual = ":MSP430AsmParser", -) - cc_library( name = "MSP430CodeGen", srcs = glob([ @@ -3264,11 +2826,6 @@ cc_library( ], ) -alias( - name = "msp430_code_gen", - actual = ":MSP430CodeGen", -) - cc_library( name = "MSP430Desc", srcs = glob([ @@ -3291,11 +2848,6 @@ cc_library( ], ) -alias( - name = "msp430_desc", - actual = ":MSP430Desc", -) - cc_library( name = "MSP430Disassembler", srcs = glob([ @@ -3318,11 +2870,6 @@ cc_library( ], ) -alias( - name = "msp430_disassembler", - actual = ":MSP430Disassembler", -) - cc_library( name = "MSP430Info", srcs = glob([ @@ -3343,11 +2890,6 @@ cc_library( ], ) -alias( - name = "msp430_info", - actual = ":MSP430Info", -) - cc_library( name = "MipsAsmParser", srcs = glob([ @@ -3372,11 +2914,6 @@ cc_library( ], ) -alias( - name = "mips_asm_parser", - actual = ":MipsAsmParser", -) - cc_library( name = "MipsCodeGen", srcs = glob([ @@ -3407,11 +2944,6 @@ cc_library( ], ) -alias( - name = "mips_code_gen", - actual = ":MipsCodeGen", -) - cc_library( name = "MipsDesc", srcs = glob([ @@ -3434,11 +2966,6 @@ cc_library( ], ) -alias( - name = "mips_desc", - actual = ":MipsDesc", -) - cc_library( name = "MipsDisassembler", srcs = glob([ @@ -3461,11 +2988,6 @@ cc_library( ], ) -alias( - name = "mips_disassembler", - actual = ":MipsDisassembler", -) - cc_library( name = "MipsInfo", srcs = glob([ @@ -3486,11 +3008,6 @@ cc_library( ], ) -alias( - name = "mips_info", - actual = ":MipsInfo", -) - cc_library( name = "NVPTXCodeGen", srcs = glob([ @@ -3524,11 +3041,6 @@ cc_library( ], ) -alias( - name = "nvptx_code_gen", - actual = ":NVPTXCodeGen", -) - cc_library( name = "NVPTXDesc", srcs = glob([ @@ -3544,19 +3056,14 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/NVPTX"], deps = [ - "nvptx_target_gen", ":MC", + ":NVPTXCommonTableGen", ":NVPTXInfo", ":Support", ":config", ], ) -alias( - name = "nvptx_desc", - actual = ":NVPTXDesc", -) - cc_library( name = "NVPTXInfo", srcs = glob([ @@ -3574,20 +3081,15 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/NVPTX"], deps = [ - "nvptx_target_gen", + ":Core", + ":NVPTXCommonTableGen", ":Support", + ":Target", ":attributes_gen", ":config", - ":core", - ":target", ], ) -alias( - name = "nvptx_info", - actual = ":NVPTXInfo", -) - cc_library( name = "ObjCARC", srcs = glob([ @@ 
-3612,11 +3114,6 @@ cc_library( ], ) -alias( - name = "objc_arc", - actual = ":ObjCARC", -) - cc_library( name = "Object", srcs = glob([ @@ -3643,11 +3140,6 @@ cc_library( ], ) -alias( - name = "object", - actual = ":Object", -) - cc_library( name = "ObjectYAML", srcs = glob([ @@ -3671,11 +3163,6 @@ cc_library( ], ) -alias( - name = "object_yaml", - actual = ":ObjectYAML", -) - cc_library( name = "Option", srcs = glob([ @@ -3696,11 +3183,6 @@ cc_library( ], ) -alias( - name = "option", - actual = ":Option", -) - cc_library( name = "OrcError", srcs = glob([ @@ -3721,11 +3203,6 @@ cc_library( ], ) -alias( - name = "orc_error", - actual = ":OrcError", -) - cc_library( name = "OrcJIT", srcs = glob([ @@ -3756,11 +3233,6 @@ cc_library( ], ) -alias( - name = "orc_jit", - actual = ":OrcJIT", -) - cc_library( name = "Passes", srcs = glob([ @@ -3794,11 +3266,6 @@ cc_library( ], ) -alias( - name = "passes", - actual = ":Passes", -) - cc_library( name = "PowerPCAsmParser", srcs = glob([ @@ -3823,11 +3290,6 @@ cc_library( ], ) -alias( - name = "powerpc_asm_parser", - actual = ":PowerPCAsmParser", -) - cc_library( name = "PowerPCCodeGen", srcs = glob([ @@ -3859,11 +3321,6 @@ cc_library( ], ) -alias( - name = "powerpc_code_gen", - actual = ":PowerPCCodeGen", -) - cc_library( name = "PowerPCDesc", srcs = glob([ @@ -3881,21 +3338,16 @@ cc_library( deps = [ ":BinaryFormat", ":MC", + ":PowerPCCommonTableGen", ":PowerPCInfo", ":Support", ":attributes_gen", ":config", ":intrinsic_enums_gen", ":intrinsics_impl_gen", - ":powerpc_target_gen", ], ) -alias( - name = "powerpc_desc", - actual = ":PowerPCDesc", -) - cc_library( name = "PowerPCDisassembler", srcs = glob([ @@ -3918,11 +3370,6 @@ cc_library( ], ) -alias( - name = "powerpc_disassembler", - actual = ":PowerPCDisassembler", -) - cc_library( name = "PowerPCInfo", srcs = glob([ @@ -3940,20 +3387,15 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/PowerPC"], deps = [ + ":Core", + ":PowerPCCommonTableGen", ":Support", + ":Target", ":attributes_gen", ":config", - ":core", - ":powerpc_target_gen", - ":target", ], ) -alias( - name = "powerpc_info", - actual = ":PowerPCInfo", -) - cc_library( name = "ProfileData", srcs = glob([ @@ -3975,11 +3417,6 @@ cc_library( ], ) -alias( - name = "profile_data", - actual = ":ProfileData", -) - cc_library( name = "RISCVAsmParser", srcs = glob([ @@ -4005,11 +3442,6 @@ cc_library( ], ) -alias( - name = "riscv_asm_parser", - actual = ":RISCVAsmParser", -) - cc_library( name = "RISCVCodeGen", srcs = glob([ @@ -4041,11 +3473,6 @@ cc_library( ], ) -alias( - name = "riscv_code_gen", - actual = ":RISCVCodeGen", -) - cc_library( name = "RISCVDesc", srcs = glob([ @@ -4069,11 +3496,6 @@ cc_library( ], ) -alias( - name = "riscv_desc", - actual = ":RISCVDesc", -) - cc_library( name = "RISCVDisassembler", srcs = glob([ @@ -4096,11 +3518,6 @@ cc_library( ], ) -alias( - name = "riscv_disassembler", - actual = ":RISCVDisassembler", -) - cc_library( name = "RISCVInfo", srcs = glob([ @@ -4121,11 +3538,6 @@ cc_library( ], ) -alias( - name = "riscv_info", - actual = ":RISCVInfo", -) - cc_library( name = "RISCVUtils", srcs = glob([ @@ -4146,11 +3558,6 @@ cc_library( ], ) -alias( - name = "riscv_utils", - actual = ":RISCVUtils", -) - cc_library( name = "Remarks", srcs = glob([ @@ -4172,11 +3579,6 @@ cc_library( ], ) -alias( - name = "remarks", - actual = ":Remarks", -) - cc_library( name = "RuntimeDyld", srcs = glob([ @@ -4201,18 +3603,13 @@ cc_library( copts = llvm_copts, deps = [ ":MC", + ":MCDisassembler", 
":Object", ":Support", ":config", - ":mc_disassembler", ], ) -alias( - name = "runtime_dyld", - actual = ":RuntimeDyld", -) - cc_library( name = "Scalar", srcs = glob([ @@ -4238,17 +3635,12 @@ cc_library( ":Core", ":InstCombine", ":Support", + ":Target", ":TransformUtils", ":config", - ":target", ], ) -alias( - name = "scalar", - actual = ":Scalar", -) - cc_library( name = "SelectionDAG", srcs = glob([ @@ -4275,11 +3667,6 @@ cc_library( ], ) -alias( - name = "selection_dag", - actual = ":SelectionDAG", -) - cc_library( name = "SparcAsmParser", srcs = glob([ @@ -4304,11 +3691,6 @@ cc_library( ], ) -alias( - name = "sparc_asm_parser", - actual = ":SparcAsmParser", -) - cc_library( name = "SparcCodeGen", srcs = glob([ @@ -4337,11 +3719,6 @@ cc_library( ], ) -alias( - name = "sparc_code_gen", - actual = ":SparcCodeGen", -) - cc_library( name = "SparcDesc", srcs = glob([ @@ -4364,11 +3741,6 @@ cc_library( ], ) -alias( - name = "sparc_desc", - actual = ":SparcDesc", -) - cc_library( name = "SparcDisassembler", srcs = glob([ @@ -4391,11 +3763,6 @@ cc_library( ], ) -alias( - name = "sparc_disassembler", - actual = ":SparcDisassembler", -) - cc_library( name = "SparcInfo", srcs = glob([ @@ -4416,11 +3783,6 @@ cc_library( ], ) -alias( - name = "sparc_info", - actual = ":SparcInfo", -) - cc_library( name = "Support", srcs = glob([ @@ -4452,11 +3814,6 @@ cc_library( ], ) -alias( - name = "support", - actual = ":Support", -) - cc_library( name = "Symbolize", srcs = glob([ @@ -4481,11 +3838,6 @@ cc_library( ], ) -alias( - name = "symbolize", - actual = ":Symbolize", -) - cc_library( name = "SystemZAsmParser", srcs = glob([ @@ -4510,11 +3862,6 @@ cc_library( ], ) -alias( - name = "system_z_asm_parser", - actual = ":SystemZAsmParser", -) - cc_library( name = "SystemZCodeGen", srcs = glob([ @@ -4545,11 +3892,6 @@ cc_library( ], ) -alias( - name = "system_z_code_gen", - actual = ":SystemZCodeGen", -) - cc_library( name = "SystemZDesc", srcs = glob([ @@ -4572,11 +3914,6 @@ cc_library( ], ) -alias( - name = "system_z_desc", - actual = ":SystemZDesc", -) - cc_library( name = "SystemZDisassembler", srcs = glob([ @@ -4601,11 +3938,6 @@ cc_library( ], ) -alias( - name = "system_z_disassembler", - actual = ":SystemZDisassembler", -) - cc_library( name = "SystemZInfo", srcs = glob([ @@ -4626,11 +3958,6 @@ cc_library( ], ) -alias( - name = "system_z_info", - actual = ":SystemZInfo", -) - cc_library( name = "TableGen", srcs = glob([ @@ -4648,17 +3975,12 @@ cc_library( ]), copts = llvm_copts, deps = [ + ":MC", ":Support", ":config", - ":mc", ], ) -alias( - name = "tablegen", - actual = ":TableGen", -) - cc_library( name = "Target", srcs = glob([ @@ -4687,11 +4009,6 @@ cc_library( ], ) -alias( - name = "target", - actual = ":Target", -) - cc_library( name = "TestingSupport", srcs = glob([ @@ -4712,11 +4029,6 @@ cc_library( ], ) -alias( - name = "testing_support", - actual = ":TestingSupport", -) - cc_library( name = "TextAPI", srcs = glob([ @@ -4752,11 +4064,6 @@ cc_library( ], ) -alias( - name = "text_api", - actual = ":TextAPI", -) - cc_library( name = "TransformUtils", srcs = glob([ @@ -4781,11 +4088,6 @@ cc_library( ], ) -alias( - name = "transform_utils", - actual = ":TransformUtils", -) - cc_library( name = "VEAsmParser", srcs = glob([ @@ -4810,11 +4112,6 @@ cc_library( ], ) -alias( - name = "ve_asm_parser", - actual = ":VEAsmParser", -) - cc_library( name = "VECodeGen", srcs = glob([ @@ -4844,11 +4141,6 @@ cc_library( ], ) -alias( - name = "ve_code_gen", - actual = ":VECodeGen", -) - cc_library( name = "VEDesc", 
srcs = glob([ @@ -4871,11 +4163,6 @@ cc_library( ], ) -alias( - name = "ve_desc", - actual = ":VEDesc", -) - cc_library( name = "VEDisassembler", srcs = glob([ @@ -4898,11 +4185,6 @@ cc_library( ], ) -alias( - name = "ve_disassembler", - actual = ":VEDisassembler", -) - cc_library( name = "VEInfo", srcs = glob([ @@ -4923,11 +4205,6 @@ cc_library( ], ) -alias( - name = "ve_info", - actual = ":VEInfo", -) - cc_library( name = "Vectorize", srcs = glob([ @@ -4947,18 +4224,13 @@ cc_library( deps = [ ":Analysis", ":Core", + ":Scalar", ":Support", ":TransformUtils", ":config", - ":scalar", ], ) -alias( - name = "vectorize", - actual = ":Vectorize", -) - cc_library( name = "WebAssemblyAsmParser", srcs = glob([ @@ -4982,11 +4254,6 @@ cc_library( ], ) -alias( - name = "web_assembly_asm_parser", - actual = ":WebAssemblyAsmParser", -) - cc_library( name = "WebAssemblyCodeGen", srcs = glob([ @@ -5019,11 +4286,6 @@ cc_library( ], ) -alias( - name = "web_assembly_code_gen", - actual = ":WebAssemblyCodeGen", -) - cc_library( name = "WebAssemblyDesc", srcs = glob([ @@ -5046,11 +4308,6 @@ cc_library( ], ) -alias( - name = "web_assembly_desc", - actual = ":WebAssemblyDesc", -) - cc_library( name = "WebAssemblyDisassembler", srcs = glob([ @@ -5075,11 +4332,6 @@ cc_library( ], ) -alias( - name = "web_assembly_disassembler", - actual = ":WebAssemblyDisassembler", -) - cc_library( name = "WebAssemblyInfo", srcs = glob([ @@ -5100,11 +4352,6 @@ cc_library( ], ) -alias( - name = "web_assembly_info", - actual = ":WebAssemblyInfo", -) - cc_library( name = "WindowsManifest", srcs = glob([ @@ -5125,11 +4372,6 @@ cc_library( ], ) -alias( - name = "windows_manifest", - actual = ":WindowsManifest", -) - cc_library( name = "X86AsmParser", srcs = glob([ @@ -5154,11 +4396,6 @@ cc_library( ], ) -alias( - name = "x86_asm_parser", - actual = ":X86AsmParser", -) - cc_library( name = "X86CodeGen", srcs = glob([ @@ -5192,11 +4429,6 @@ cc_library( ], ) -alias( - name = "x86_code_gen", - actual = ":X86CodeGen", -) - cc_library( name = "X86Desc", srcs = glob([ @@ -5221,11 +4453,6 @@ cc_library( ], ) -alias( - name = "x86_desc", - actual = ":X86Desc", -) - cc_library( name = "X86Disassembler", srcs = glob([ @@ -5248,11 +4475,6 @@ cc_library( ], ) -alias( - name = "x86_disassembler", - actual = ":X86Disassembler", -) - cc_library( name = "X86Info", srcs = glob([ @@ -5269,18 +4491,13 @@ cc_library( ]), copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/X86"], deps = [ + ":MC", ":Support", + ":X86CommonTableGen", ":config", - ":mc", - ":x86_target_gen", ], ) -alias( - name = "x86_info", - actual = ":X86Info", -) - cc_library( name = "XCoreCodeGen", srcs = glob([ @@ -5311,11 +4528,6 @@ cc_library( ], ) -alias( - name = "x_core_code_gen", - actual = ":XCoreCodeGen", -) - cc_library( name = "XCoreDesc", srcs = glob([ @@ -5338,11 +4550,6 @@ cc_library( ], ) -alias( - name = "x_core_desc", - actual = ":XCoreDesc", -) - cc_library( name = "XCoreDisassembler", srcs = glob([ @@ -5365,11 +4572,6 @@ cc_library( ], ) -alias( - name = "x_core_disassembler", - actual = ":XCoreDisassembler", -) - cc_library( name = "XCoreInfo", srcs = glob([ @@ -5390,11 +4592,6 @@ cc_library( ], ) -alias( - name = "x_core_info", - actual = ":XCoreInfo", -) - cc_library( name = "XRay", srcs = glob([ @@ -5416,11 +4613,6 @@ cc_library( ], ) -alias( - name = "x_ray", - actual = ":XRay", -) - cc_library( name = "gtest", srcs = glob([ @@ -5460,113 +4652,3 @@ cc_library( ":gtest", ], ) - -alias( - name = "aarch64_target", - actual = ":aarch64_code_gen", -) - 
-alias( - name = "aarch64_target_disassembler", - actual = ":aarch64_disassembler", -) - -alias( - name = "arm_target", - actual = ":arm_code_gen", -) - -alias( - name = "arm_target_disassembler", - actual = ":arm_disassembler", -) - -alias( - name = "codegen", - actual = ":code_gen", -) - -alias( - name = "frontend_openmp", - actual = ":frontend_open_mp", -) - -alias( - name = "ipo_transforms", - actual = ":ipo", -) - -alias( - name = "ir", - actual = ":core", -) - -alias( - name = "machine_code", - actual = ":mc", -) - -alias( - name = "machine_code_disassembler", - actual = ":mc_disassembler", -) - -alias( - name = "nvptx_target", - actual = ":nvptx_code_gen", -) - -alias( - name = "objcarc_transforms", - actual = ":objc_arc", -) - -alias( - name = "orcjit", - actual = ":orc_jit", -) - -alias( - name = "powerpc_target", - actual = ":powerpc_code_gen", -) - -alias( - name = "powerpc_target_disassembler", - actual = ":powerpc_disassembler", -) - -alias( - name = "scalar_transforms", - actual = ":scalar", -) - -alias( - name = "target_base", - actual = ":target", -) - -alias( - name = "x86_target", - actual = ":x86_code_gen", -) - -alias( - name = "x86_target_disassembler", - actual = ":x86_disassembler", -) - -alias( - name = "all_targets", - actual = ":AllTargetsCodeGens", -) - -alias( - name = "instcombine_transforms_gen", - actual = ":InstCombineTableGen", -) - -alias( - name = "utils_tablegen", - actual = ":tblgen", -) From 18e0e6450bc7b87a118565156ff90b1effee2118 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 15 Jun 2020 13:02:46 -0700 Subject: [PATCH 0195/1390] Prevent Keras dataset loading from affecting the global RNG seed. PiperOrigin-RevId: 316527944 Change-Id: I13fc997ffafc02f25b94e45265c7aa97b6efc6c4 --- tensorflow/python/keras/datasets/boston_housing.py | 4 ++-- tensorflow/python/keras/datasets/reuters.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py index 2c0badfefba..8886634a4b7 100644 --- a/tensorflow/python/keras/datasets/boston_housing.py +++ b/tensorflow/python/keras/datasets/boston_housing.py @@ -67,9 +67,9 @@ def load_data(path='boston_housing.npz', test_split=0.2, seed=113): x = f['x'] y = f['y'] - np.random.seed(seed) + rng = np.random.RandomState(seed) indices = np.arange(len(x)) - np.random.shuffle(indices) + rng.shuffle(indices) x = x[indices] y = y[indices] diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py index 46ac9249637..b71440fd632 100644 --- a/tensorflow/python/keras/datasets/reuters.py +++ b/tensorflow/python/keras/datasets/reuters.py @@ -119,9 +119,9 @@ def load_data(path='reuters.npz', with np.load(path, allow_pickle=True) as f: xs, labels = f['x'], f['y'] - np.random.seed(seed) + rng = np.random.RandomState(seed) indices = np.arange(len(xs)) - np.random.shuffle(indices) + rng.shuffle(indices) xs = xs[indices] labels = labels[indices] From 2fe65568b1475f23afc2ee9387e8f447d5eda4d8 Mon Sep 17 00:00:00 2001 From: sshiddib Date: Mon, 15 Jun 2020 13:14:03 -0700 Subject: [PATCH 0196/1390] [Intel MKL] Adding DNNL ops (part 1) supporting threadpool work --- .../core/kernels/mkl_conv_grad_filter_ops.cc | 17 ++++++- .../core/kernels/mkl_conv_grad_input_ops.cc | 11 ++++- tensorflow/core/kernels/mkl_conv_ops.cc | 14 ++++++ .../core/kernels/mkl_fused_batch_norm_op.cc | 46 ++++++++++++++++++- .../core/kernels/mkl_pooling_ops_common.cc | 25 +++++++++- 
tensorflow/core/kernels/mkl_quantize_op.cc | 5 ++ tensorflow/core/kernels/mkl_relu_op.cc | 18 +++++++- tensorflow/core/kernels/mkl_slice_op.cc | 8 +++- tensorflow/core/kernels/mkl_softmax_op.cc | 8 +++- 9 files changed, 143 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index 4c3cea4b6ff..12581d0bfa5 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -114,6 +114,21 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { void Execute(const T* src_data, const T* diff_filter_data, const T* diff_bias_data, const T* diff_dst_data, std::shared_ptr bwd_filter_stream) { + // TODO: Create a common function and avoid the duplicate code +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *bwd_filter_stream); + context_.diff_filter_mem->set_data_handle( + static_cast(const_cast(diff_filter_data)), + *bwd_filter_stream); + if (diff_bias_data != nullptr) { + context_.diff_bias_mem->set_data_handle( + static_cast(const_cast(diff_bias_data)), + *bwd_filter_stream); + } + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data)), *bwd_filter_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.diff_filter_mem->set_data_handle( @@ -124,7 +139,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { } context_.diff_dst_mem->set_data_handle( static_cast(const_cast(diff_dst_data))); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 execute_primitives(context_.bwd_filter_primitives, bwd_filter_stream, context_.bwd_filter_primitives_args); diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index f9c8d11c67c..7177431029a 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -116,13 +116,22 @@ class MklConvBwdInputPrimitive : public MklPrimitive { void Execute(const T* diff_src_data, const T* filter_data, const T* diff_dst_data, std::shared_ptr bwd_input_stream) { + // TODO: Create a common function and avoid the duplicate code +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.diff_src_mem->set_data_handle( + static_cast(const_cast(diff_src_data)), *bwd_input_stream); + context_.filter_mem->set_data_handle( + static_cast(const_cast(filter_data)), *bwd_input_stream); + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data)), *bwd_input_stream); +#else context_.diff_src_mem->set_data_handle( static_cast(const_cast(diff_src_data))); context_.filter_mem->set_data_handle( static_cast(const_cast(filter_data))); context_.diff_dst_mem->set_data_handle( static_cast(const_cast(diff_dst_data))); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 execute_primitives(context_.bwd_input_primitives, bwd_input_stream, context_.bwd_input_primitives_args); diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 7d0510d03ac..210044436aa 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -110,6 +110,19 @@ class MklConvFwdPrimitive : public MklPrimitive { void Execute(const Tinput* src_data, const Tfilter* filter_data, const Tbias* bias_data, const Toutput* dst_data, std::shared_ptr fwd_stream) { + // TODO: Create a common function and avoid the duplicate code +#ifdef 
ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *fwd_stream); + context_.filter_mem->set_data_handle( + static_cast(const_cast(filter_data)), *fwd_stream); + if (bias_data != nullptr) { + context_.bias_mem->set_data_handle( + static_cast(const_cast(bias_data)), *fwd_stream); + } + context_.dst_mem->set_data_handle( + static_cast(const_cast(dst_data)), *fwd_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.filter_mem->set_data_handle( @@ -120,6 +133,7 @@ class MklConvFwdPrimitive : public MklPrimitive { } context_.dst_mem->set_data_handle( static_cast(const_cast(dst_data))); +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_primitives_args.size()); diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 954ae0492df..3b2c4f84039 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -94,6 +94,28 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { void Execute(const T* src_data, const U* weights_data, T* dst_data, U* mean_data, U* variance_data, std::shared_ptr fwd_stream, U* workspace_data) { + // TODO: Create a common function and avoid the duplicate code +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *fwd_stream); + context_.dst_mem->set_data_handle(static_cast(dst_data), + *fwd_stream); + + if (IS_SET(use_scale_shift)) + context_.weights_mem->set_data_handle( + static_cast(const_cast(weights_data)), *fwd_stream); + + if ((context_.pkind == prop_kind::forward_training) || + (IS_SET(use_global_stats))) { + context_.mean_mem->set_data_handle(static_cast(mean_data), + *fwd_stream); + context_.variance_mem->set_data_handle(static_cast(variance_data), + *fwd_stream); + } + if (workspace_data != nullptr) { + context_.ws_mem->set_data_handle(workspace_data, *fwd_stream); + } +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -110,6 +132,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { if (workspace_data != nullptr) { context_.ws_mem->set_data_handle(workspace_data); } +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 // Execute batch-normalization forward primitives. 
execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args); @@ -503,6 +526,27 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { const T* diff_dst_data, const U* weights_data, T* diff_src_data, U* diff_weights_data, U* res_space_data, std::shared_ptr bwd_stream) { + // TODO: Create a common function and avoid the duplicate code +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *bwd_stream); + context_.mean_mem->set_data_handle( + static_cast(const_cast(mean_data)), *bwd_stream); + context_.variance_mem->set_data_handle( + static_cast(const_cast(variance_data)), *bwd_stream); + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data)), *bwd_stream); + + if (IS_SET(use_scale_shift)) { + context_.weights_mem->set_data_handle( + static_cast(const_cast(weights_data)), *bwd_stream); + context_.diff_weights_mem->set_data_handle( + static_cast(diff_weights_data), *bwd_stream); + } + + context_.diff_src_mem->set_data_handle(static_cast(diff_src_data), + *bwd_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.mean_mem->set_data_handle( @@ -520,7 +564,7 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { } context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 // Execute backward batch-normalization primitives. DCHECK_EQ(context_.bwd_primitives.size(), context_.net_args.size()); diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index 2dfc6db0075..5f1c9129ec3 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -127,6 +127,17 @@ template void MklPoolingFwdPrimitive::Execute(const T* src_data, T* dst_data, void* ws_data, std::shared_ptr fwd_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *fwd_stream); + context_.dst_mem->set_data_handle(static_cast(dst_data), *fwd_stream); + if (context_.alg_kind == ALGORITHM::pooling_max && + context_.prop_kind == + prop_kind::forward_training) { // Max pooling must have workspace. 
+ DCHECK(ws_data != nullptr); + context_.ws_mem->set_data_handle(ws_data, *fwd_stream); + } +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -136,7 +147,7 @@ void MklPoolingFwdPrimitive::Execute(const T* src_data, T* dst_data, DCHECK(ws_data != nullptr); context_.ws_mem->set_data_handle(ws_data); } - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args); #else @@ -269,6 +280,16 @@ template void MklPoolingBwdPrimitive::Execute(const T* diff_dst_data, T* diff_src_data, const void* ws_data, std::shared_ptr bwd_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data)), *bwd_stream); + context_.diff_src_mem->set_data_handle(static_cast(diff_src_data), + *bwd_stream); + if (context_.alg_kind == ALGORITHM::pooling_max) { + DCHECK(ws_data != nullptr); + context_.ws_mem->set_data_handle(const_cast(ws_data), *bwd_stream); + } +#else context_.diff_dst_mem->set_data_handle( static_cast(const_cast(diff_dst_data))); context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); @@ -276,7 +297,7 @@ void MklPoolingBwdPrimitive::Execute(const T* diff_dst_data, DCHECK(ws_data != nullptr); context_.ws_mem->set_data_handle(const_cast(ws_data)); } - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 execute_primitives(context_.bwd_primitives, bwd_stream, context_.net_args); #else diff --git a/tensorflow/core/kernels/mkl_quantize_op.cc b/tensorflow/core/kernels/mkl_quantize_op.cc index 5adb9862250..177cbb43d0b 100644 --- a/tensorflow/core/kernels/mkl_quantize_op.cc +++ b/tensorflow/core/kernels/mkl_quantize_op.cc @@ -88,8 +88,13 @@ class MklReorderWithScalePrimitive : public MklPrimitive { void Execute(void* src_data, void* dst_data, std::shared_ptr reorder_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle(src_data, *reorder_stream); + context_.dst_mem->set_data_handle(dst_data, *reorder_stream); +#else context_.src_mem->set_data_handle(src_data); context_.dst_mem->set_data_handle(dst_data); +#endif // ENABLE_MKLDNN_THREADPOOL #ifndef ENABLE_MKLDNN_V1 reorder_stream->submit(context_.net); #else diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 784bbc682dc..9af580de777 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -79,10 +79,16 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { // dst_data: output data buffer of dst void Execute(const T* src_data, T* dst_data, std::shared_ptr fwd_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *fwd_stream); + context_.dst_mem->set_data_handle(static_cast(dst_data), + *fwd_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_primitives_args.size()); @@ -293,12 +299,20 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { // diff_src_data: output data buffer of diff_src void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data, std::shared_ptr bwd_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *bwd_stream); + 
context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data)), *bwd_stream); + context_.diff_src_mem->set_data_handle(static_cast(diff_src_data), + *bwd_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.diff_dst_mem->set_data_handle( static_cast(const_cast(diff_dst_data))); context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.bwd_primitives.size(), context_.bwd_primitives_args.size()); diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc index 4115691c79d..7e293e14d98 100644 --- a/tensorflow/core/kernels/mkl_slice_op.cc +++ b/tensorflow/core/kernels/mkl_slice_op.cc @@ -189,9 +189,15 @@ class MklSlicePrimitive : public MklPrimitive { void Execute(const MklSliceParams& sliceParams, std::shared_ptr slice_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle(sliceParams.from->get_data_handle(), + *slice_stream); + context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle(), + *slice_stream); +#else context_.src_mem->set_data_handle(sliceParams.from->get_data_handle()); context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle()); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 execute_primitives(context_.slice_primitives, slice_stream, context_.slice_primitives_args); diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index 4d1cf90f28d..2f51573fe13 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -59,10 +59,16 @@ class MklSoftmaxPrimitive : public MklPrimitive { // dst_data: output data buffer of dst void Execute(const T* src_data, T* dst_data, std::shared_ptr fwd_cpu_stream) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data)), *fwd_cpu_stream); + context_.dst_mem->set_data_handle(static_cast(dst_data), + *fwd_cpu_stream); +#else context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); - +#endif // ENABLE_MKLDNN_THREADPOOL #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_net_args.size()); execute_primitives(context_.fwd_primitives, fwd_cpu_stream, From 5016da312802f8372672ccbfa7a4207b8683a8e5 Mon Sep 17 00:00:00 2001 From: sshiddib Date: Mon, 15 Jun 2020 13:21:21 -0700 Subject: [PATCH 0197/1390] [Intel MKL] Adding DNNL ops (part 2) supporting threadpool work --- tensorflow/core/kernels/mkl_aggregate_ops.cc | 7 +++++-- tensorflow/core/kernels/mkl_concat_op.cc | 17 ++++++++++++++--- tensorflow/core/kernels/mkl_dequantize_op.cc | 7 +++++-- tensorflow/core/kernels/mkl_lrn_op.cc | 6 ++++-- tensorflow/core/kernels/mkl_transpose_op.cc | 5 +++-- tensorflow/core/util/mkl_util.h | 14 ++++++++++++-- 6 files changed, 43 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index ec5f80cb3fa..90e0ea9aa95 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -178,6 +178,9 @@ class MklAddNOp : public OpKernel { dnn_fmt = MklTensorFormatToMklDnnDataFormat(mkl_data_format); } + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(ctx, cpu_engine)); + // Create memory descriptor for MKL-DNN. 
// If all input in Tensorflow format, create block memory descriptor, // else convert TF format to MKL memory descriptor @@ -215,6 +218,7 @@ class MklAddNOp : public OpKernel { srcs_pd.push_back(memory::primitive_desc(md, cpu_engine)); #endif src.SetUsrMem(md, &src_tensor); + src.SetUsrMemDataHandle(&src_tensor, fwd_cpu_stream); inputs.push_back(src.GetOpMem()); } @@ -240,11 +244,10 @@ class MklAddNOp : public OpKernel { } AllocateOutputSetMklShape(ctx, kOutputIdx, &dst_tensor, output_tf_shape, output_mkl_shape); - dst.SetUsrMemDataHandle(dst_tensor); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); // Create Sum op, and submit net for execution. std::vector net; - stream* fwd_cpu_stream = CreateStream(ctx, cpu_engine); #ifdef ENABLE_MKLDNN_V1 mkldnn::sum sum_op(sum_pd); std::unordered_map net_args = { diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 976f778424e..4a5cb0a0d4f 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -281,11 +281,19 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::shared_ptr fwd_stream) { DCHECK_EQ(in_data.size(), context_.data_mem.size()); for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.data_mem_shdptr[i]->set_data_handle( + static_cast(in_data[i].get_data_handle()), *fwd_stream); + } + context_.dst_mem->set_data_handle( + static_cast(dst_data.get_data_handle()), *fwd_stream); +#else context_.data_mem_shdptr[i]->set_data_handle( static_cast(in_data[i].get_data_handle())); } context_.dst_mem->set_data_handle( static_cast(dst_data.get_data_handle())); +#endif // ENABLE_MKLDNN_THREADPOOL for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) { context_.data_mem[i] = *context_.data_mem_shdptr[i]; @@ -788,11 +796,13 @@ class MklConcatOp : public OpKernel { dnn_shape_dst); DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL"; + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, cpu_engine)); + if (dnn_shape_dst.IsMklTensor()) dst_md = dnn_shape_dst.GetMklLayout(); dst.SetUsrMem(dst_md, dst_tensor); - std::shared_ptr fwd_cpu_stream; - fwd_cpu_stream.reset(CreateStream(context, cpu_engine)); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); #ifdef ENABLE_MKLDNN_V1 auto concat_op = concat(concat_pd); std::unordered_map net_args = { @@ -830,9 +840,10 @@ class MklConcatOp : public OpKernel { dst_md = dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md; - dst.SetUsrMem(dst_md, dst_tensor); std::shared_ptr fwd_cpu_stream; fwd_cpu_stream.reset(CreateStream(context, concat_fwd->GetEngine())); + dst.SetUsrMem(dst_md, dst_tensor); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); // Execute concat concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims, fwd_cpu_stream); diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc index 06570c1db1c..82d78250576 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op.cc @@ -75,6 +75,9 @@ class MklDequantizeOp : public OpKernel { MklDnnData src(&cpu_engine); MklDnnData dst(&cpu_engine); + std::shared_ptr reorder_stream; + reorder_stream.reset(CreateStream(ctx, cpu_engine)); + // If input is in MKL layout, then simply grab input layout; otherwise, // construct input TF layout. 
For TF layout, although input shape // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's @@ -85,6 +88,7 @@ class MklDequantizeOp : public OpKernel { : memory::desc(src_dims, MklDnnType(), MEMORY_FORMAT::nhwc); src.SetUsrMem(src_md, &src_tensor); + src.SetUsrMemDataHandle(&src_tensor, reorder_stream); Tensor* output_tensor = nullptr; MklDnnShape output_mkl_shape; @@ -129,6 +133,7 @@ class MklDequantizeOp : public OpKernel { AllocateOutputSetMklShape(ctx, 0, &output_tensor, output_tf_shape, output_mkl_shape); dst.SetUsrMem(dst_md, output_tensor); + dst.SetUsrMemDataHandle(output_tensor, reorder_stream); // The quantization logic here for mode SCALED is similar to the logic // in QuantizeAndDequantizeV2 and QuantizeAndDequantizeV3. @@ -155,8 +160,6 @@ class MklDequantizeOp : public OpKernel { // Also it does not define round_nearest (enum). attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest); #endif // !ENABLE_MKLDNN_V1 - std::shared_ptr reorder_stream; - reorder_stream.reset(CreateStream(ctx, cpu_engine)); std::vector net; // Create reorder primitive and then execute. diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index a11e7ebcbf5..3e512d0792b 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -137,6 +137,7 @@ class MklLRNOp : public OpKernel { // that input is in NHWC layout with Channel being the last dimension. src_dnn_data.SetUsrMem(src_md, &src_tensor); src_dnn_data.SetOpMemDesc(input_dims, MEMORY_FORMAT::nhwc); + src_dnn_data.SetUsrMemDataHandle(&src_tensor, fwd_stream_); // dst_dnn_data has the same shape as input. dst_dnn_data.SetUsrMem(src_md); @@ -157,7 +158,7 @@ class MklLRNOp : public OpKernel { &output_tensor); OP_REQUIRES_OK(context, context->status()); DCHECK(output_tensor != nullptr); - dst_dnn_data.SetUsrMemDataHandle(output_tensor); + dst_dnn_data.SetUsrMemDataHandle(output_tensor, fwd_stream_); // Handle workspace required for MKL-DNN. AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); @@ -393,6 +394,7 @@ class MklLRNGradOp : public OpKernel { orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); orig_input_dnn_data.SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc); + orig_input_dnn_data.SetUsrMemDataHandle(&orig_input_tensor, bwd_stream_); // output_dnn_data has the same shape as original input output_dnn_data.SetUsrMem(orig_input_md); @@ -421,7 +423,7 @@ class MklLRNGradOp : public OpKernel { orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); DCHECK(output_tensor != nullptr); - output_dnn_data.SetUsrMemDataHandle(output_tensor); + output_dnn_data.SetUsrMemDataHandle(output_tensor, bwd_stream_); // Create LRN primitive and add it to the net // At this point, workspace is enabled, so we don't need diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc index 77a68afa752..2e5c6d2719b 100644 --- a/tensorflow/core/kernels/mkl_transpose_op.cc +++ b/tensorflow/core/kernels/mkl_transpose_op.cc @@ -137,6 +137,7 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor, memory::dims out_strides = ReorderStrides(CalculateTFStrides(out_dims), perm); + std::shared_ptr transpose_stream; in.SetUsrMem(in_dims, in_strides, &in_tensor); // Output dimensions are same as input dimensions. We adjust the layout // using strides. 
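All of the threadpool changes in these MKL commits follow one pattern: when ENABLE_MKLDNN_THREADPOOL is defined, each memory object's data handle is set through the stream-aware overload, set_data_handle(ptr, *stream), and otherwise through the plain single-argument overload. The repeated "TODO: Create a common function and avoid the duplicate code" comments point at the obvious refactor, and the change to MklDnnData::SetUsrMemDataHandle in mkl_util.h later in this same commit does exactly that for user memory. Below is a minimal sketch of such a shared helper; the helper name and the include line are illustrative assumptions, while the set_data_handle overloads are the ones already called in the hunks above.

#include <memory>

#include "mkldnn.hpp"  // Assumed include; provides the mkldnn::memory and
                       // mkldnn::stream types used by these kernels.

namespace {

// Hypothetical shared helper (not part of this patch): sets a memory
// object's data handle, passing the execution stream only when the
// threadpool-enabled oneDNN build requires it.
inline void SetDataHandleMaybeWithStream(
    mkldnn::memory* mem, void* data,
    const std::shared_ptr<mkldnn::stream>& exec_stream) {
#ifdef ENABLE_MKLDNN_THREADPOOL
  mem->set_data_handle(data, *exec_stream);
#else
  mem->set_data_handle(data);
#endif  // ENABLE_MKLDNN_THREADPOOL
}

}  // namespace

With a helper like this, each Execute() body reduces to one call per memory object instead of carrying two parallel #ifdef branches.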
@@ -144,16 +145,16 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor, std::vector net; #ifdef ENABLE_MKLDNN_V1 - std::shared_ptr transpose_stream; auto* prim = FindOrCreateReorder(in.GetUsrMem(), out.GetUsrMem()); transpose_stream.reset(CreateStream(context, prim->GetEngine())); + in.SetUsrMemDataHandle(&in_tensor, transpose_stream); + out.SetUsrMemDataHandle(out_tensor, transpose_stream); net.push_back(*(prim->GetPrimitive())); std::vector net_args; net_args.push_back({{MKLDNN_ARG_FROM, *in.GetUsrMem()}, {MKLDNN_ARG_TO, *out.GetUsrMem()}}); execute_primitives(net, transpose_stream, net_args); #else - std::shared_ptr transpose_stream; transpose_stream.reset(new CPU_STREAM(cpu_engine)); net.push_back(FindOrCreateReorder(in.GetUsrMem(), out.GetUsrMem())); transpose_stream->submit(net).wait(); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 7f6272b09c1..996984eebc0 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1524,17 +1524,27 @@ class MklDnnData { } /// Set function for data buffer of user memory primitive. - inline void SetUsrMemDataHandle(void* data_buffer) { + inline void SetUsrMemDataHandle(void* data_buffer, + std::shared_ptr t_stream = nullptr) { CHECK_NOTNULL(user_memory_); CHECK_NOTNULL(data_buffer); +#ifdef ENABLE_MKLDNN_THREADPOOL + user_memory_->set_data_handle(data_buffer, *t_stream); +#else user_memory_->set_data_handle(data_buffer); +#endif // ENABLE_MKLDNN_THREADPOOL } /// Set function for data buffer of user memory primitive. - inline void SetUsrMemDataHandle(const Tensor* tensor) { + inline void SetUsrMemDataHandle(const Tensor* tensor, + std::shared_ptr t_stream = nullptr) { CHECK_NOTNULL(user_memory_); CHECK_NOTNULL(tensor); +#ifdef ENABLE_MKLDNN_THREADPOOL + user_memory_->set_data_handle(GetTensorBuffer(tensor), *t_stream); +#else user_memory_->set_data_handle(GetTensorBuffer(tensor)); +#endif // ENABLE_MKLDNN_THREADPOOL } /// allocate function for data buffer From df4ea0c1a5ffd82ea6b9159f864fd354dc19d8f7 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 15 Jun 2020 13:18:42 -0700 Subject: [PATCH 0198/1390] Move tracking_util_xla_test to keras/tests PiperOrigin-RevId: 316531126 Change-Id: I8d93adaca9c51bf85a6a938ce6a0dacc160a8a48 --- tensorflow/python/keras/tests/BUILD | 27 +++++++++++++++++++ .../tests/tracking_util_xla_test.py} | 0 tensorflow/python/training/tracking/BUILD | 26 ------------------ 3 files changed, 27 insertions(+), 26 deletions(-) rename tensorflow/python/{training/tracking/util_xla_test.py => keras/tests/tracking_util_xla_test.py} (100%) diff --git a/tensorflow/python/keras/tests/BUILD b/tensorflow/python/keras/tests/BUILD index d03b1bd1ee8..36af32184e6 100644 --- a/tensorflow/python/keras/tests/BUILD +++ b/tensorflow/python/keras/tests/BUILD @@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") +load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") package( default_visibility = [ @@ -466,6 +467,32 @@ tf_py_test( ], ) +tf_xla_py_test( + name = "tracking_util_xla_test", + srcs = ["tracking_util_xla_test.py"], + python_version = "PY3", + tags = [ + "no_pip", + "no_windows", + "nomac", + "notsan", # b/74395663 + ], + deps = [ + "//tensorflow/compiler/tests:xla_test", + "//tensorflow/python:checkpoint_management", + "//tensorflow/python:client_testlib", + 
"//tensorflow/python:constant_op", + "//tensorflow/python:framework_ops", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/keras:metrics", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + "//tensorflow/python/keras/optimizer_v2", + "//tensorflow/python/training/tracking", + "//tensorflow/python/training/tracking:util", + ], +) + py_library( name = "get_config_samples", srcs = ["get_config_samples.py"], diff --git a/tensorflow/python/training/tracking/util_xla_test.py b/tensorflow/python/keras/tests/tracking_util_xla_test.py similarity index 100% rename from tensorflow/python/training/tracking/util_xla_test.py rename to tensorflow/python/keras/tests/tracking_util_xla_test.py diff --git a/tensorflow/python/training/tracking/BUILD b/tensorflow/python/training/tracking/BUILD index 88dfd8eba55..ffc43964fb4 100644 --- a/tensorflow/python/training/tracking/BUILD +++ b/tensorflow/python/training/tracking/BUILD @@ -6,7 +6,6 @@ load( "//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark", ) -load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") package( default_visibility = [ @@ -191,31 +190,6 @@ tf_py_test( ], ) -tf_xla_py_test( - name = "util_xla_test", - srcs = ["util_xla_test.py"], - python_version = "PY3", - tags = [ - "no_pip", - "no_windows", - "nomac", - "notsan", # b/74395663 - ], - deps = [ - ":tracking", - ":util", - "//tensorflow/compiler/tests:xla_test", - "//tensorflow/python:checkpoint_management", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:framework_ops", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/keras:engine", - "//tensorflow/python/keras/layers", - "//tensorflow/python/keras/optimizer_v2", - ], -) - tf_py_test( name = "util_with_v1_optimizers_test", srcs = ["util_with_v1_optimizers_test.py"], From 52736a6adc5bf2d30512e0bdb627f048d64a6561 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 15 Jun 2020 13:30:30 -0700 Subject: [PATCH 0199/1390] Prevent Keras dataset loading from affecting the global RNG PiperOrigin-RevId: 316533425 Change-Id: I6099847f9a7ead24786fb2fecd5ba488f53456e6 --- tensorflow/python/keras/datasets/imdb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py index 61fbf92eaef..37403228edf 100644 --- a/tensorflow/python/keras/datasets/imdb.py +++ b/tensorflow/python/keras/datasets/imdb.py @@ -113,14 +113,14 @@ def load_data(path='imdb.npz', x_train, labels_train = f['x_train'], f['y_train'] x_test, labels_test = f['x_test'], f['y_test'] - np.random.seed(seed) + rng = np.random.RandomState(seed) indices = np.arange(len(x_train)) - np.random.shuffle(indices) + rng.shuffle(indices) x_train = x_train[indices] labels_train = labels_train[indices] indices = np.arange(len(x_test)) - np.random.shuffle(indices) + rng.shuffle(indices) x_test = x_test[indices] labels_test = labels_test[indices] From a8950d70bfe0405fa405127cf5fd824a7a778aac Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 15 Jun 2020 13:49:39 -0700 Subject: [PATCH 0200/1390] [tfdbg2] Add tfdbg_run_id to metadata of data dumps - A data dump file set generated by tfdbg2 can contain multiple subsets when there are multiple hosts involved in the instrumented TensorFlow job (e.g., TPUs and Parameter Servers). Currently, there is no bit in those subset of files that indicates they belong to the same instrumented TF job. 
- This CL addresses this problem by adding a field to the metadata proto used by those files (`tfdbg_run_id`) - The DebugEventsWriter code is revised, so that this new field is written to the metadata file of the file set on the writer's construction. - Also in this CL: remove the previous 1-arg `GetDebugEventsWriter(dump_root)` that creates the writer object if it doesn't exist at the specified dump_root. Replace it with `LookUpDebugEventsWriter(dump_root)` that only looks up the writer object and returns a non-OK status if such an object hasn't been created at `dump_root`. This makes the code less error prone by keeping only the fully-explicit, 3-arg `GetDebugEventsWriter()`. PiperOrigin-RevId: 316537044 Change-Id: Id5be0b771fbf37c0fc796f1514ed858a0e6d38f0 --- tensorflow/core/kernels/debug_ops.h | 11 +- tensorflow/core/ops/debug_ops.cc | 1 + tensorflow/core/protobuf/debug_event.proto | 6 + tensorflow/core/util/debug_events_writer.cc | 23 +++- tensorflow/core/util/debug_events_writer.h | 33 ++++-- .../core/util/debug_events_writer_test.cc | 104 ++++++++++-------- .../client/debug_events_writer_wrapper.cc | 59 +++++----- .../python/debug/lib/debug_events_reader.py | 5 + .../python/debug/lib/debug_events_writer.py | 6 +- .../debug/lib/debug_events_writer_test.py | 39 +++++-- .../python/debug/lib/debug_v2_ops_test.py | 7 +- .../python/debug/lib/dumping_callback.py | 15 ++- .../debug/lib/dumping_callback_test_lib.py | 2 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 15 files changed, 203 insertions(+), 112 deletions(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 3fef822244d..498cd6146a8 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -410,7 +410,8 @@ class DebugIdentityV2Op : public OpKernel { : OpKernel(context), device_name_(context->device()->name()), output_slot_(-1), - tensor_debug_mode_(0) { + tensor_debug_mode_(0), + tfdbg_run_id_() { std::vector debug_urls; OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls)); for (const string& debug_url : debug_urls) { @@ -435,14 +436,17 @@ class DebugIdentityV2Op : public OpKernel { circular_buffer_size_ = tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize; } + if (context->HasAttr("tfdbg_run_id")) { + OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_)); + } } void Compute(OpKernelContext* context) override { const Tensor& tensor = context->input(0); for (const string& dump_root : dump_roots_) { tfdbg::DebugEventsWriter* debug_events_writer = - tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root, - circular_buffer_size_); + tfdbg::DebugEventsWriter::GetDebugEventsWriter( + dump_root, tfdbg_run_id_, circular_buffer_size_); OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( tfdbg_context_id_, device_name_, op_name_, output_slot_, tensor_debug_mode_, tensor)); @@ -458,6 +462,7 @@ class DebugIdentityV2Op : public OpKernel { int32 output_slot_; int32 tensor_debug_mode_; int64 circular_buffer_size_; + string tfdbg_run_id_; }; typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc index 0ecc58a6a8f..ac67a0f75f3 100644 --- a/tensorflow/core/ops/debug_ops.cc +++ b/tensorflow/core/ops/debug_ops.cc @@ -91,6 +91,7 @@ REGISTER_OP("DebugIdentityV2") .Attr("tensor_debug_mode: int = -1") .Attr("debug_urls: list(string) = []") .Attr("circular_buffer_size: int = 1000") + 
.Attr("tfdbg_run_id: string = ''") .SetIsStateful() .SetShapeFn(shape_inference::UnchangedShape); diff --git a/tensorflow/core/protobuf/debug_event.proto b/tensorflow/core/protobuf/debug_event.proto index 005abe53194..5541c397fb8 100644 --- a/tensorflow/core/protobuf/debug_event.proto +++ b/tensorflow/core/protobuf/debug_event.proto @@ -115,6 +115,12 @@ message DebugMetadata { // Version of the DebugEvent file format. // Has a format of "debug.Event:", e.g., "debug.Event:1". string file_version = 2; + + // A unique ID for the current run of tfdbg. + // A run of tfdbg is defined as a TensorFlow job instrumented by tfdbg. + // Multiple hosts in a distributed TensorFlow job instrumented by tfdbg + // have the same ID. + string tfdbg_run_id = 3; } // Content of a source file involved in the execution of the debugged TensorFlow diff --git a/tensorflow/core/util/debug_events_writer.cc b/tensorflow/core/util/debug_events_writer.cc index d9c3393ce3c..8ee42959131 100644 --- a/tensorflow/core/util/debug_events_writer.cc +++ b/tensorflow/core/util/debug_events_writer.cc @@ -122,23 +122,31 @@ DebugEventsWriter::~DebugEventsWriter() { Close().IgnoreError(); } // static DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( - const string& dump_root, int64 circular_buffer_size) { + const string& dump_root, const string& tfdbg_run_id, + int64 circular_buffer_size) { mutex_lock l(DebugEventsWriter::factory_mu_); std::unordered_map>* writer_pool = DebugEventsWriter::GetDebugEventsWriterMap(); if (writer_pool->find(dump_root) == writer_pool->end()) { std::unique_ptr writer( - new DebugEventsWriter(dump_root, circular_buffer_size)); + new DebugEventsWriter(dump_root, tfdbg_run_id, circular_buffer_size)); writer_pool->insert(std::make_pair(dump_root, std::move(writer))); } return (*writer_pool)[dump_root].get(); } // static -DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( - const string& dump_root) { - return DebugEventsWriter::GetDebugEventsWriter(dump_root, - kDefaultCyclicBufferSize); +Status DebugEventsWriter::LookUpDebugEventsWriter( + const string& dump_root, DebugEventsWriter** debug_events_writer) { + mutex_lock l(DebugEventsWriter::factory_mu_); + std::unordered_map>* writer_pool = + DebugEventsWriter::GetDebugEventsWriterMap(); + if (writer_pool->find(dump_root) == writer_pool->end()) { + return errors::FailedPrecondition( + "No DebugEventsWriter has been created at dump root ", dump_root); + } + *debug_events_writer = (*writer_pool)[dump_root].get(); + return Status::OK(); } Status DebugEventsWriter::Init() { @@ -179,6 +187,7 @@ Status DebugEventsWriter::Init() { metadata->set_tensorflow_version(TF_VERSION_STRING); metadata->set_file_version( strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion)); + metadata->set_tfdbg_run_id(tfdbg_run_id_); TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA)); TF_RETURN_WITH_CONTEXT_IF_ERROR( metadata_writer_->Flush(), "Failed to flush debug event metadata writer"); @@ -457,9 +466,11 @@ DebugEventsWriter::GetDebugEventsWriterMap() { } DebugEventsWriter::DebugEventsWriter(const string& dump_root, + const string& tfdbg_run_id, int64 circular_buffer_size) : env_(Env::Default()), dump_root_(dump_root), + tfdbg_run_id_(tfdbg_run_id), is_initialized_(false), initialization_mu_(), circular_buffer_size_(circular_buffer_size), diff --git a/tensorflow/core/util/debug_events_writer.h b/tensorflow/core/util/debug_events_writer.h index 39835adf1a6..412f947e22d 100644 --- a/tensorflow/core/util/debug_events_writer.h +++ 
b/tensorflow/core/util/debug_events_writer.h @@ -93,18 +93,27 @@ class DebugEventsWriter { // sets of six. The singleton pattern avoids storing multiple sets in a single // folder, which might cause confusion. // + // If an instance of DebugEventsWriter has already been created at a + // `dump_root`, calling this method with the same `dump_root` will return + // the existing instance. + // // Args: // dump_root: Dump root directory. If it doesn't exist, will be created. + // tfdbg_run_id: Debugging run ID of the writer. // circular_buffer_size: Circular buffer size (in number of DebugEvent // protos). If set to a value <=0, will abolish the circular-buffer // behavior. // Returns: // A pointer to a DebugEventsWriter object: a per-dump_root singleton. static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root, + const string& tfdbg_run_id, int64 circular_buffer_size); - // Same as the 2-arg factory method above, but uses the default circular - // buffer size. - static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root); + // Look up existing events writer by dump_root. + // If no DebugEventsWriter has been created at the dump_root, a non-OK + // Status will be returned. Else an OK status will be returned, with + // the pointer to the existing instance provided by reference. + static Status LookUpDebugEventsWriter( + const string& dump_root, DebugEventsWriter** debug_events_writer); ~DebugEventsWriter(); // Sets the debug event filenames and opens file for writing. @@ -116,8 +125,8 @@ class DebugEventsWriter { // deleted by another process), this will open a new file. Status Init(); - // The four DebugEvent fields below are written _without_ the circular buffer. - // Source file contents are written to the *.source_files file. + // The four DebugEvent fields below are written _without_ the circular + // buffer. Source file contents are written to the *.source_files file. // Takes ownership of source_file. Status WriteSourceFile(SourceFile* source_file); // Stack frames are written to the *.code_locations file. @@ -132,9 +141,8 @@ class DebugEventsWriter { // The two DebugEvent fields below are written to the circular buffer // and saved to disk only at the FlushExecutionFiles() call. - // Execution events (eager execution of an op or a tf.function) are written to - // the *.execution file. - // Takes ownership of execution. + // Execution events (eager execution of an op or a tf.function) are written + // to the *.execution file. Takes ownership of execution. Status WriteExecution(Execution* execution); // Graph execution traces (graph-internal tensor values or their summaries) // are written to the *.graph_execution_traces file. @@ -151,8 +159,9 @@ class DebugEventsWriter { // which the trace concerns multiple tensors, this is an empty string. // output_slot: Output slot index of the op that this trace is concerned // with. - // tensor_debug_mode: An integer that represents the tensor-debug mode enum. - // tensor_value: The value of the tensor that describes the tensor(s) + // tensor_debug_mode: An integer that represents the tensor-debug mode + // enum. tensor_value: The value of the tensor that describes the + // tensor(s) // that this trace is concerned with. The semantics of this tensor value // depends on the value of `tensor_debug_mode`. Status WriteGraphExecutionTrace(const string& tfdbg_context_id, @@ -208,7 +217,8 @@ class DebugEventsWriter { // Guards calls to the GetDebugEventsWriter() method. 
static mutex factory_mu_; - DebugEventsWriter(const string& dump_root, int64 circular_buffer_size); + DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id, + int64 circular_buffer_size); // Get the path prefix. The same for all files, which differ only in the // suffix. @@ -227,6 +237,7 @@ class DebugEventsWriter { Env* env_; const string dump_root_; + const string tfdbg_run_id_; string file_prefix_; bool is_initialized_ TF_GUARDED_BY(initialization_mu_); diff --git a/tensorflow/core/util/debug_events_writer_test.cc b/tensorflow/core/util/debug_events_writer_test.cc index bd0c731bc90..45895763673 100644 --- a/tensorflow/core/util/debug_events_writer_test.cc +++ b/tensorflow/core/util/debug_events_writer_test.cc @@ -71,6 +71,7 @@ class DebugEventsWriterTest : public ::testing::Test { dump_root_ = io::JoinPath( testing::TmpDir(), strings::Printf("%010lld", static_cast(env()->NowMicros()))); + tfdbg_run_id_ = "test_tfdbg_run_id"; } void TearDown() override { @@ -85,14 +86,15 @@ class DebugEventsWriterTest : public ::testing::Test { } string dump_root_; + string tfdbg_run_id_; }; TEST_F(DebugEventsWriterTest, GetDebugEventsWriterSameRootGivesSameObject) { // Test the per-dump_root_ singleton pattern. - DebugEventsWriter* writer_1 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); - DebugEventsWriter* writer_2 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); + DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); EXPECT_EQ(writer_1, writer_2); } @@ -103,8 +105,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterSameDumpRoot) { std::vector writers; mutex mu; auto fn = [this, &writers, &mu]() { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); { mutex_lock l(mu); writers.push_back(writer); @@ -131,8 +133,9 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) { auto fn = [this, &counter, &writers, &mu]() { const string new_dump_root = io::JoinPath(dump_root_, strings::Printf("%ld", counter.fetch_add(1))); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(new_dump_root); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + new_dump_root, tfdbg_run_id_, + DebugEventsWriter::kDefaultCyclicBufferSize); { mutex_lock l(mu); writers.push_back(writer); @@ -151,17 +154,17 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) { TEST_F(DebugEventsWriterTest, GetDebugEventsWriterDifferentRoots) { // Test the DebugEventsWriters for different directories are different. 
- DebugEventsWriter* writer_1 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); const string dump_root_2 = io::JoinPath(dump_root_, "subdirectory"); - DebugEventsWriter* writer_2 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_2); + DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_2, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); EXPECT_NE(writer_1, writer_2); } TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Close()); @@ -174,6 +177,8 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { const string file_version = actuals[0].debug_metadata().file_version(); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); + // Check the tfdbg run ID. + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); // Verify that the .source_files file has been created and is empty. ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); @@ -182,22 +187,22 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { } TEST_F(DebugEventsWriterTest, CallingCloseWithoutInitIsOkay) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Close()); } TEST_F(DebugEventsWriterTest, CallingCloseTwiceIsOkay) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close()); } TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { // Test that concurrent calls to Init() works correctly. - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); thread::ThreadPool* thread_pool = new thread::ThreadPool(Env::Default(), "test_pool", 4); @@ -218,6 +223,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { const string file_version = actuals[0].debug_metadata().file_version(); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); // Verify that the .source_files file has been created and is empty. ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); @@ -227,14 +233,15 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) { // Test that Init() is idempotent. 
- DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); std::vector actuals; ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); EXPECT_EQ(actuals.size(), 1); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); string metadata_path_1 = @@ -248,12 +255,13 @@ TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) { ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); EXPECT_EQ(actuals.size(), 1); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); } TEST_F(DebugEventsWriterTest, WriteSourceFile) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); SourceFile* source_file_1 = new SourceFile(); @@ -313,8 +321,8 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { } TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); StackFrameWithId* stack_frame_1 = new StackFrameWithId(); @@ -375,8 +383,8 @@ TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { } TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); GraphOpCreation* graph_op_creation = new GraphOpCreation(); @@ -415,8 +423,8 @@ TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { const size_t kConcurrentWrites = 100; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -456,8 +464,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { const size_t kConcurrentWrites = 100; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -498,8 +506,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { const int32 kConcurrentWrites = 30; - DebugEventsWriter* writer = - 
DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -576,8 +584,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { // Verify that no writing to disk happens until the flushing method is called. const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -601,8 +609,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { // Verify that writing to disk happens when the flushing method is called. const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -673,8 +681,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { // Check no writing to disk happens before the flushing method is called. const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -697,8 +705,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { const size_t kCyclicBufferSize = -1; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); // NOTE(cais): `writer->Init()` is not called here before // WriteGraphExecutionTrace() is called. 
This test checks that this is okay // and the `GraphExecutionTrace` gets written correctly even without `Init()` @@ -722,8 +730,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -788,8 +796,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { } TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // Register and get some device IDs in a concurrent fashion. @@ -833,8 +841,8 @@ TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) { TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { const size_t kCyclicBufferSize = 0; // A value <= 0 disables cyclic behavior. - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); const size_t kNumEvents = 20; diff --git a/tensorflow/python/client/debug_events_writer_wrapper.cc b/tensorflow/python/client/debug_events_writer_wrapper.cc index a786c6f2db6..15802df40fe 100644 --- a/tensorflow/python/client/debug_events_writer_wrapper.cc +++ b/tensorflow/python/client/debug_events_writer_wrapper.cc @@ -29,9 +29,10 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { using namespace tensorflow::tfdbg; // NOLINT(build/namespaces) m.def("Init", - [](const std::string& dump_root, const int64 circular_buffer_size) { + [](const std::string& dump_root, const std::string& tfdbg_run_id, + const int64 circular_buffer_size) { DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( - dump_root, circular_buffer_size); + dump_root, tfdbg_run_id, circular_buffer_size); if (!writer->Init().ok()) { throw py::value_error(tensorflow::strings::Printf( "Failed to initialize debug events writer at: %s", @@ -41,8 +42,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteSourceFile", [](const std::string& dump_root, const py::object obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::SOURCE_FILES); @@ -50,8 +52,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteStackFrameWithId", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( 
obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::STACK_FRAMES); @@ -59,8 +62,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteGraphOpCreation", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPHS); @@ -68,8 +72,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteDebuggedGraph", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPHS); @@ -77,8 +82,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteExecution", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::EXECUTION); @@ -86,31 +92,32 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteGraphExecutionTrace", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPH_EXECUTION_TRACES); }); - m.def("RegisterDeviceAndGetId", - [](const std::string& dump_root, const std::string& device_name) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); - return writer->RegisterDeviceAndGetId(device_name); - }); + m.def("RegisterDeviceAndGetId", [](const std::string& dump_root, + const std::string& device_name) { + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); + return writer->RegisterDeviceAndGetId(device_name); + }); m.def("FlushNonExecutionFiles", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->FlushNonExecutionFiles(); }); m.def("FlushExecutionFiles", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->FlushExecutionFiles(); }); m.def("Close", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + 
TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->Close(); }); }; diff --git a/tensorflow/python/debug/lib/debug_events_reader.py b/tensorflow/python/debug/lib/debug_events_reader.py index 4adb97de25b..743cea7103a 100644 --- a/tensorflow/python/debug/lib/debug_events_reader.py +++ b/tensorflow/python/debug/lib/debug_events_reader.py @@ -863,6 +863,7 @@ class DebugDataReader(object): debug_event = next(metadata_iter).debug_event self._starting_wall_time = debug_event.wall_time self._tensorflow_version = debug_event.debug_metadata.tensorflow_version + self._tfdbg_run_id = debug_event.debug_metadata.tfdbg_run_id def _load_source_files(self): """Incrementally read the .source_files DebugEvent file.""" @@ -1071,6 +1072,10 @@ class DebugDataReader(object): """ return self._tensorflow_version + def tfdbg_run_id(self): + """Get the debugger run ID of the debugged TensorFlow program.""" + return self._tfdbg_run_id + def outermost_graphs(self): """Get the number of outer most graphs read so far.""" return [graph for graph in self._graph_by_id.values() diff --git a/tensorflow/python/debug/lib/debug_events_writer.py b/tensorflow/python/debug/lib/debug_events_writer.py index 3de0ab78b8a..f223abdd099 100644 --- a/tensorflow/python/debug/lib/debug_events_writer.py +++ b/tensorflow/python/debug/lib/debug_events_writer.py @@ -32,6 +32,7 @@ class DebugEventsWriter(object): def __init__(self, dump_root, + tfdbg_run_id, circular_buffer_size=DEFAULT_CIRCULAR_BUFFER_SIZE): """Construct a DebugEventsWriter object. @@ -43,6 +44,7 @@ class DebugEventsWriter(object): Args: dump_root: The root directory for dumping debug data. If `dump_root` does not exist as a directory, it will be created. + tfdbg_run_id: Debugger Run ID. circular_buffer_size: Size of the circular buffer for each of the two execution-related debug events files: with the following suffixes: - .execution - .graph_execution_traces If <= 0, the circular-buffer @@ -51,7 +53,9 @@ class DebugEventsWriter(object): if not dump_root: raise ValueError("Empty or None dump root") self._dump_root = dump_root - _pywrap_debug_events_writer.Init(self._dump_root, circular_buffer_size) + self._tfdbg_run_id = tfdbg_run_id + _pywrap_debug_events_writer.Init(self._dump_root, self._tfdbg_run_id, + circular_buffer_size) def WriteSourceFile(self, source_file): """Write a SourceFile proto with the writer. 
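Taken together, the reader and writer changes above round-trip the new run ID through the `.metadata` file: the Python constructor forwards `tfdbg_run_id` to the pywrap `Init()` binding, the C++ writer stamps it into the `DebugMetadata` proto, and `DebugDataReader.tfdbg_run_id()` reads it back. A minimal sketch of that flow (illustrative only, not part of this CL; it assumes a fresh dump root and mirrors the `_get_tfdbg_run_id()` helper added to dumping_callback.py further below):

import tempfile
import uuid

from tensorflow.python.debug.lib import debug_events_reader
from tensorflow.python.debug.lib import debug_events_writer

dump_root = tempfile.mkdtemp()        # fresh dump root, as in the test base class
tfdbg_run_id = str(uuid.uuid4())[:8]  # short run ID, as in _get_tfdbg_run_id()

# The constructor forwards the run ID to the pywrap Init() binding, which
# creates the per-dump-root C++ writer and writes the .metadata file.
writer = debug_events_writer.DebugEventsWriter(dump_root, tfdbg_run_id)
writer.Close()

# The reader surfaces the same ID via the new tfdbg_run_id() accessor.
with debug_events_reader.DebugDataReader(dump_root) as reader:
  print(reader.tfdbg_run_id() == tfdbg_run_id)  # expected: True
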
diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py index 57721c1450f..7b06bf772be 100644 --- a/tensorflow/python/debug/lib/debug_events_writer_test.py +++ b/tensorflow/python/debug/lib/debug_events_writer_test.py @@ -41,7 +41,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testMultiThreadedConstructorCallWorks(self): def init_writer(): - debug_events_writer.DebugEventsWriter(self.dump_root) + debug_events_writer.DebugEventsWriter(self.dump_root, self.tfdbg_run_id) num_threads = 4 threads = [] @@ -66,7 +66,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self._readAndCheckMetadataFile() def testWriteSourceFilesAndStackFrames(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_protos = 10 for i in range(num_protos): source_file = debug_event_pb2.SourceFile() @@ -99,7 +100,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertEqual(actuals[i].file_line_col.file_index, i * 10) def testWriteGraphOpCreationAndDebuggedGraphs(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_op_creations = 10 for i in range(num_op_creations): graph_op_creation = debug_event_pb2.GraphOpCreation() @@ -122,7 +124,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, "deadbeaf") def testConcurrentWritesToNonExecutionFilesWorks(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) source_file_state = {"counter": 0, "lock": threading.Lock()} @@ -201,15 +204,18 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteAndReadMetadata(self): t0 = time.time() - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) writer.Close() with debug_events_reader.DebugDataReader(self.dump_root) as reader: self.assertIsInstance(reader.starting_wall_time(), float) self.assertGreaterEqual(reader.starting_wall_time(), t0) self.assertEqual(reader.tensorflow_version(), versions.__version__) + self.assertTrue(reader.tfdbg_run_id()) def testWriteExecutionEventsWithCircularBuffer(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): execution = debug_event_pb2.Execution() @@ -232,7 +238,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteExecutionEventsWithoutCircularBufferBehavior(self): # A circular buffer size of 0 abolishes the circular buffer behavior. 
- writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, 0) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): execution = debug_event_pb2.Execution() @@ -248,7 +255,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertEqual(execution.op_type, "OpType%d" % i) def testWriteGraphExecutionTraceEventsWithCircularBuffer(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): trace = debug_event_pb2.GraphExecutionTrace() @@ -272,7 +280,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteGraphExecutionTraceEventsWithoutCircularBufferBehavior(self): # A circular buffer size of 0 abolishes the circular buffer behavior. - writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, 0) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): trace = debug_event_pb2.GraphExecutionTrace() @@ -290,6 +299,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentWritesToExecutionFiles(self): circular_buffer_size = 5 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ -345,7 +355,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertLen(op_names, len(set(op_names))) def testConcurrentSourceFileRandomReads(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) for i in range(100): source_file = debug_event_pb2.SourceFile( @@ -376,6 +387,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentExecutionUpdateAndRandomRead(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) writer_state = {"counter": 0, "done": False} @@ -410,6 +422,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentExecutionRandomReads(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) for i in range(100): @@ -445,6 +458,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ -487,6 +501,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentGraphExecutionTraceRandomReads(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ 
-534,7 +549,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testRangeReadingExecutions(self, begin, end, expected_begin, expected_end): writer = debug_events_writer.DebugEventsWriter( - self.dump_root, circular_buffer_size=-1) + self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1) for i in range(5): execution = debug_event_pb2.Execution(op_type="OpType%d" % i) writer.WriteExecution(execution) @@ -559,7 +574,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin, expected_end): writer = debug_events_writer.DebugEventsWriter( - self.dump_root, circular_buffer_size=-1) + self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1) debugged_graph = debug_event_pb2.DebuggedGraph( graph_id="graph1", graph_name="graph1") writer.WriteDebuggedGraph(debugged_graph) diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index 10de01f4f2e..d715869f359 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -52,8 +52,9 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): super(DebugIdentityV2OpTest, self).setUp() # Testing using a small circular-buffer size. self.circular_buffer_size = 4 + self.tfdbg_run_id = "test_tfdbg_run" self.writer = debug_events_writer.DebugEventsWriter( - self.dump_root, self.circular_buffer_size) + self.dump_root, self.tfdbg_run_id, self.circular_buffer_size) def tearDown(self): self.writer.Close() @@ -192,7 +193,8 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): def testTwoDumpRoots(self): another_dump_root = os.path.join(self.dump_root, "another") another_debug_url = "file://%s" % another_dump_root - another_writer = debug_events_writer.DebugEventsWriter(another_dump_root) + another_writer = debug_events_writer.DebugEventsWriter( + another_dump_root, "test_tfdbg_run") @def_function.function def write_debug_trace(x): @@ -264,6 +266,7 @@ class DebugIdentityV2OpUninitializedWriterTest( self.assertAllClose( write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0]) writer = debug_events_writer.DebugEventsWriter(self.dump_root, + "test_tfdbg_run", circular_buffer_size) writer.FlushNonExecutionFiles() writer.FlushExecutionFiles() diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py index 0f5836e0644..563b52f8f63 100644 --- a/tensorflow/python/debug/lib/dumping_callback.py +++ b/tensorflow/python/debug/lib/dumping_callback.py @@ -69,6 +69,10 @@ def _debug_identity_v2_grad(op, dy): return dy +def _get_tfdbg_run_id(): + return str(uuid.uuid4())[:8] + + def _get_id(): """Get a short unique ID.""" return str(uuid.uuid4()) @@ -88,6 +92,7 @@ class _DumpingCallback(object): op_regex, tensor_dtypes): self._dump_root = dump_root + self._tfdbg_run_id = _get_tfdbg_run_id() self._tensor_debug_mode = tensor_debug_mode self._circular_buffer_size = circular_buffer_size self._op_regex = op_regex @@ -148,6 +153,10 @@ class _DumpingCallback(object): self._dump_root = dump_root self._writer = None + @property + def tfdbg_run_id(self): + return self._tfdbg_run_id + @property def tensor_debug_mode(self): return self._tensor_debug_mode @@ -161,6 +170,7 @@ class _DumpingCallback(object): if not self._writer: self._writer = debug_events_writer.DebugEventsWriter( self._dump_root, + self._tfdbg_run_id, 
circular_buffer_size=self._circular_buffer_size) return self._writer @@ -365,6 +375,8 @@ class _DumpingCallback(object): if tf_compat.forward_compatible(2020, 6, 24): debug_identity_op_kwargs[ "circular_buffer_size"] = self._circular_buffer_size + if tf_compat.forward_compatible(2020, 7, 1): + debug_identity_op_kwargs["tfdbg_run_id"] = self._tfdbg_run_id if tensor_debug_mode == debug_event_pb2.TensorDebugMode.NO_TENSOR: if (not self._should_dump_tensor(op_type, tensor.dtype) or not tensor.dtype.is_numpy_compatible): @@ -873,7 +885,8 @@ def disable_dump_debug_info(): """ if hasattr(_state, "dumping_callback"): dump_root = _state.dumping_callback.dump_root - debug_events_writer.DebugEventsWriter(dump_root).Close() + tfdbg_run_id = _state.dumping_callback.tfdbg_run_id + debug_events_writer.DebugEventsWriter(dump_root, tfdbg_run_id).Close() op_callbacks.remove_op_callback(_state.dumping_callback.callback) function_lib.remove_function_callback( _state.dumping_callback.function_callback) diff --git a/tensorflow/python/debug/lib/dumping_callback_test_lib.py b/tensorflow/python/debug/lib/dumping_callback_test_lib.py index 164644c57fa..05bf3aeb6da 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test_lib.py +++ b/tensorflow/python/debug/lib/dumping_callback_test_lib.py @@ -21,6 +21,7 @@ from __future__ import print_function import os import shutil import tempfile +import uuid from tensorflow.python.debug.lib import check_numerics_callback from tensorflow.python.debug.lib import debug_events_reader @@ -35,6 +36,7 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): def setUp(self): super(DumpingCallbackTestBase, self).setUp() self.dump_root = tempfile.mkdtemp() + self.tfdbg_run_id = str(uuid.uuid4()) def tearDown(self): if os.path.isdir(self.dump_root): diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 75ddf32cbe0..54d15b601c5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -982,7 +982,7 @@ tf_module { } member_method { name: "DebugIdentityV2" - argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " + argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], " } member_method { name: "DebugNanCount" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 75ddf32cbe0..54d15b601c5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -982,7 +982,7 @@ tf_module { } member_method { name: "DebugIdentityV2" - argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " + argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, 
defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], " } member_method { name: "DebugNanCount" From 7d76bc4b60e020bbfd1339923ea5a7c3ab007217 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 15 Jun 2020 13:56:55 -0700 Subject: [PATCH 0201/1390] Fix sparse kerastensors to maintain dense shape information after converting to a placeholder. PiperOrigin-RevId: 316538468 Change-Id: I8e53a7e96067a8b7edd3f57cd8a8a89eb912824b --- .../python/keras/engine/keras_tensor.py | 19 +++++++++++++++---- .../utils/composite_tensor_support_test.py | 3 +-- tensorflow/python/ops/array_ops.py | 1 + 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/engine/keras_tensor.py b/tensorflow/python/keras/engine/keras_tensor.py index 4ea01da8db2..c5c0068c652 100644 --- a/tensorflow/python/keras/engine/keras_tensor.py +++ b/tensorflow/python/keras/engine/keras_tensor.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import type_spec as type_spec_module from tensorflow.python.ops import array_ops from tensorflow.python.util import nest @@ -210,10 +211,20 @@ class _KerasTensorIterator(object): def keras_tensor_to_placeholder(x): """TODO(kaftan): Docstring.""" if isinstance(x, KerasTensor): - def tensor_spec_to_placeholder(tensorspec): - return array_ops.placeholder(tensorspec.dtype, tensorspec.shape) - ph = nest.map_structure(tensor_spec_to_placeholder, x.type_spec, - expand_composites=True) + spec = x.type_spec + if isinstance(spec, sparse_tensor.SparseTensorSpec): + # nest.map_structure loses dense shape information for sparse tensors. + # So, we special-case sparse placeholder creation. + # This only preserves shape information for top-level sparse tensors; + # not for sparse tensors that are nested inside another composite + # tensor. 
+ return array_ops.sparse_placeholder(dtype=spec.dtype, shape=spec.shape) + + def component_to_placeholder(component): + return array_ops.placeholder(component.dtype, component.shape) + + ph = nest.map_structure( + component_to_placeholder, spec, expand_composites=True) return ph else: return x diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py index f31558ddba8..daba188414a 100644 --- a/tensorflow/python/keras/utils/composite_tensor_support_test.py +++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py @@ -603,8 +603,7 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, @keras_parameterized.run_with_all_model_types() -@keras_parameterized.run_all_keras_modes(always_skip_v1=True, - skip_keras_tensors=True) +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class CompositeTensorModelPredictTest(keras_parameterized.TestCase): def _normalize_shape(self, shape): diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c77977bf7d2..1c00b81c9ca 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3184,6 +3184,7 @@ def sparse_placeholder(dtype, shape=None, name=None): # `SparseTensor` dense_shape_default = tensor_shape.TensorShape( tuple(None if dim == -1 else dim for dim in shape)) + shape = tuple(tensor_shape.dimension_value(dim) for dim in shape) shape = tuple(-1 if dim is None else dim for dim in shape) shape = ops.convert_to_tensor( shape, dtype=dtypes.int64, name=default_shape_name) From 0e40b3e0c30caff9427c1da54c40b6236608ec15 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 15 Jun 2020 14:05:30 -0700 Subject: [PATCH 0202/1390] Exclude dependencies on FP16 XNNPACK micro-kernels PiperOrigin-RevId: 316540175 Change-Id: Id02758822c004d52181eb1d317ba13e94df77f49 --- tensorflow/lite/delegates/xnnpack/BUILD | 2 +- tensorflow/workspace.bzl | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index c7ff1f55a49..efbaf0cfc42 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -26,7 +26,7 @@ cc_library( "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/tools/optimize/sparsity:format_converter", "@FP16", - "@XNNPACK", + "@XNNPACK//:xnnpack_f32", ], ) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f44c8dea6a2..6b0143e397f 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "7469a0a634bfa90395ed311d07a21b1d0003604b37b12745bad1cf17860984e1", - strip_prefix = "XNNPACK-a059b7da184954fb6c01db0e7959352ee805e9f3", + sha256 = "bd5fd63a09222cd092f0c058b576cf044fb4074f2c4ce8a6fc32fc43d155f9c7", + strip_prefix = "XNNPACK-ae046f5a5127084bfe41090afdf1c1d4c9874b77", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/a059b7da184954fb6c01db0e7959352ee805e9f3.zip", - "https://github.com/google/XNNPACK/archive/a059b7da184954fb6c01db0e7959352ee805e9f3.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/ae046f5a5127084bfe41090afdf1c1d4c9874b77.zip", + "https://github.com/google/XNNPACK/archive/ae046f5a5127084bfe41090afdf1c1d4c9874b77.zip", ], ) From fe6201c57b7fd78e344b4e5ee7fe6c7f7151b08a Mon Sep 
17 00:00:00 2001 From: Jinliang Wei Date: Mon, 15 Jun 2020 14:08:55 -0700 Subject: [PATCH 0203/1390] Fix a bug in data flow analysis for asynchronous collective-permute. PiperOrigin-RevId: 316540956 Change-Id: Icb1fb9d1d445d5aa3cf7afa580eed06607c4ecb3 --- .../compiler/xla/service/hlo_dataflow_analysis.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index d0d533e0b06..f19882c9347 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -698,7 +698,7 @@ bool HloDataflowAnalysis::UpdateCollectivePermuteDoneValueSet( CHECK_EQ(collective_permute_done->opcode(), HloOpcode::kCollectivePermuteDone); bool changed = false; - // CollectivePermuteDone forwards the operand value at {0} to its output. + // CollectivePermuteDone forwards the operand value at {1} to its output. const HloValueSet& operand_value_set = GetValueSet(collective_permute_done->operand(0), {1}); HloValueSet& value_set = GetValueSet(collective_permute_done); @@ -945,6 +945,17 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { // CopyDone consumes a tuple produced by CopyStart and produces an // element. Its output aliases its input tuple element {0}. break; + case HloOpcode::kCollectivePermuteStart: + // CollectivePermuteStart produces a tuple of + // {aliased operand, destination buffer, U32 context, U32 context}. + define_value_at(/*index=*/{}); + define_value_at(/*index=*/{1}); + define_value_at(/*index=*/{2}); + define_value_at(/*index=*/{3}); + break; + case HloOpcode::kCollectivePermuteDone: + // CollectivePermuteDone's output aliases its input tuple element {1}. + break; case HloOpcode::kRecvDone: // RecvDone produces a two-element tuple. Element zero aliases its // input tuple element {0}; element one is a token. 
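As a reading aid for the dataflow fix above, the tuple layout can be summarized schematically (shapes and HLO syntax abbreviated; this sketch is not taken from the patch):

  %start = (f32[8], f32[8], u32[], u32[]) collective-permute-start(f32[8] %p)
  %done  = f32[8] collective-permute-done(%start)

With this change, the analysis defines fresh values for %start at the top-level index {} and at elements {1}, {2}, and {3}; no fresh value is defined at element {0}, which holds the aliased operand %p, and %done's value set is forwarded from %start's element {1} (the destination buffer) instead of being treated as a fresh definition.
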
From 7f3de617db0c4442ac0877cbdcf6261bbe734087 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Mon, 15 Jun 2020 21:18:56 +0000 Subject: [PATCH 0204/1390] segrating changes 2 --- tensorflow/core/framework/tensor_shape.cc | 2 +- tensorflow/core/kernels/batch_kernels.cc | 6 ++-- .../core/kernels/data/prefetch_autotuner.cc | 4 +-- tensorflow/core/kernels/quantization_utils.h | 2 +- tensorflow/core/lib/io/inputbuffer.cc | 6 ++-- tensorflow/core/lib/io/random_inputstream.cc | 2 +- .../core/lib/io/snappy/snappy_inputbuffer.cc | 2 +- .../core/lib/io/snappy/snappy_outputbuffer.cc | 6 ++-- tensorflow/core/lib/io/zlib_outputbuffer.cc | 6 ++-- tensorflow/core/platform/env.cc | 2 +- tensorflow/core/platform/file_system.cc | 2 +- .../core/platform/file_system_helper.cc | 2 +- tensorflow/core/platform/s3/s3_file_system.cc | 2 +- tensorflow/core/platform/status.cc | 4 +-- .../profiler/internal/parse_annotation.cc | 2 +- .../core/profiler/utils/derived_timeline.cc | 2 +- .../core/profiler/utils/derived_timeline.h | 2 +- .../core/profiler/utils/xplane_utils.cc | 2 +- tensorflow/core/util/bcast.h | 4 +-- .../convert_trivial_tile_to_concat.cc | 2 +- .../convert_trivial_transpose_to_reshape.cc | 2 +- .../toco/graph_transformations/dequantize.cc | 2 +- .../graph_transformations/drop_fake_quant.cc | 2 +- ...int8_weights_safe_for_fast_int8_kernels.cc | 2 +- .../fuse_broadcast_into_following_binary.cc | 2 +- .../group_bidirectional_sequence_ops.cc | 4 +-- .../graph_transformations/hardcode_min_max.cc | 2 +- .../identify_nearest_upsample.cc | 2 +- .../merge_reshape_into_preceding_transpose.cc | 4 +-- .../propagate_array_data_types.cc | 2 +- .../propagate_fake_quant_num_bits.cc | 2 +- .../propagate_fixed_sizes.cc | 28 +++++++++---------- .../remove_successive_transpose.cc | 10 +++---- .../remove_trivial_passthrough.cc | 2 +- .../reorder_elementwise_unary.cc | 4 +-- .../reorder_reshape_transpose.cc | 12 ++++---- .../resolve_batch_normalization.cc | 10 +++---- .../resolve_constant_concatenation.cc | 2 +- .../resolve_constant_pack.cc | 2 +- .../resolve_constant_slice.cc | 2 +- .../resolve_constant_transpose.cc | 2 +- .../resolve_constant_unary.cc | 4 +-- .../unpartition_embedding_lookup.cc | 4 +-- tensorflow/lite/toco/model_cmdline_flags.cc | 8 +++--- tensorflow/lite/toco/toco_cmdline_flags.cc | 2 +- 45 files changed, 89 insertions(+), 91 deletions(-) diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc index 79d0cc0822d..f4b440e9cd1 100644 --- a/tensorflow/core/framework/tensor_shape.cc +++ b/tensorflow/core/framework/tensor_shape.cc @@ -187,7 +187,7 @@ void TensorShapeBase::InitDims(gtl::ArraySlice dim_sizes) { "bad overflow check"); bool large_size = false; for (auto s : dim_sizes) { - if (static_cast(s) > static_cast(kMaxSmall)) { + if (s > kMaxSmall) { large_size = true; break; } diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index ee271f1a123..151f2367c95 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -486,18 +486,18 @@ class BatchResource : public ResourceBase { std::map> split_tensors; DCHECK_EQ(batch->task(0).context->num_outputs(), combined_outputs.size()); - if (static_cast(combined_outputs.size()) != batch->task(0).context->num_outputs()) { + if (combined_outputs.size() != batch->task(0).context->num_outputs()) { return errors::Internal("Wrong number of batched output tensors"); } // Generate 'split_tensors' and populate the context outputs. 
- for (size_t i = 0; i < combined_outputs.size(); ++i) { + for (int i = 0; i < combined_outputs.size(); ++i) { const Tensor& output_tensor = combined_outputs[i]; if (output_tensor.shape().dims() == 0) { return errors::FailedPrecondition( "Batched output tensor has 0 dimensions"); } - if (output_tensor.shape().dim_size(0) != static_cast(batch->size() + padding_size)) { + if (output_tensor.shape().dim_size(0) != batch->size() + padding_size) { return errors::FailedPrecondition( "Batched output tensor's 0th dimension does not equal the sum of " "the 0th dimension sizes of the input tensors"); diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc index a3fd9919d6b..a3bb1acc352 100644 --- a/tensorflow/core/kernels/data/prefetch_autotuner.cc +++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc @@ -40,13 +40,13 @@ void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) { case Mode::kDisabled: return; case Mode::kUpswing: - if (static_cast(current_buffer_size) == buffer_limit_) { + if (current_buffer_size == buffer_limit_) { mode_ = Mode::kDownswing; } return; case Mode::kDownswing: if (current_buffer_size == 0) { - if (buffer_limit_ >= static_cast(kBufferLimitThreshold)) { + if (buffer_limit_ >= kBufferLimitThreshold) { buffer_limit_ += kBufferLimitThreshold; } else { buffer_limit_ *= 2; diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h index 06c901967b0..315616f3fb3 100644 --- a/tensorflow/core/kernels/quantization_utils.h +++ b/tensorflow/core/kernels/quantization_utils.h @@ -268,7 +268,7 @@ inline void RequantizeManyInNewRangeReference(const qint32* input, int64 count, // that could be easily adapted for a SIMD implementation. It should also be // possible to perform all the calculations in 32-bit rather than 64, but // that's not been implemented yet. - for (size_t index = 0; static_cast(index) < count; ++index) { + for (size_t index = 0; index < count; ++index) { const int64 input_value = static_cast(input[index]); const int64 fp_value = ((input_value * range_scale_fp) >> 32) + input_offset_fp; diff --git a/tensorflow/core/lib/io/inputbuffer.cc b/tensorflow/core/lib/io/inputbuffer.cc index d005ee11d78..2b138b825e4 100644 --- a/tensorflow/core/lib/io/inputbuffer.cc +++ b/tensorflow/core/lib/io/inputbuffer.cc @@ -85,7 +85,7 @@ Status InputBuffer::ReadNBytes(int64 bytes_to_read, string* result) { result->resize(bytes_to_read); size_t bytes_read = 0; Status status = ReadNBytes(bytes_to_read, &(*result)[0], &bytes_read); - if (static_cast(bytes_read) < bytes_to_read) result->resize(bytes_read); + if (bytes_read < bytes_to_read) result->resize(bytes_read); return status; } @@ -204,7 +204,7 @@ Status InputBuffer::Hint(int64 bytes_to_read) { } // The internal buffer is too small. Do nothing. 
- if (bytes_to_read > static_cast(size_)) { + if (bytes_to_read > size_) { return Status::OK(); } @@ -230,7 +230,7 @@ Status InputBuffer::Hint(int64 bytes_to_read) { limit_ += data.size(); file_pos_ += data.size(); - if (errors::IsOutOfRange(s) && data.size() == static_cast(bytes_to_read)) { + if (errors::IsOutOfRange(s) && data.size() == bytes_to_read) { return Status::OK(); } else { return s; diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc index bd0054ce753..10f734a5bae 100644 --- a/tensorflow/core/lib/io/random_inputstream.cc +++ b/tensorflow/core/lib/io/random_inputstream.cc @@ -92,7 +92,7 @@ Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) { } else { return s; } - if (data.size() < static_cast(bytes_to_read)) { + if (data.size() < bytes_to_read) { return errors::OutOfRange("reached end of file"); } bytes_to_skip -= bytes_to_read; diff --git a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc index 53939f2d8a3..a331d4173cf 100644 --- a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc @@ -134,7 +134,7 @@ Status SnappyInputBuffer::ReadCompressedBlockLength(uint32* length) { } size_t readable = std::min(bytes_to_read, avail_in_); - for (size_t i = 0; i < readable; i++) { + for (int i = 0; i < readable; i++) { // The "unsigned char" type cast is intentional to avoid implicit type // casting of the signed char to unsigned int during bitwise OR which // causes weird overflow errors. diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc index fe3a53c6c25..563503a1319 100644 --- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc @@ -76,7 +76,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { // If there is sufficient free space in input_buffer_ to fit data we // add it there and return. - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -87,7 +87,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered()); // input_buffer_ should be empty at this point. 
- if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -144,7 +144,7 @@ void SnappyOutputBuffer::AddToInputBuffer(StringPiece data) { const int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (static_cast(bytes_to_write) > free_tail_bytes) { + if (bytes_to_write > free_tail_bytes) { memmove(input_buffer_.get(), next_in_, avail_in_); next_in_ = input_buffer_.get(); } diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc index d475d0eaa5c..5840ca60242 100644 --- a/tensorflow/core/lib/io/zlib_outputbuffer.cc +++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc @@ -98,7 +98,7 @@ void ZlibOutputBuffer::AddToInputBuffer(StringPiece data) { int32 unread_bytes = z_stream_->avail_in; int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (static_cast(bytes_to_write) > free_tail_bytes) { + if (bytes_to_write > free_tail_bytes) { memmove(z_stream_input_.get(), z_stream_->next_in, z_stream_->avail_in); z_stream_->next_in = z_stream_input_.get(); } @@ -154,7 +154,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { size_t bytes_to_write = data.size(); - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -162,7 +162,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered(zlib_options_.flush_mode)); // At this point input stream should be empty. - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 05d95ba0425..b29cad05459 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -214,7 +214,7 @@ bool Env::FilesExist(const std::vector& files, } if (fs_status) { result &= fs_result; - for (size_t i = 0; i < itr.second.size(); ++i) { + for (int i = 0; i < itr.second.size(); ++i) { per_file_status[itr.second[i]] = fs_status->at(i); } } else if (!fs_result) { diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index c9657e2339f..9e96ceedbdc 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -308,7 +308,7 @@ StringPiece FileSystem::Basename(StringPiece path) const { StringPiece FileSystem::Extension(StringPiece path) const { StringPiece basename = this->Basename(path); - size_t pos = basename.rfind('.'); + int pos = basename.rfind('.'); if (pos == StringPiece::npos) { return StringPiece(path.data() + path.size(), 0); } else { diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc index 909752389e1..64b175c4d17 100644 --- a/tensorflow/core/platform/file_system_helper.cc +++ b/tensorflow/core/platform/file_system_helper.cc @@ -103,7 +103,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern, children_dir_status[i] = fs->IsDirectory(child_path); } }); - for (size_t i = 0; i < children.size(); ++i) { + for (int i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. 
if (children_dir_status[i].code() == tensorflow::error::CANCELLED) { diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 45d648abcc0..1726c9fbc6c 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -906,7 +906,7 @@ Status S3FileSystem::MultiPartCopy(const Aws::String& source, // wait on the mutex until notify is called // then check the finished parts as there could be false notifications multi_part_copy_cv.wait(lock, [&finishedPartStates, num_parts] { - return static_cast(finishedPartStates.size()) == num_parts; + return finishedPartStates.size() == num_parts; }); } // check if there was any error for any part diff --git a/tensorflow/core/platform/status.cc b/tensorflow/core/platform/status.cc index e303c18091c..756b8314148 100644 --- a/tensorflow/core/platform/status.cc +++ b/tensorflow/core/platform/status.cc @@ -74,9 +74,7 @@ class StatusLogSink : public TFLogSink { mutex_lock lock(mu_); messages_.emplace_back(entry.ToString()); - if (messages_.size() > static_cast(num_messages_)){ - messages_.pop_front(); - } + if (messages_.size() > num_messages_) messages_.pop_front(); } private: diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index a4cdc09739d..32c26befa3d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -50,7 +50,7 @@ std::vector SplitNameAndMetadata( std::vector SplitPairs(absl::string_view metadata) { std::vector key_value_pairs; std::stack quotes; - size_t start = 0, end = 0; + int start = 0, end = 0; for (; end < metadata.size(); ++end) { char ch = metadata[end]; switch (ch) { diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index 3d03fc22c16..112c0977763 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -130,7 +130,7 @@ void DerivedXLineBuilder::ExpandOrAddLevelEvent(const XEvent& event, } void DerivedXLineBuilder::ResetLastEvents(int level) { - for (int i = level; i < static_cast(last_event_by_level_.size()); ++i) { + for (int i = level; i < last_event_by_level_.size(); ++i) { last_event_by_level_[i] = absl::nullopt; } if (level == 0) ResetDependentLines(); diff --git a/tensorflow/core/profiler/utils/derived_timeline.h b/tensorflow/core/profiler/utils/derived_timeline.h index 92489399b8f..cd4da7996c5 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.h +++ b/tensorflow/core/profiler/utils/derived_timeline.h @@ -37,7 +37,7 @@ class DerivedXLineBuilder { std::vector dependent_lines); void ExpandOrAddEvents(const std::vector& event_per_level) { - for (size_t level = 0; level < event_per_level.size(); ++level) { + for (int level = 0; level < event_per_level.size(); ++level) { ExpandOrAddLevelEvent(event_per_level[level], level); } } diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 1fe476ce79c..7f5221c5391 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -266,7 +266,7 @@ void SortXSpace(XSpace* space) { // smaller than these value. 
void NormalizeTimestamps(XPlane* plane, uint64 start_time_ns) { for (XLine& line : *plane->mutable_lines()) { - if (line.timestamp_ns() >= static_cast(start_time_ns)) { + if (line.timestamp_ns() >= start_time_ns) { line.set_timestamp_ns(line.timestamp_ns() - start_time_ns); } } diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h index 075de84964e..7bb8ea18ad3 100644 --- a/tensorflow/core/util/bcast.h +++ b/tensorflow/core/util/bcast.h @@ -139,7 +139,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], if (x[i] != x[0]) { all_equal = false; } - if (static_cast(x[i].size()) > largest_rank) { + if (x[i].size() > largest_rank) { largest_rank = x[i].size(); } } @@ -176,7 +176,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], // 1-extend and align all vectors. for (int i = 0; i < N; ++i) { - if (static_cast(copy[i].size()) < largest_rank) { + if (copy[i].size() < largest_rank) { copy[i].resize(largest_rank, 1); } } diff --git a/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc index c19ccf676c9..46288d2a1ed 100644 --- a/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc +++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc @@ -52,7 +52,7 @@ namespace toco { // It then just becomes a concat along that dimension. int non_one_dims = 0; int concat_axis = 0; - for (size_t i = 0; i < multiples.size(); ++i) { + for (int i = 0; i < multiples.size(); ++i) { if (multiples[i] != 1) { ++non_one_dims; concat_axis = i; diff --git a/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc index fa8a69a1e7a..2b5aaea2b23 100644 --- a/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc +++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc @@ -31,7 +31,7 @@ bool TransposeAffectsMemoryOrder(std::vector perm, // just the shape) then the flat buffer representation shouldn't change. 
std::vector old_major_index_ordering; std::vector new_major_index_ordering; - for (int i = 0; static_cast(i) < in_shape.size(); i++) { + for (int i = 0; i < in_shape.size(); i++) { if (in_shape[i] != 1) { old_major_index_ordering.push_back(i); } diff --git a/tensorflow/lite/toco/graph_transformations/dequantize.cc b/tensorflow/lite/toco/graph_transformations/dequantize.cc index c87c305a70d..cc5dddbb40e 100644 --- a/tensorflow/lite/toco/graph_transformations/dequantize.cc +++ b/tensorflow/lite/toco/graph_transformations/dequantize.cc @@ -35,7 +35,7 @@ void DequantizeBuffer(Array* array) { auto& new_data = array->GetMutableBuffer().data; new_data.resize(old_data.size()); const auto& qparams = array->GetQuantizationParams(); - for (size_t i = 0; i < old_data.size(); i++) { + for (int i = 0; i < old_data.size(); i++) { new_data[i] = qparams.scale * (old_data[i] - qparams.zero_point); } } diff --git a/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc index 3a0b4d0103f..bb8679bced8 100644 --- a/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc +++ b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc @@ -45,7 +45,7 @@ namespace toco { } // Drop min/max inputs - for (size_t i = 1; i < fakequant_op->inputs.size(); i++) { + for (int i = 1; i < fakequant_op->inputs.size(); i++) { if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) { model->EraseArray(fakequant_op->inputs[i]); } diff --git a/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc index ce4574cdfbf..918bb489995 100644 --- a/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc +++ b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc @@ -166,7 +166,7 @@ namespace toco { int index_of_previous_bad_value = 0; bool changed = false; - for (size_t i = 0; i < buffer_data.size(); i++) { + for (int i = 0; i < buffer_data.size(); i++) { if (buffer_data[i] == 0) { count_bad++; if (count_bad > 1) { diff --git a/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc index 2c5c2cbb5f1..ba3e277f676 100644 --- a/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc +++ b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc @@ -34,7 +34,7 @@ bool IsBroadcastingOp(const Model& model, Operator* op) { // Concatenation of identical inputs is usually a broadcast. if (op->type == OperatorType::kConcatenation) { // Verify that all inputs are the same. - for (size_t i = 1; i < op->inputs.size(); ++i) { + for (int i = 1; i < op->inputs.size(); ++i) { if (op->inputs[i] != op->inputs[0]) { return false; } diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc index a6d95ec43b1..fa252b1a61b 100644 --- a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc +++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc @@ -125,7 +125,7 @@ bool CheckTwoUnidirectionalSequenceOpsAreValid( return false; // Make sure the inputs datatype matches. 
- for (size_t i = 0; i < fw_sequence_op->inputs.size(); ++i) { + for (int i = 0; i < fw_sequence_op->inputs.size(); ++i) { const auto& fw_input_array_name = fw_sequence_op->inputs[i]; const auto& bw_input_array_name = bw_sequence_op->inputs[i]; if (model.HasArray(fw_input_array_name) && @@ -137,7 +137,7 @@ bool CheckTwoUnidirectionalSequenceOpsAreValid( } // Make sure the outputs datatype matches. - for (size_t i = 0; i < fw_sequence_op->outputs.size(); ++i) { + for (int i = 0; i < fw_sequence_op->outputs.size(); ++i) { const auto& fw_output_array_name = fw_sequence_op->outputs[i]; const auto& bw_output_array_name = bw_sequence_op->outputs[i]; if (model.HasArray(fw_output_array_name) && diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc index 4250668bcf5..171d522daa7 100644 --- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc +++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc @@ -405,7 +405,7 @@ bool HardcodeMinMaxForPack(Model* model, Operator* op) { } const auto& first_input_minmax = first_input_array.GetMinMax(); - for (size_t i = 1; i < op->inputs.size(); i++) { + for (int i = 1; i < op->inputs.size(); i++) { const auto& input_array = model->GetArray(op->inputs[i]); if (!input_array.minmax) { return false; diff --git a/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc b/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc index 08894c93a5b..2ab6692a3a8 100644 --- a/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc +++ b/tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc @@ -199,7 +199,7 @@ std::vector>::iterator FindOperator( shape_array.data_type = ArrayDataType::kInt32; auto& shape_buffer = shape_array.GetMutableBuffer(); // This is what imagined as the original shape. - for (size_t i = 0; i < imagined_original_shape.size(); ++i) { + for (int i = 0; i < imagined_original_shape.size(); ++i) { shape_buffer.data.push_back(imagined_original_shape.at(i)); } diff --git a/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc index a76ae1a0635..80170fe8bcb 100644 --- a/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc @@ -70,7 +70,7 @@ std::vector ReshapeToTranspose(const Model& model, std::vector not_one_indices; // Separate into one indices and not one indices. - for (size_t i = 0; i < in_shape.size(); i++) { + for (int i = 0; i < in_shape.size(); i++) { if (in_shape[i] == 1) { one_indices.push_back(i); } else { @@ -167,7 +167,7 @@ std::vector ReshapeToTranspose(const Model& model, // Combine the permutations. 
const auto& transpose_perm = transpose_op->perm; - for (size_t i = 0; i < merged_perm.size(); i++) { + for (int i = 0; i < merged_perm.size(); i++) { merged_perm[i] = transpose_perm[merged_perm[i]]; } diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc index 2f316934311..49d59de860b 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc @@ -170,7 +170,7 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op, if (unsupported_op->output_data_types.size() < op->outputs.size()) { return ::tensorflow::Status::OK(); } - for (size_t i = 0; i < op->outputs.size(); ++i) { + for (int i = 0; i < op->outputs.size(); ++i) { const string& output = op->outputs[i]; const ArrayDataType data_type = unsupported_op->output_data_types[i]; model->GetArray(output).data_type = data_type; diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc index 94779f54af2..1ed618879c1 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc @@ -149,7 +149,7 @@ bool RecursivelyBackwardPropagateDataType(GraphTransformation* transformation, ArrayDataType new_data_type, const MinMax& new_minmax) { bool did_change = false; - for (size_t input_index = 0; input_index < op->inputs.size(); ++input_index) { + for (int input_index = 0; input_index < op->inputs.size(); ++input_index) { const auto& input = op->inputs[input_index]; auto& input_array = model->GetArray(input); diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 520cd8b495a..006e624eb7a 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -431,7 +431,7 @@ void ProcessTensorFlowReshapeOperator(Model* model, bool has_wildcard = false; int wildcard_index = 0; int product_non_wildcard_dims = 1; - for (size_t i = 0; i < shape_data.size(); i++) { + for (int i = 0; i < shape_data.size(); i++) { if (shape_data[i] == -1) { CHECK(!has_wildcard); has_wildcard = true; @@ -574,7 +574,7 @@ void ProcessTensorFlowReductionOperator(Model* model, Operator* op) { std::set true_indices; const auto& reduction_indices = reduction_indices_array.GetBuffer().data; - for (size_t i = 0; i < reduction_indices.size(); ++i) { + for (int i = 0; i < reduction_indices.size(); ++i) { const int32 reduction_index = reduction_indices[i]; if (reduction_index < -input_rank || reduction_index >= input_rank) { CHECK(false) << "Invalid reduction dimension " << reduction_index @@ -627,7 +627,7 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) { CHECK_EQ(op->begin.size(), op->size.size()); std::vector output_dims; - for (size_t i = 0; i < op->begin.size(); ++i) { + for (int i = 0; i < op->begin.size(); ++i) { int size = op->size[i]; if (size == -1) { size = input_array.shape().dims(i) - op->begin[i]; @@ -883,7 +883,7 @@ void ProcessTensorFlowSplitVOperator(Model* model, CHECK_EQ(op->outputs.size(), op->num_split); - for (size_t i = 0; i < op->outputs.size(); ++i) { + for (int i = 0; i < op->outputs.size(); ++i) { const auto& output = op->outputs[i]; Shape 
output_shape = input_shape; (*output_shape.mutable_dims())[axis] = size_splits_vector.at(i); @@ -1514,7 +1514,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) { std::vector& dims = *output_shape.mutable_dims(); CHECK_EQ(op->left_padding.size(), dims.size()); - for (size_t i = 0; i < op->left_padding.size(); ++i) { + for (int i = 0; i < op->left_padding.size(); ++i) { dims[i] += op->left_padding[i] + op->right_padding[i]; } @@ -1540,7 +1540,7 @@ void ProcessPadV2Operator(Model* model, PadV2Operator* op) { std::vector& dims = *output_shape.mutable_dims(); CHECK_EQ(op->left_padding.size(), dims.size()); - for (size_t i = 0; i < op->left_padding.size(); ++i) { + for (int i = 0; i < op->left_padding.size(); ++i) { dims[i] += op->left_padding[i] + op->right_padding[i]; } @@ -1683,7 +1683,7 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { CHECK_LE(op->strides.size(), num_input_axes) << "StridedSlice op with output \"" << op->outputs[0] << "\", requires no more than " << num_input_axes << " strides"; - for (size_t i = 0; i < op->strides.size(); i++) { + for (int i = 0; i < op->strides.size(); i++) { CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. Axis " << i << " has stride=" << op->strides[i] << "."; } @@ -1814,7 +1814,7 @@ void ProcessTransposeOperator(Model* model, TransposeOperator* op) { << "Transpose permutation input " << op->inputs[1] << " must be same length as input dimensions"; std::vector* output_dims = output_array.mutable_shape()->mutable_dims(); - for (size_t i = 0; i < perm.size(); i++) { + for (int i = 0; i < perm.size(); i++) { int axis = perm[i]; CHECK_GE(axis, 0); CHECK_LT(axis, input_shape.dimensions_count()); @@ -1856,8 +1856,8 @@ void ProcessArgMinMaxOperator(Model* model, Op* op) { std::vector output_dims; output_dims.reserve(input_dims.size() - 1); - for (size_t i = 0; i < input_dims.size(); ++i) { - if ( static_cast(i) != axis) { + for (int i = 0; i < input_dims.size(); ++i) { + if (i != axis) { output_dims.push_back(input_dims[i]); } } @@ -1938,7 +1938,7 @@ void ProcessTileOperator(Model* model, TensorFlowTileOperator* op) { auto* mutable_dims = output_array.mutable_shape()->mutable_dims(); mutable_dims->resize(multiples.size()); - for (size_t i = 0; i < mutable_dims->size(); ++i) { + for (int i = 0; i < mutable_dims->size(); ++i) { (*mutable_dims)[i] = input_shape.dims(i) * multiples[i]; } } @@ -2010,8 +2010,8 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) { std::vector output_dims; output_dims.reserve(input_dims.size() - 1); - for (size_t i = 0; i < input_dims.size(); ++i) { - if ( static_cast(i) != op->axis) { + for (int i = 0; i < input_dims.size(); ++i) { + if (i != op->axis) { output_dims.push_back(input_dims[i]); } } @@ -2399,7 +2399,7 @@ void ProcessScatterNdOperator(Model* model, ScatterNdOperator* op) { if (unsupported_op->output_shapes.size() < op->outputs.size()) { return ::tensorflow::Status::OK(); } - for (size_t i = 0; i < op->outputs.size(); ++i) { + for (int i = 0; i < op->outputs.size(); ++i) { const string& output = op->outputs[i]; model->GetArray(output).copy_shape(unsupported_op->output_shapes.at(i)); } diff --git a/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc b/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc index 1cb3a300127..6eccda04c18 100644 --- a/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc @@ -31,12 +31,12 @@ bool 
TransformsToIdentity(std::vector const& perm1, // perm1 is the order of the indices after first transpose. When perm1 is // reordered according to perm2, if the result is simple increasing sequence // i.e., range(0, perm1.size()), then the two transposes cancel each other. - for (size_t i = 0; i < perm1.size(); ++i) { - if (perm1[i] < 0 || perm1[i] >= static_cast(perm1.size()) || perm2[i] < 0 || - perm2[i] >= static_cast(perm1.size())) { + for (int i = 0; i < perm1.size(); ++i) { + if (perm1[i] < 0 || perm1[i] >= perm1.size() || perm2[i] < 0 || + perm2[i] >= perm1.size()) { return false; } - if (perm1[perm2[i]] != static_cast(i)) { + if (perm1[perm2[i]] != i) { return false; } } @@ -46,7 +46,7 @@ bool TransformsToIdentity(std::vector const& perm1, void ReplaceOpInputsWith(Model* model, const string& lookfor, const string& replacewith) { for (const auto& op : model->operators) { - for (size_t i = 0; i < op->inputs.size(); ++i) { + for (int i = 0; i < op->inputs.size(); ++i) { if (op->inputs[i] == lookfor) { op->inputs[i] = replacewith; } diff --git a/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc index eeb8751bf86..bd529bd9ecd 100644 --- a/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc +++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc @@ -82,7 +82,7 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation, // We call 'main input' the unique nonconstant input array if there is one, // or else the 0-th input. int count_nonconstant_input_arrays = 0; - for (size_t i = 0; i < passthru_op->inputs.size(); i++) { + for (int i = 0; i < passthru_op->inputs.size(); i++) { if (!model->GetArray(passthru_op->inputs[i]).buffer) { count_nonconstant_input_arrays++; if (count_nonconstant_input_arrays == 1) { diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc index 38edff76d55..17a5e9a1d6a 100644 --- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc +++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc @@ -127,9 +127,9 @@ bool IsMoveOperator(OperatorType optype) { move_op->outputs[0] = output_name; } else { // The intermediate array is now the output array. - for (size_t i = 0; i < model->operators.size(); i++) { + for (int i = 0; i < model->operators.size(); i++) { Operator* consumer = model->operators[i].get(); - for (size_t j = 0; j < consumer->inputs.size(); j++) { + for (int j = 0; j < consumer->inputs.size(); j++) { if (consumer->inputs[j] == output_name) { consumer->inputs[j] = intermediate_name; } diff --git a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc index b2d184cdc31..0fbcf9f73b1 100644 --- a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc @@ -60,7 +60,7 @@ std::vector ComputeNewPerm(std::vector input_dims, std::vector perm) { // These are the major axis of the input. 
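// [Editor's note] Illustration only, not part of any patch above. The
// TransformsToIdentity comment earlier in this hunk
// (remove_successive_transpose.cc) says two transposes cancel when composing
// the permutations yields the identity. A minimal self-contained sketch of
// that check, with hypothetical permutation values; it assumes perm1 and
// perm2 have the same length and hold values in [0, size):

#include <vector>

bool ComposesToIdentity(const std::vector<int>& perm1,
                        const std::vector<int>& perm2) {
  for (int i = 0; i < static_cast<int>(perm1.size()); ++i) {
    if (perm1[perm2[i]] != i) return false;  // composition is not identity
  }
  return true;
}

// ComposesToIdentity({2, 0, 1}, {1, 2, 0}) returns true: transposing by
// {2, 0, 1} and then by {1, 2, 0} restores the original axis order, so both
// transpose ops can be removed.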
std::vector input_indices; - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { if (input_dims[i] != 1) { input_indices.push_back(i); } @@ -69,7 +69,7 @@ std::vector ComputeNewPerm(std::vector input_dims, // This maps which indices of the input produced the intermediate indices for // non-unary dimensions. std::unordered_map intermediate_to_input_indices_map; - for (size_t i = 0; i < intermediate_dims.size(); i++) { + for (int i = 0; i < intermediate_dims.size(); i++) { if (intermediate_dims[i] != 1) { intermediate_to_input_indices_map[i] = input_indices[intermediate_to_input_indices_map.size()]; @@ -80,14 +80,14 @@ std::vector ComputeNewPerm(std::vector input_dims, // major indices. std::vector new_perm; new_perm.reserve(input_dims.size()); - for (size_t i = 0; i < perm.size(); i++) { + for (int i = 0; i < perm.size(); i++) { if (intermediate_dims[perm[i]] == 1) continue; new_perm.push_back(intermediate_to_input_indices_map[perm[i]]); } // Fill the rest of the transpose in with the ones. - for (size_t index = 0; index < input_dims.size(); index++) { + for (int index = 0; index < input_dims.size(); index++) { if (input_dims[index] == 1) { new_perm.push_back(index); } @@ -193,9 +193,9 @@ std::vector ComputeNewPerm(std::vector input_dims, DeleteArrayIfUnused(intermediate_name, model); } else { // The intermediate array is now the output array. - for (size_t i = 0; i < model->operators.size(); i++) { + for (int i = 0; i < model->operators.size(); i++) { Operator* consumer = model->operators[i].get(); - for (size_t j = 0; j < consumer->inputs.size(); j++) { + for (int j = 0; j < consumer->inputs.size(); j++) { if (consumer->inputs[j] == output_name) { consumer->inputs[j] = intermediate_name; } diff --git a/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc index 545c53fb31a..6e5815ee94d 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc @@ -124,11 +124,11 @@ namespace toco { const auto& offset_float_data = offset_array.GetBuffer().data; - CHECK(static_cast(mul_float_data.size()) == buffer_size); - CHECK(static_cast(add_float_data.size()) == buffer_size); - CHECK(static_cast(mean_float_data.size()) == buffer_size); - CHECK(static_cast(multiplier_float_data.size()) == buffer_size); - CHECK(static_cast(offset_float_data.size()) == buffer_size); + CHECK(mul_float_data.size() == buffer_size); + CHECK(add_float_data.size() == buffer_size); + CHECK(mean_float_data.size() == buffer_size); + CHECK(multiplier_float_data.size() == buffer_size); + CHECK(offset_float_data.size() == buffer_size); for (int i = 0; i < buffer_size; i++) { mul_float_data[i] = multiplier_float_data[i]; diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc index 20e805a29e0..7c9aa025f64 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc @@ -64,7 +64,7 @@ void CopyTensorSegments(const std::vector& input_arrays, // Copy the data from input_arrays to concatenated_array_buffer. 
T* dest_ptr = concatenated_array_buffer.data(); for (int s = 0; s < total_copy_steps; s++) { - for (size_t i = 0; i < input_arrays.size(); i++) { + for (int i = 0; i < input_arrays.size(); i++) { std::copy(src_ptr[i], src_ptr[i] + array_copy_size[i], dest_ptr); src_ptr[i] += array_copy_size[i]; dest_ptr += array_copy_size[i]; diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc index c6dc093ba00..0df35509d3d 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc @@ -36,7 +36,7 @@ void Pack(Model* model, PackOperator const& op) { // Pack inputs into buffer CHECK_EQ(op.axis, 0) << "Packing only supported along first axis"; int dst_offset = 0; - for (size_t i = 0; i < op.inputs.size(); i++) { + for (int i = 0; i < op.inputs.size(); i++) { // Append array data to output for each input array const auto& input_array = model->GetArray(op.inputs[i]); int input_size = RequiredBufferSizeForShape(input_array.shape()); diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc index 34a1a1ce899..fd71fb1873a 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc @@ -50,7 +50,7 @@ bool Slice(SliceOperator const& op, Array const& input_array, CHECK_LE(size.size(), 4); std::vector begin = op.begin; std::vector end; - for (size_t i = 0; i < begin.size(); ++i) { + for (int i = 0; i < begin.size(); ++i) { int dim_size = size[i]; if (dim_size == -1) { // -1 means the rest of the dimension. diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc index a822f7b79e3..7ceffe6307e 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc @@ -40,7 +40,7 @@ void Transpose(Model* model, const Array& input_array, CHECK(input_shape.dimensions_count() == output_shape.dimensions_count()); const int dim = input_shape.dimensions_count(); CHECK_LE(dim, 4); - CHECK(static_cast(perm.size()) >= dim); + CHECK(perm.size() >= dim); for (int i = 0; i < dim; i++) { CHECK(perm[i] >= 0 && perm[i] < dim); CHECK(input_shape.dims(perm[i]) == output_shape.dims(i)); diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc index 4d6cd188729..197e17eee16 100644 --- a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc +++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc @@ -62,7 +62,7 @@ void ReduceGeneric(bool keep_dims, const std::vector& axes, } std::vector output_indices(input_shape.dimensions_count()); - for (size_t input_offset = 0; input_offset < input.size(); ++input_offset) { + for (int input_offset = 0; input_offset < input.size(); ++input_offset) { std::vector input_indices = ReverseOffset(input_shape, input_offset); // Calculate the output location by squashing input indices to 0 // in reduced axes. 
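// [Editor's note] Illustration only, not part of any patch above. Most hunks
// in this patch flip loop counters between int and size_t. The practical
// concern is that comparing a signed index against an unsigned .size() is
// flagged by -Wsign-compare, and a negative signed value converts to a huge
// unsigned one before the comparison. A minimal sketch of two warning-free
// spellings (function and variable names here are hypothetical):

#include <cstddef>
#include <vector>

int CountNonUnitDims(const std::vector<int>& dims) {
  int count = 0;
  // Option 1: use an unsigned index whose type matches dims.size().
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] != 1) ++count;
  }
  return count;
}

bool IndexInRange(int index, const std::vector<int>& dims) {
  // Option 2: keep the signed index but cast one side explicitly.
  return index >= 0 && index < static_cast<int>(dims.size());
}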
@@ -319,7 +319,7 @@ bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) { } else if (unary_op->type == OperatorType::kRelu6 || unary_op->type == OperatorType::kRelu1 || unary_op->type == OperatorType::kRelu) { - for (int i = 0; i < output_buffer_size; ++i) { + for (size_t i = 0; i < output_buffer_size; ++i) { const float value = (*input_float_data)[i]; float new_value = 0.0f; switch (unary_op->type) { diff --git a/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc index 84d5922aae8..1f7035c21e2 100644 --- a/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc +++ b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc @@ -57,10 +57,10 @@ namespace toco { // Split up the DynamicStitch inputs into the indices and data. std::vector stitch_indices_inputs; std::vector stitch_data_inputs; - for (int i = 0; i < stitch_op->num_partitions; ++i) { + for (size_t i = 0; i < stitch_op->num_partitions; ++i) { stitch_indices_inputs.push_back(stitch_op->inputs[i]); } - for (int i = stitch_op->num_partitions; i < stitch_op->num_partitions * 2; + for (size_t i = stitch_op->num_partitions; i < stitch_op->num_partitions * 2; ++i) { stitch_data_inputs.push_back(stitch_op->inputs[i]); } diff --git a/tensorflow/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc index 351884fbf1e..2434481272f 100644 --- a/tensorflow/lite/toco/model_cmdline_flags.cc +++ b/tensorflow/lite/toco/model_cmdline_flags.cc @@ -263,7 +263,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector mean_values = absl::StrSplit(parsed_model_flags.mean_values.value(), ','); - QCHECK(static_cast(mean_values.size()) == model_flags->input_arrays_size()); + QCHECK(mean_values.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < mean_values.size(); ++i) { char* last = nullptr; model_flags->mutable_input_arrays(i)->set_mean_value( @@ -280,7 +280,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector std_values = absl::StrSplit(parsed_model_flags.std_values.value(), ','); - QCHECK( static_cast(std_values.size()) == model_flags->input_arrays_size()); + QCHECK(std_values.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < std_values.size(); ++i) { char* last = nullptr; model_flags->mutable_input_arrays(i)->set_std_value( @@ -298,7 +298,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector input_data_types = absl::StrSplit(parsed_model_flags.input_data_types.value(), ','); - QCHECK(static_cast(input_data_types.size()) == model_flags->input_arrays_size()); + QCHECK(input_data_types.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < input_data_types.size(); ++i) { IODataType type; QCHECK(IODataType_Parse(input_data_types[i], &type)); @@ -321,7 +321,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); std::vector input_shapes = absl::StrSplit(parsed_model_flags.input_shapes.value(), ':'); - QCHECK(static_cast(input_shapes.size()) == model_flags->input_arrays_size()); + QCHECK(input_shapes.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < input_shapes.size(); ++i) { auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape(); shape->clear_dims(); diff --git a/tensorflow/lite/toco/toco_cmdline_flags.cc b/tensorflow/lite/toco/toco_cmdline_flags.cc index 9697a1ecbbd..c133db8f2a4 
100644 --- a/tensorflow/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/lite/toco/toco_cmdline_flags.cc @@ -320,7 +320,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, std::vector input_types = absl::StrSplit(parsed_toco_flags.input_types.value(), ','); QCHECK(!input_types.empty()); - for (size_t i = 1; i < input_types.size(); i++) { + for (int i = 1; i < input_types.size(); i++) { QCHECK_EQ(input_types[i], input_types[0]); } toco::IODataType input_type; From 0e634188b335bf90c154dc1488d5b292ddddb0d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 14:16:30 -0700 Subject: [PATCH 0205/1390] fix kokoro build :deprecated message. PiperOrigin-RevId: 316542716 Change-Id: I62ad7d88c36cd5f8551dea8efa8193c6ac8691ec --- tensorflow/python/training/tracking/data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/training/tracking/data_structures.py b/tensorflow/python/training/tracking/data_structures.py index 8dc252efbf9..a7d9b70a560 100644 --- a/tensorflow/python/training/tracking/data_structures.py +++ b/tensorflow/python/training/tracking/data_structures.py @@ -736,7 +736,7 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): if wrapped_dict is None: # Allow zero-argument construction, e.g. from session.run's re-wrapping. wrapped_dict = {} - if not isinstance(wrapped_dict, collections.Mapping): + if not isinstance(wrapped_dict, collections_abc.Mapping): # Allow construction from a sequence, e.g. from nest.pack_sequence_as. wrapped_dict = dict(wrapped_dict) wrapt.ObjectProxy.__init__(self, wrapped_dict) From a2442ea4077e61a564ab598ac983f4160d9546be Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Mon, 15 Jun 2020 21:24:02 +0000 Subject: [PATCH 0206/1390] segragrating changes --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 3724f06ba4b..9db20363349 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 409 // Updated: 2020/5/22 +#define TF_GRAPH_DEF_VERSION 408 // Updated: 2020/5/21 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From fc531df8ce3df4fd434230dd3bd0e7fdfe5a6cf1 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 15 Jun 2020 14:20:38 -0700 Subject: [PATCH 0207/1390] Stop skipping KerasTensors for Keras tests that now work with KerasTensors. PiperOrigin-RevId: 316543523 Change-Id: Iea54fa7ed735e239cda293304c6a17207b136ab7 --- .../python/keras/engine/training_test.py | 5 +--- .../preprocessing/category_encoding_test.py | 26 ++++++++----------- .../python/keras/premade/linear_test.py | 3 +-- .../python/keras/premade/wide_deep_test.py | 3 +-- tensorflow/python/keras/regularizers_test.py | 8 +++--- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index bb6bfc32921..aa01463582c 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -1563,10 +1563,7 @@ class TrainingTest(keras_parameterized.TestCase): self.assertEqual(self.evaluate(layer.v), 1.) 
@keras_parameterized.run_all_keras_modes( - always_skip_v1=True, - # TODO(kaftan): this is failing with KerasTensors - # in a way that seems orthogonal to what the code is testing - skip_keras_tensors=True) + always_skip_v1=True) @parameterized.named_parameters( ('numpy_array', 'numpy_array'), ('dataset_array', 'dataset_array'), diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py index ff1a06a3ae7..7e7f7f32be0 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py @@ -31,7 +31,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras import testing_utils from tensorflow.python.keras.layers import core from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 @@ -252,24 +251,21 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0), output_dataset) - # TODO(b/158570051): Support KerasTensor - # Keras functional model doesn't support dense layer stacked with sparse out. def test_sparse_output_and_dense_layer(self): - with testing_utils.use_keras_tensors_scope(False): - input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) + input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) - max_tokens = 4 + max_tokens = 4 - input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - encoding_layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.COUNT, - sparse=True) - int_data = encoding_layer(input_data) - dense_layer = keras.layers.Dense(units=1) - output_data = dense_layer(int_data) + input_data = keras.Input(shape=(None,), dtype=dtypes.int32) + encoding_layer = get_layer_class()( + max_tokens=max_tokens, output_mode=category_encoding.COUNT, + sparse=True) + int_data = encoding_layer(input_data) + dense_layer = keras.layers.Dense(units=1) + output_data = dense_layer(int_data) - model = keras.Model(inputs=input_data, outputs=output_data) - _ = model.predict(input_array, steps=1) + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array, steps=1) @keras_parameterized.run_all_keras_modes diff --git a/tensorflow/python/keras/premade/linear_test.py b/tensorflow/python/keras/premade/linear_test.py index 6fa1767a60a..676f29bb840 100644 --- a/tensorflow/python/keras/premade/linear_test.py +++ b/tensorflow/python/keras/premade/linear_test.py @@ -40,8 +40,7 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import test -@keras_parameterized.run_all_keras_modes(always_skip_v1=True, - skip_keras_tensors=True) +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class LinearModelTest(keras_parameterized.TestCase): def test_linear_model_with_single_input(self): diff --git a/tensorflow/python/keras/premade/wide_deep_test.py b/tensorflow/python/keras/premade/wide_deep_test.py index eae28c31df8..591b53e9a84 100644 --- a/tensorflow/python/keras/premade/wide_deep_test.py +++ b/tensorflow/python/keras/premade/wide_deep_test.py @@ -37,8 +37,7 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import test 
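# [Editor's note] Illustration only, not part of any patch above. The
# "fix kokoro build :deprecated message" change further up replaces
# collections.Mapping with collections_abc.Mapping. The abstract-base-class
# aliases in the top-level collections module were deprecated since Python 3.3
# and removed in Python 3.10, so isinstance checks should go through
# collections.abc. A minimal sketch (the helper name is hypothetical):

import collections.abc


def is_mapping(value):
    """Returns True if `value` supports the Mapping ABC (dict-like)."""
    return isinstance(value, collections.abc.Mapping)


assert is_mapping({"a": 1})
assert not is_mapping([("a", 1)])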
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True, - skip_keras_tensors=True) +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class WideDeepModelTest(keras_parameterized.TestCase): def test_wide_deep_model(self): diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py index c2c2e6c4a01..b10218ba114 100644 --- a/tensorflow/python/keras/regularizers_test.py +++ b/tensorflow/python/keras/regularizers_test.py @@ -83,7 +83,7 @@ class KerasRegularizersTest(keras_parameterized.TestCase, self.assertEqual(len(model.losses), 1) model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) - @keras_parameterized.run_all_keras_modes(skip_keras_tensors=True) + @keras_parameterized.run_all_keras_modes @parameterized.named_parameters([ ('l1', regularizers.l1()), ('l2', regularizers.l2()), @@ -126,7 +126,7 @@ class KerasRegularizersTest(keras_parameterized.TestCase, model.get_config(), custom_objects={'my_regularizer': my_regularizer}) self.assertEqual(model2.layers[1].kernel_regularizer, my_regularizer) - @keras_parameterized.run_all_keras_modes(skip_keras_tensors=True) + @keras_parameterized.run_all_keras_modes @parameterized.named_parameters([ ('l1', regularizers.l1()), ('l2', regularizers.l2()), @@ -144,7 +144,7 @@ class KerasRegularizersTest(keras_parameterized.TestCase, run_eagerly=testing_utils.should_run_eagerly()) self.assertLen(model.losses, 5) - @keras_parameterized.run_all_keras_modes(skip_keras_tensors=True) + @keras_parameterized.run_all_keras_modes @parameterized.named_parameters([ ('l1', regularizers.l1()), ('l2', regularizers.l2()), @@ -166,7 +166,7 @@ class KerasRegularizersTest(keras_parameterized.TestCase, run_eagerly=testing_utils.should_run_eagerly()) self.assertLen(model.losses, 6) - @keras_parameterized.run_all_keras_modes(skip_keras_tensors=True) + @keras_parameterized.run_all_keras_modes @parameterized.named_parameters([ ('l1', regularizers.l1()), ('l2', regularizers.l2()), From 4c004feb3e9b08961d2e3e17639b30104800efd5 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Mon, 15 Jun 2020 21:37:05 +0000 Subject: [PATCH 0208/1390] segragation attempt 4 --- tensorflow/core/framework/tensor_shape.cc | 2 +- tensorflow/core/lib/io/random_inputstream.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc | 6 +++--- tensorflow/core/lib/io/zlib_outputbuffer.cc | 6 +++--- tensorflow/core/platform/env.cc | 2 +- tensorflow/core/platform/file_system.cc | 2 +- tensorflow/core/platform/file_system_helper.cc | 2 +- tensorflow/core/platform/status.cc | 4 +++- tensorflow/core/profiler/internal/parse_annotation.cc | 2 +- tensorflow/core/public/version.h | 2 +- 11 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc index f4b440e9cd1..8040a316a45 100644 --- a/tensorflow/core/framework/tensor_shape.cc +++ b/tensorflow/core/framework/tensor_shape.cc @@ -182,7 +182,7 @@ void TensorShapeBase::InitDims(gtl::ArraySlice dim_sizes) { // Allow sizes that are under kint64max^0.25 so that 4-way multiplication // below cannot overflow. 
- static const uint64 kMaxSmall = 0xd744; + static const int64 kMaxSmall = 0xd744; static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max, "bad overflow check"); bool large_size = false; diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc index 10f734a5bae..bd0054ce753 100644 --- a/tensorflow/core/lib/io/random_inputstream.cc +++ b/tensorflow/core/lib/io/random_inputstream.cc @@ -92,7 +92,7 @@ Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) { } else { return s; } - if (data.size() < bytes_to_read) { + if (data.size() < static_cast(bytes_to_read)) { return errors::OutOfRange("reached end of file"); } bytes_to_skip -= bytes_to_read; diff --git a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc index a331d4173cf..53939f2d8a3 100644 --- a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc @@ -134,7 +134,7 @@ Status SnappyInputBuffer::ReadCompressedBlockLength(uint32* length) { } size_t readable = std::min(bytes_to_read, avail_in_); - for (int i = 0; i < readable; i++) { + for (size_t i = 0; i < readable; i++) { // The "unsigned char" type cast is intentional to avoid implicit type // casting of the signed char to unsigned int during bitwise OR which // causes weird overflow errors. diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc index 563503a1319..fe3a53c6c25 100644 --- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc @@ -76,7 +76,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { // If there is sufficient free space in input_buffer_ to fit data we // add it there and return. - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -87,7 +87,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered()); // input_buffer_ should be empty at this point. 
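// [Editor's note] Illustration only, not part of any patch above. The
// tensor_shape.cc hunk at the start of this patch keeps kMaxSmall = 0xd744
// because 0xd744 is 55108 and 55108^4 is roughly 9.2227e18, just under
// kint64max (about 9.2234e18), so the four-way product in the static_assert
// cannot overflow a signed 64-bit integer. A standalone check of the same
// arithmetic:

#include <cstdint>

static_assert(INT64_C(55108) * 55108 * 55108 * 55108 <= INT64_MAX,
              "0xd744^4 must fit in int64");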
- if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -144,7 +144,7 @@ void SnappyOutputBuffer::AddToInputBuffer(StringPiece data) { const int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (bytes_to_write > free_tail_bytes) { + if (static_cast(bytes_to_write) > free_tail_bytes) { memmove(input_buffer_.get(), next_in_, avail_in_); next_in_ = input_buffer_.get(); } diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc index 5840ca60242..d475d0eaa5c 100644 --- a/tensorflow/core/lib/io/zlib_outputbuffer.cc +++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc @@ -98,7 +98,7 @@ void ZlibOutputBuffer::AddToInputBuffer(StringPiece data) { int32 unread_bytes = z_stream_->avail_in; int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (bytes_to_write > free_tail_bytes) { + if (static_cast(bytes_to_write) > free_tail_bytes) { memmove(z_stream_input_.get(), z_stream_->next_in, z_stream_->avail_in); z_stream_->next_in = z_stream_input_.get(); } @@ -154,7 +154,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { size_t bytes_to_write = data.size(); - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -162,7 +162,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered(zlib_options_.flush_mode)); // At this point input stream should be empty. - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index b29cad05459..05d95ba0425 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -214,7 +214,7 @@ bool Env::FilesExist(const std::vector& files, } if (fs_status) { result &= fs_result; - for (int i = 0; i < itr.second.size(); ++i) { + for (size_t i = 0; i < itr.second.size(); ++i) { per_file_status[itr.second[i]] = fs_status->at(i); } } else if (!fs_result) { diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 9e96ceedbdc..c9657e2339f 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -308,7 +308,7 @@ StringPiece FileSystem::Basename(StringPiece path) const { StringPiece FileSystem::Extension(StringPiece path) const { StringPiece basename = this->Basename(path); - int pos = basename.rfind('.'); + size_t pos = basename.rfind('.'); if (pos == StringPiece::npos) { return StringPiece(path.data() + path.size(), 0); } else { diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc index 64b175c4d17..909752389e1 100644 --- a/tensorflow/core/platform/file_system_helper.cc +++ b/tensorflow/core/platform/file_system_helper.cc @@ -103,7 +103,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern, children_dir_status[i] = fs->IsDirectory(child_path); } }); - for (int i = 0; i < children.size(); ++i) { + for (size_t i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. 
if (children_dir_status[i].code() == tensorflow::error::CANCELLED) { diff --git a/tensorflow/core/platform/status.cc b/tensorflow/core/platform/status.cc index 756b8314148..e303c18091c 100644 --- a/tensorflow/core/platform/status.cc +++ b/tensorflow/core/platform/status.cc @@ -74,7 +74,9 @@ class StatusLogSink : public TFLogSink { mutex_lock lock(mu_); messages_.emplace_back(entry.ToString()); - if (messages_.size() > num_messages_) messages_.pop_front(); + if (messages_.size() > static_cast(num_messages_)){ + messages_.pop_front(); + } } private: diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index 32c26befa3d..a4cdc09739d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -50,7 +50,7 @@ std::vector SplitNameAndMetadata( std::vector SplitPairs(absl::string_view metadata) { std::vector key_value_pairs; std::stack quotes; - int start = 0, end = 0; + size_t start = 0, end = 0; for (; end < metadata.size(); ++end) { char ch = metadata[end]; switch (ch) { diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 9db20363349..3724f06ba4b 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 408 // Updated: 2020/5/21 +#define TF_GRAPH_DEF_VERSION 409 // Updated: 2020/5/22 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 813bd968d0c33c7eec0f6839ce372abc160e8e24 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 14:24:42 -0700 Subject: [PATCH 0209/1390] Choosing better convolution for Intel GPUs. 
PiperOrigin-RevId: 316544335 Change-Id: Idf9d653eebf4cff8195ab9ca3b3e34b1a8959c1f --- .../delegates/gpu/cl/kernels/conv_powervr.cc | 20 +++++++++++++++++++ .../gpu/cl/selectors/convolution_selector.cc | 2 ++ 2 files changed, 22 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index 9bb52b3e9c2..184e070202a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -802,6 +802,26 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( conv_params.fixed_work_group_size = false; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; + } else if (device.IsIntel()) { + conv_params.block_size = int3(1, 1, 4); + conv_params.work_group_size = int3(8, 2, 1); + conv_params.work_group_launch_order = int3(0, 1, 2); + conv_params.fixed_work_group_size = true; + conv_params.src_depth_loop_size = 1; + conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; + if (dst_depth % 4 == 0 || dst_depth >= 8) { + conv_params.block_size.z = 4; + } else if (dst_depth % 2 == 0 || dst_depth >= 4) { + conv_params.block_size.z = 2; + } else { + conv_params.block_size.z = dst_depth; + } + if (src_depth % 2 == 0) { + conv_params.src_depth_loop_size = 2; + } + if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) { + conv_params.src_depth_loop_size = 4; + } } else { conv_params.block_size = int3(1, 1, 4); conv_params.work_group_size = int3(8, 2, 1); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc index dc34dd7faee..3841c415301 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc @@ -170,6 +170,7 @@ absl::Status SelectConvolution(const Convolution2DAttributes& attr, return SelectConvolutionAdreno(attr, dst_shape, creation_context, op_def, hints, ptr); case Vendor::POWERVR: + case Vendor::INTEL: case Vendor::AMD: return SelectConvolutionPowerVR(attr, creation_context, op_def, ptr); case Vendor::NVIDIA: @@ -193,6 +194,7 @@ absl::Status SelectConvolutionForWinograd( op_def, hints, ptr); case Vendor::POWERVR: case Vendor::AMD: + case Vendor::INTEL: case Vendor::NVIDIA: { ConvPowerVR conv; RETURN_IF_ERROR( From 92bd07fee5541e941b7ea3da9bc90469b0dcf049 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 14:26:51 -0700 Subject: [PATCH 0210/1390] Polish some comments in dot decomposer. PiperOrigin-RevId: 316544755 Change-Id: I46ce48dcbf64119e4795b923f5b45b814e8bb8c7 --- .../compiler/xla/service/dot_decomposer.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc index 40354dec3c6..573b82a1e0f 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer.cc @@ -29,10 +29,12 @@ namespace xla { namespace { -// Convert a dot into a canonical form where non-contracting and contracting -// dimensions are reshaped together and batch dimensions are the most major -// dimensions. This requires transposing and reshapes of the lhs and rhs and -// reshaping the output batch to the original shape. 
+// Convert a dot into a canonical form; +// * Non-contracting dimensions are reshaped together, +// * Contracting dimensions are reshaped together, +// * Batch dimensions are the most major dimensions. +// This requires transposing and reshaping of the lhs and rhs, and reshaping the +// output batch to the original shape. Status CanonicalizeDot(HloInstruction* original_dot) { auto computation = original_dot->parent(); const auto& original_dnums = original_dot->dot_dimension_numbers(); @@ -63,7 +65,8 @@ Status CanonicalizeDot(HloInstruction* original_dot) { } } // The canonical form of the lhs is - // [BatchDims, NonContractingDims, ContractingsDims] + // [BatchDims, NonContractingDimsProduct, ContractingsDimsProduct] + // If NonContractingDimsProduct is 1, it is omitted. std::vector lhs_transpose; lhs_transpose.reserve(lhs_rank); lhs_transpose.insert(lhs_transpose.end(), @@ -109,7 +112,8 @@ Status CanonicalizeDot(HloInstruction* original_dot) { } // The canonical form of the rhs is - // [BatchDims, ContractingsDims, NonContractingDims] + // [BatchDims, NonContractingDimsProduct, ContractingsDimsProduct] + // If NonContractingDimsProduct is 1, it is omitted. std::vector rhs_transpose; rhs_transpose.reserve(rhs_rank); rhs_transpose.insert(rhs_transpose.end(), From 0096d0a19b5543b368a5d2426cb2810931913272 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Mon, 15 Jun 2020 21:49:18 +0000 Subject: [PATCH 0211/1390] final segratation --- tensorflow/compiler/xla/window_util.cc | 2 +- tensorflow/core/framework/tensor_shape.cc | 2 +- tensorflow/core/lib/io/random_inputstream.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc | 6 +++--- tensorflow/core/lib/io/zlib_outputbuffer.cc | 6 +++--- tensorflow/core/platform/env.cc | 2 +- tensorflow/core/platform/file_system.cc | 2 +- tensorflow/core/platform/file_system_helper.cc | 2 +- tensorflow/core/platform/status.cc | 4 +--- tensorflow/core/profiler/internal/parse_annotation.cc | 2 +- 11 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index e33d0b6d1dc..a58179c3ee0 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -42,7 +42,7 @@ Window MakeWindow(absl::Span sizes, absl::Span strides) { Window window; CHECK_EQ(sizes.size(), strides.size()); - for (auto nb = 0; static_cast(nb) < sizes.size(); ++nb) { + for (auto nb = 0; nb < sizes.size(); ++nb) { auto* dimension = window.add_dimensions(); dimension->set_size(sizes[nb]); dimension->set_stride(strides[nb]); diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc index 8040a316a45..f4b440e9cd1 100644 --- a/tensorflow/core/framework/tensor_shape.cc +++ b/tensorflow/core/framework/tensor_shape.cc @@ -182,7 +182,7 @@ void TensorShapeBase::InitDims(gtl::ArraySlice dim_sizes) { // Allow sizes that are under kint64max^0.25 so that 4-way multiplication // below cannot overflow. 
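// [Editor's note] Illustration only, not part of any patch above. The
// rewritten DotDecomposer comment earlier in this chunk collapses each dot
// operand toward [BatchDims, NonContractingDimsProduct,
// ContractingDimsProduct]. As a hypothetical worked example, an lhs of shape
// {8, 16, 32, 4} with batch dimension 0 and contracting dimension 3 becomes
// {8, 512, 4}, since 16 * 32 = 512. A simplified sketch of that bookkeeping
// (it ignores the transpose step and assumes batch dimensions are already
// leading):

#include <cstdint>
#include <vector>

std::vector<int64_t> CanonicalShape(const std::vector<int64_t>& dims,
                                    const std::vector<int>& batch_dims,
                                    const std::vector<int>& contracting_dims) {
  auto contains = [](const std::vector<int>& v, int x) {
    for (int e : v) {
      if (e == x) return true;
    }
    return false;
  };
  std::vector<int64_t> shape;
  int64_t non_contracting = 1, contracting = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    if (contains(batch_dims, i)) {
      shape.push_back(dims[i]);  // batch dims stay as the most major dims
    } else if (contains(contracting_dims, i)) {
      contracting *= dims[i];
    } else {
      non_contracting *= dims[i];
    }
  }
  shape.push_back(non_contracting);
  shape.push_back(contracting);
  return shape;
}

// CanonicalShape({8, 16, 32, 4}, {0}, {3}) == {8, 512, 4}.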
- static const int64 kMaxSmall = 0xd744; + static const uint64 kMaxSmall = 0xd744; static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max, "bad overflow check"); bool large_size = false; diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc index bd0054ce753..10f734a5bae 100644 --- a/tensorflow/core/lib/io/random_inputstream.cc +++ b/tensorflow/core/lib/io/random_inputstream.cc @@ -92,7 +92,7 @@ Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) { } else { return s; } - if (data.size() < static_cast(bytes_to_read)) { + if (data.size() < bytes_to_read) { return errors::OutOfRange("reached end of file"); } bytes_to_skip -= bytes_to_read; diff --git a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc index 53939f2d8a3..a331d4173cf 100644 --- a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc @@ -134,7 +134,7 @@ Status SnappyInputBuffer::ReadCompressedBlockLength(uint32* length) { } size_t readable = std::min(bytes_to_read, avail_in_); - for (size_t i = 0; i < readable; i++) { + for (int i = 0; i < readable; i++) { // The "unsigned char" type cast is intentional to avoid implicit type // casting of the signed char to unsigned int during bitwise OR which // causes weird overflow errors. diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc index fe3a53c6c25..563503a1319 100644 --- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc @@ -76,7 +76,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { // If there is sufficient free space in input_buffer_ to fit data we // add it there and return. - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -87,7 +87,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered()); // input_buffer_ should be empty at this point. 
- if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -144,7 +144,7 @@ void SnappyOutputBuffer::AddToInputBuffer(StringPiece data) { const int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (static_cast(bytes_to_write) > free_tail_bytes) { + if (bytes_to_write > free_tail_bytes) { memmove(input_buffer_.get(), next_in_, avail_in_); next_in_ = input_buffer_.get(); } diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc index d475d0eaa5c..5840ca60242 100644 --- a/tensorflow/core/lib/io/zlib_outputbuffer.cc +++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc @@ -98,7 +98,7 @@ void ZlibOutputBuffer::AddToInputBuffer(StringPiece data) { int32 unread_bytes = z_stream_->avail_in; int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (static_cast(bytes_to_write) > free_tail_bytes) { + if (bytes_to_write > free_tail_bytes) { memmove(z_stream_input_.get(), z_stream_->next_in, z_stream_->avail_in); z_stream_->next_in = z_stream_input_.get(); } @@ -154,7 +154,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { size_t bytes_to_write = data.size(); - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -162,7 +162,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered(zlib_options_.flush_mode)); // At this point input stream should be empty. - if (static_cast(bytes_to_write) <= AvailableInputSpace()) { + if (bytes_to_write <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 05d95ba0425..b29cad05459 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -214,7 +214,7 @@ bool Env::FilesExist(const std::vector& files, } if (fs_status) { result &= fs_result; - for (size_t i = 0; i < itr.second.size(); ++i) { + for (int i = 0; i < itr.second.size(); ++i) { per_file_status[itr.second[i]] = fs_status->at(i); } } else if (!fs_result) { diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index c9657e2339f..9e96ceedbdc 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -308,7 +308,7 @@ StringPiece FileSystem::Basename(StringPiece path) const { StringPiece FileSystem::Extension(StringPiece path) const { StringPiece basename = this->Basename(path); - size_t pos = basename.rfind('.'); + int pos = basename.rfind('.'); if (pos == StringPiece::npos) { return StringPiece(path.data() + path.size(), 0); } else { diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc index 909752389e1..64b175c4d17 100644 --- a/tensorflow/core/platform/file_system_helper.cc +++ b/tensorflow/core/platform/file_system_helper.cc @@ -103,7 +103,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern, children_dir_status[i] = fs->IsDirectory(child_path); } }); - for (size_t i = 0; i < children.size(); ++i) { + for (int i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. 
if (children_dir_status[i].code() == tensorflow::error::CANCELLED) { diff --git a/tensorflow/core/platform/status.cc b/tensorflow/core/platform/status.cc index e303c18091c..756b8314148 100644 --- a/tensorflow/core/platform/status.cc +++ b/tensorflow/core/platform/status.cc @@ -74,9 +74,7 @@ class StatusLogSink : public TFLogSink { mutex_lock lock(mu_); messages_.emplace_back(entry.ToString()); - if (messages_.size() > static_cast(num_messages_)){ - messages_.pop_front(); - } + if (messages_.size() > num_messages_) messages_.pop_front(); } private: diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index a4cdc09739d..32c26befa3d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -50,7 +50,7 @@ std::vector SplitNameAndMetadata( std::vector SplitPairs(absl::string_view metadata) { std::vector key_value_pairs; std::stack quotes; - size_t start = 0, end = 0; + int start = 0, end = 0; for (; end < metadata.size(); ++end) { char ch = metadata[end]; switch (ch) { From 176ab11d0a9211fe9f8da0a7e6c75381c7ca26e8 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 14:54:50 -0700 Subject: [PATCH 0212/1390] Added handling of Intel in choosing best storage types. PiperOrigin-RevId: 316550810 Change-Id: I7e81cd0df1522be4b705df57a5658397328b5a18 --- tensorflow/lite/delegates/gpu/cl/environment.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/cl/environment.cc b/tensorflow/lite/delegates/gpu/cl/environment.cc index 01d034fb1f7..6b6ab84f148 100644 --- a/tensorflow/lite/delegates/gpu/cl/environment.cc +++ b/tensorflow/lite/delegates/gpu/cl/environment.cc @@ -242,6 +242,8 @@ TensorStorageType GetFastestStorageType(const CLDevice& gpu) { } else if (gpu.IsAMD()) { return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER : TensorStorageType::BUFFER; + } else if (gpu.IsIntel()) { + return TensorStorageType::BUFFER; } return TensorStorageType::BUFFER; } @@ -264,6 +266,8 @@ TensorStorageType GetStorageTypeWithMinimalMemoryConsumption( } else if (gpu.IsAMD()) { return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER : TensorStorageType::BUFFER; + } else if (gpu.IsIntel()) { + return TensorStorageType::BUFFER; } return TensorStorageType::BUFFER; } From b59b2a10b16e17dcf439bf973b7bbd5da65d3c25 Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Mon, 15 Jun 2020 15:08:04 -0700 Subject: [PATCH 0213/1390] [NFC] Add comment to specify status of FusedKernelMatcher pass. We're porting over this pass based on usage/need, so I'm updating the comments to make this clear. PiperOrigin-RevId: 316553704 Change-Id: I5db9a4637edd3bcb27b4db6259b7113ae0f8dd0c --- .../tensorflow/transforms/fused_kernel_matcher.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 3ccdd957798..4b10550df7b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -32,11 +32,15 @@ namespace TF { namespace { -// Note: This implements fusions performed in the old Remapper Grappler pass. -// That pass has specific cases for GPU and based on different target -// configurations on both CPU and GPU (Intel MKL, ROCm, etc.). 
This MLIR pass -// covers the general CPU case and at the moment does not account for any -// target-specific configurations. +// Note: This implements the fusions performed in the old Remapper Grappler +// pass. That pass has specific cases for GPU and based on different +// target configurations on both CPU and GPU (Intel MKL, ROCm, etc.). This MLIR +// pass covers (some of) the general CPU case and at the moment does not account +// for any target-specific configurations. + +// This pass is being ported over from the Grappler Remapper pass based on +// need/usage. File a bug to request porting over additional fusions. + // TODO(b/158265178): Support GPU-specific fusions. // TODO(b/158266710): Support CPU MKL configurations. From 1b412edc891b0d2cf98dbd2291d871d5ee67a0fb Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Mon, 15 Jun 2020 15:09:29 -0700 Subject: [PATCH 0214/1390] Create the script to generate TFLite Java API documentation. PiperOrigin-RevId: 316553983 Change-Id: I08888ba83f5335d38ba588ed632c2471dcfa2d9f --- tensorflow/lite/g3doc/tools/BUILD | 11 ++++ .../lite/g3doc/tools/build_java_api_docs.py | 66 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 tensorflow/lite/g3doc/tools/build_java_api_docs.py diff --git a/tensorflow/lite/g3doc/tools/BUILD b/tensorflow/lite/g3doc/tools/BUILD index f2c6d8efedc..5c891a67128 100644 --- a/tensorflow/lite/g3doc/tools/BUILD +++ b/tensorflow/lite/g3doc/tools/BUILD @@ -13,3 +13,14 @@ py_binary( "@absl_py//absl/flags", ], ) + +py_binary( + name = "build_java_api_docs", + srcs = ["build_java_api_docs.py"], + python_version = "PY3", + srcs_version = "PY3", + deps = [ + "@absl_py//absl:app", + "@absl_py//absl/flags", + ], +) diff --git a/tensorflow/lite/g3doc/tools/build_java_api_docs.py b/tensorflow/lite/g3doc/tools/build_java_api_docs.py new file mode 100644 index 00000000000..9c598ad8ec5 --- /dev/null +++ b/tensorflow/lite/g3doc/tools/build_java_api_docs.py @@ -0,0 +1,66 @@ +# Lint as: python3 +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generate TensorFlow Lite Java reference docs for TensorFlow.org.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pathlib +import shutil +import tempfile + +from absl import app +from absl import flags + +from tensorflow_docs.api_generator import gen_java + +FLAGS = flags.FLAGS + +# These flags are required by infrastructure, not all of them are used. 
+flags.DEFINE_string('output_dir', '/tmp/lite_api/', + ("Use this branch as the root version and don't" + ' create in version directory')) + +flags.DEFINE_string('site_path', 'lite/api_docs/java', + 'Path prefix in the _toc.yaml') + +flags.DEFINE_string('code_url_prefix', None, + '[UNUSED] The url prefix for links to code.') + +flags.DEFINE_bool( + 'search_hints', True, + '[UNUSED] Include metadata search hints in the generated files') + +# __file__ is the path to this file +DOCS_TOOLS_DIR = pathlib.Path(__file__).resolve().parent +TENSORFLOW_ROOT = DOCS_TOOLS_DIR.parents[3] +SOURCE_PATH = TENSORFLOW_ROOT / 'tensorflow/lite/java/src/main/java/' + + +def main(unused_argv): + merged_source = pathlib.Path(tempfile.mkdtemp()) + shutil.copytree(SOURCE_PATH, merged_source / 'java') + + gen_java.gen_java_docs( + package='org.tensorflow.lite', + source_path=merged_source / 'java', + output_dir=pathlib.Path(FLAGS.output_dir), + site_path=pathlib.Path(FLAGS.site_path)) + + +if __name__ == '__main__': + flags.mark_flags_as_required(['output_dir']) + app.run(main) From 9136f5775e29cd0540fd594585687703849e64ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 15:13:45 -0700 Subject: [PATCH 0215/1390] [XLA] Refactor memory_space_assignment.cc. Refactors IsIntervalAllowedInAlternateMemory() to a separate utils file so that it can be reused. PiperOrigin-RevId: 316554798 Change-Id: Ibc6a4cffde6a1df233d375358164b373ea4ee7a6 --- tensorflow/compiler/xla/service/BUILD | 10 ++ .../xla/service/memory_space_assignment.cc | 82 +--------------- .../xla/service/memory_space_assignment.h | 4 - .../service/memory_space_assignment_utils.cc | 95 +++++++++++++++++++ .../service/memory_space_assignment_utils.h | 34 +++++++ 5 files changed, 143 insertions(+), 82 deletions(-) create mode 100644 tensorflow/compiler/xla/service/memory_space_assignment_utils.cc create mode 100644 tensorflow/compiler/xla/service/memory_space_assignment_utils.h diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 40c237c5e6d..acd35cbc153 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -3304,6 +3304,15 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_assignment_utils", + srcs = ["memory_space_assignment_utils.cc"], + hdrs = ["memory_space_assignment_utils.h"], + deps = [ + ":heap_simulator", + ], +) + cc_library( name = "memory_space_assignment", srcs = ["memory_space_assignment.cc"], @@ -3311,6 +3320,7 @@ cc_library( deps = [ ":heap_simulator", ":hlo_cost_analysis", + ":memory_space_assignment_utils", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/core/lib/math:math_util", ], diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 21baaf1c7d5..388a2e18f38 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/memory_space_assignment.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_utils.h" #include "tensorflow/core/lib/math/math_util.h" namespace xla { @@ -597,81 +598,6 @@ AlternateMemoryBestFitHeap::GetSortedColocatedIntervals( return colocated_intervals; } -bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( - const BufferInterval& interval) const { - // If the buffer is a tuple, don't use this algorithm for now. The buffers - // that are pointed to by the tuple will still use this algorithm. Because - // tuples are cheap to place in the alternate memory (they are just pointers) - // we don't need to use prefetch/evict logic. - if (interval.buffer->shape().IsTuple()) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a tuple."; - return false; - } - - // Don't place scalars in the alternate memory. - if (ShapeUtil::IsEffectiveScalar(interval.buffer->shape())) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a scalar."; - return false; - } - - // The semantics of TupleSelect are weird: TupleSelect doesn't define a - // buffer, but just forwards the buffers in the either left or right side. - // This means the two different inputs to TupleSelect must not alias, yet they - // should be allocated in the same memory space, and both buffers must be kept - // alive for the entire live range of TupleSelect. Instead, just don't - // allocate TupleSelect in the alternate memory space. - // TODO(berkin): Not allocating add-dependencies either since they need to be - // treated specially. We should revisit this later. - for (const HloPosition& position : interval.buffer->positions()) { - if (position.instruction->opcode() == HloOpcode::kTupleSelect || - position.instruction->opcode() == HloOpcode::kAddDependency) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it has a tuple-select or " - << "add-dependency position."; - return false; - } - } - - // Send and Recv HLOs return a request identifier. These should not be - // allocated in the alternate memory. - for (const HloPosition& position : interval.buffer->positions()) { - if ((position.instruction->opcode() == HloOpcode::kSend || - position.instruction->opcode() == HloOpcode::kRecv)) { - // TODO(berkin): Send/recv buffers need a stable buffer allocation - // throughout sending/receiving. Disable memory space allocation for these - // for now. - if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a send/recv buffer."; - return false; - } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a request identifier for " - "send/recv."; - return false; - } - } - - if ((position.instruction->opcode() == HloOpcode::kCollectivePermuteStart || - position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { - // Disable memory space allocation for these for now. 
- if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a collective-permute buffer."; - return false; - } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() - << " in default mem because it is a collective-permute buffer."; - return false; - } - } - } - - return true; -} - bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( const AllocationValue& value, const HloUse& use) const { const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); @@ -710,8 +636,7 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( if (!options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( shape, parameter_time, min_use_time)) { VLOG(4) << "While allocation not allowed in alternate memory. " - << "use time = " << min_use_time - << ", root time = " << root_time; + << "use time = " << min_use_time << ", root time = " << root_time; return false; } // Check if there is a required assignment for the while loop output. @@ -897,7 +822,8 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { continue; } - if (!IsIntervalAllowedInAlternateMemory(interval)) { + if (!MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( + interval)) { continue; } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index b8f47e73b8c..f9e5738d17e 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -909,10 +909,6 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { static MemorySpaceAssignment::Allocation* GetLiveAllocationAt( const MemorySpaceAssignment::AllocationSequence& allocations, int64 time); - // Returns true if this buffer is allowed to be placed in the alternate - // memory. - bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; - // Returns true if the use is allowed in the alternate memory. bool IsUseAllowedInAlternateMemory(const AllocationValue& value, const HloUse& use) const; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc new file mode 100644 index 00000000000..0215f007c9c --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment_utils.h" + +namespace xla { + +bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { + // If the buffer is a tuple, don't use this algorithm for now. 
The buffers + // that are pointed to by the tuple will still use this algorithm. Because + // tuples are cheap to place in the alternate memory (they are just pointers) + // we don't need to use prefetch/evict logic. + if (interval.buffer->shape().IsTuple()) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a tuple."; + return false; + } + + // Don't place scalars in the alternate memory. + if (ShapeUtil::IsEffectiveScalar(interval.buffer->shape())) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a scalar."; + return false; + } + + // The semantics of TupleSelect are weird: TupleSelect doesn't define a + // buffer, but just forwards the buffers in the either left or right side. + // This means the two different inputs to TupleSelect must not alias, yet they + // should be allocated in the same memory space, and both buffers must be kept + // alive for the entire live range of TupleSelect. Instead, just don't + // allocate TupleSelect in the alternate memory space. + // TODO(berkin): Not allocating add-dependencies either since they need to be + // treated specially. We should revisit this later. + for (const HloPosition& position : interval.buffer->positions()) { + if (position.instruction->opcode() == HloOpcode::kTupleSelect || + position.instruction->opcode() == HloOpcode::kAddDependency) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it has a tuple-select or " + << "add-dependency position."; + return false; + } + } + + // Send and Recv HLOs return a request identifier. These should not be + // allocated in the alternate memory. + for (const HloPosition& position : interval.buffer->positions()) { + if ((position.instruction->opcode() == HloOpcode::kSend || + position.instruction->opcode() == HloOpcode::kRecv)) { + // TODO(berkin): Send/recv buffers need a stable buffer allocation + // throughout sending/receiving. Disable memory space allocation for these + // for now. + if (position.index == ShapeIndex({0})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a send/recv buffer."; + return false; + } else if (position.index == ShapeIndex({1})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a request identifier for " + "send/recv."; + return false; + } + } + + if ((position.instruction->opcode() == HloOpcode::kCollectivePermuteStart || + position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { + // Disable memory space allocation for these for now. + if (position.index == ShapeIndex({0})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } else if (position.index == ShapeIndex({1})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } + } + } + + return true; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.h b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h new file mode 100644 index 00000000000..651ac107c25 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" + +namespace xla { + +// Encapsulates common utility methods for memory space assignment. +class MemorySpaceAssignmentUtils { + public: + // Returns true if this buffer is allowed to be placed in the alternate + // memory. + static bool IsIntervalAllowedInAlternateMemory( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_ From 072c2f5d0dead92e2c41ef168986e927481db463 Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Mon, 15 Jun 2020 15:22:07 -0700 Subject: [PATCH 0216/1390] Reduce excessive RAM in TFLM by using the existing flatbuffer quantization data for scales. Currently, TFLM manually allocates a tail chunk to store "quantization" tensor data on TfLiteTensor objects. The size of these allocations vary based on the type of model - conv1d/2d models tend to be rich since quantization data is stored "per channel". This change simply points the scale data at the existing value in the flatbuffer. The flatbuffer schema stores float values as flatbuffers::Vector and the TfLiteAffineQuantization struct can point the scale pointer at these values. Unfortunately, the zero point values are stored as flatbuffers::Vector and can not be reused. This allocation will be addressed in a future change. 
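A minimal sketch of the reuse, assuming the flatbuffer float vector shares
TfLiteFloatArray's layout of an int32 length followed by the float payload
(the TODOs in the diff flag that big-endian targets still need a byte-swapping
path):

  // Point the runtime scale array at the flatbuffer data instead of copying
  // it onto the arena tail.
  quantization->scale = const_cast<TfLiteFloatArray*>(
      reinterpret_cast<const TfLiteFloatArray*>(src_quantization->scale()));

  // Zero points stay int64 in the flatbuffer, so they are still narrowed into
  // a freshly allocated TfLiteIntArray.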
Keyword Model ~2% reduction in tail allocation: ----------------------------------------------- [RecordingMicroAllocator] Arena allocation total 21040 bytes [RecordingMicroAllocator] Arena allocation head 672 bytes [RecordingMicroAllocator] Arena allocation tail 20368 bytes [RecordingMicroAllocator] 'TfLiteTensor struct' used 6048 bytes with alignment overhead (requested 6048 bytes for 54 tensors) [RecordingMicroAllocator] 'TfLiteTensor quantization data' used 1728 bytes with alignment overhead (requested 1728 bytes for 108 allocations) [RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 10240 bytes with alignment overhead (requested 10240 bytes for 7 allocations) [RecordingMicroAllocator] 'NodeAndRegistration struct' used 1200 bytes with alignment overhead (requested 1200 bytes for 15 NodeAndRegistration structs) [RecordingMicroAllocator] 'Operator runtime data' used 148 bytes with alignment overhead (requested 148 bytes for 13 OpData structs) Test Conv Model ~10% reduction in tail allocation: ----------------------------------------------- [RecordingMicroAllocator] Arena allocation total 11680 bytes [RecordingMicroAllocator] Arena allocation head 7744 bytes [RecordingMicroAllocator] Arena allocation tail 3936 bytes [RecordingMicroAllocator] 'TfLiteTensor struct' used 1680 bytes with alignment overhead (requested 1680 bytes for 15 tensors) [RecordingMicroAllocator] 'TfLiteTensor quantization data' used 768 bytes with alignment overhead (requested 752 bytes for 24 allocations) [RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 0 bytes with alignment overhead (requested 0 bytes for 0 allocations) [RecordingMicroAllocator] 'NodeAndRegistration struct' used 560 bytes with alignment overhead (requested 560 bytes for 7 NodeAndRegistration structs) [RecordingMicroAllocator] 'Operator runtime data' used 136 bytes with alignment overhead (requested 136 bytes for 5 OpData structs) PiperOrigin-RevId: 316556393 Change-Id: Iadadab51019d2787d11af9713b3639f087afa7bc --- .../lite/micro/memory_arena_threshold_test.cc | 31 +++++++++++++------ tensorflow/lite/micro/micro_allocator.cc | 22 ++++++------- .../micro/recording_micro_allocator_test.cc | 6 ++-- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/tensorflow/lite/micro/memory_arena_threshold_test.cc b/tensorflow/lite/micro/memory_arena_threshold_test.cc index 19c3d0f1e06..58d3eff8df5 100644 --- a/tensorflow/lite/micro/memory_arena_threshold_test.cc +++ b/tensorflow/lite/micro/memory_arena_threshold_test.cc @@ -41,11 +41,11 @@ constexpr int kKeywordModelNodeAndRegistrationCount = 15; // NOTE: These values are measured on x86-64: // TODO(b/158651472): Consider auditing these values on non-64 bit systems. -constexpr int kKeywordModelTotalSize = 21472; +constexpr int kKeywordModelTotalSize = 21040; constexpr int kKeywordModelHeadSize = 672; -constexpr int kKeywordModelTailSize = 20800; +constexpr int kKeywordModelTailSize = 20368; constexpr int kKeywordModelTfLiteTensorVariableBufferDataSize = 10240; -constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 2160; +constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 1728; constexpr int kKeywordModelOpRuntimeDataSize = 148; constexpr int kTestConvModelArenaSize = 12 * 1024; @@ -56,10 +56,10 @@ constexpr int kTestConvModelNodeAndRegistrationCount = 7; // NOTE: These values are measured on x86-64: // TODO(b/158651472): Consider auditing these values on non-64 bit systems. 
-constexpr int kTestConvModelTotalSize = 12128; +constexpr int kTestConvModelTotalSize = 11680; constexpr int kTestConvModelHeadSize = 7744; -constexpr int kTestConvModelTailSize = 4384; -constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 1216; +constexpr int kTestConvModelTailSize = 3936; +constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 768; constexpr int kTestConvModelOpRuntimeDataSize = 136; struct ModelAllocationThresholds { @@ -73,11 +73,17 @@ struct ModelAllocationThresholds { size_t op_runtime_data_size = 0; }; -void EnsureAllocatedSizeThreshold(size_t actual, size_t expected) { +void EnsureAllocatedSizeThreshold(const char* allocation_type, size_t actual, + size_t expected) { // TODO(b/158651472): Better auditing of non-64 bit systems: if (kIs64BitSystem) { // 64-bit systems should check floor and ceiling to catch memory savings: TF_LITE_MICRO_EXPECT_NEAR(actual, expected, kAllocationThreshold); + if (actual != expected) { + TF_LITE_REPORT_ERROR(micro_test::reporter, + "%s threshold failed: %ld != %ld", allocation_type, + actual, expected); + } } else { // Non-64 bit systems should just expect allocation does not exceed the // ceiling: @@ -91,33 +97,37 @@ void ValidateModelAllocationThresholds( allocator.PrintAllocations(); EnsureAllocatedSizeThreshold( - allocator.GetSimpleMemoryAllocator()->GetUsedBytes(), + "Total", allocator.GetSimpleMemoryAllocator()->GetUsedBytes(), thresholds.total_alloc_size); EnsureAllocatedSizeThreshold( - allocator.GetSimpleMemoryAllocator()->GetHeadUsedBytes(), + "Head", allocator.GetSimpleMemoryAllocator()->GetHeadUsedBytes(), thresholds.head_alloc_size); EnsureAllocatedSizeThreshold( - allocator.GetSimpleMemoryAllocator()->GetTailUsedBytes(), + "Tail", allocator.GetSimpleMemoryAllocator()->GetTailUsedBytes(), thresholds.tail_alloc_size); EnsureAllocatedSizeThreshold( + "TfLiteTensor", allocator .GetRecordedAllocation( tflite::RecordedAllocationType::kTfLiteTensorArray) .used_bytes, sizeof(TfLiteTensor) * thresholds.tensor_count); EnsureAllocatedSizeThreshold( + "VariableBufferData", allocator .GetRecordedAllocation( tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData) .used_bytes, thresholds.tensor_variable_buffer_data_size); EnsureAllocatedSizeThreshold( + "QuantizationData", allocator .GetRecordedAllocation(tflite::RecordedAllocationType:: kTfLiteTensorArrayQuantizationData) .used_bytes, thresholds.tensor_quantization_data_size); EnsureAllocatedSizeThreshold( + "NodeAndRegistration", allocator .GetRecordedAllocation( tflite::RecordedAllocationType::kNodeAndRegistrationArray) @@ -125,6 +135,7 @@ void ValidateModelAllocationThresholds( sizeof(tflite::NodeAndRegistration) * thresholds.node_and_registration_count); EnsureAllocatedSizeThreshold( + "OpData", allocator.GetRecordedAllocation(tflite::RecordedAllocationType::kOpData) .used_bytes, thresholds.op_runtime_data_size); diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index bfe44cab73a..f3b64bc9f39 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -466,6 +466,8 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( TF_LITE_ENSURE_STATUS(BytesRequiredForTensor( flatbuffer_tensor, &result->bytes, &type_size, error_reporter)); + // TODO(b/159043126): Cleanup endian casting by doing all endian casting in + // one spot: if (flatbuffer_tensor.shape() == nullptr) { // flatbuffer_tensor.shape() can return a nullptr in the case of a scalar // tensor. 
@@ -513,6 +515,10 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( "Unable to allocate TfLiteAffineQuantization.\n"); return kTfLiteError; } + + // TODO(b/153688719): Reduce tail allocation by using a global zero-point + // buffer. This value can not be reused from the flatbuffer since the + // zero_point is stored as a int64_t. quantization->zero_point = reinterpret_cast(allocator->AllocateFromTail( TfLiteIntArrayGetSizeInBytes(channels), alignof(TfLiteIntArray))); @@ -522,22 +528,14 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( return kTfLiteError; } - quantization->scale = reinterpret_cast( - allocator->AllocateFromTail(TfLiteFloatArrayGetSizeInBytes(channels), - alignof(TfLiteFloatArray))); - if (quantization->scale == nullptr) { - TF_LITE_REPORT_ERROR(error_reporter, - "Unable to allocate quantization->scale.\n"); - return kTfLiteError; - } + // TODO(b/159043126): Check for big endian before casting flatbuffer values. + quantization->scale = const_cast( + reinterpret_cast(src_quantization->scale())); quantization->zero_point->size = channels; - quantization->scale->size = channels; int* zero_point_data = quantization->zero_point->data; - float* scale_data = quantization->scale->data; for (int i = 0; i < channels; i++) { zero_point_data[i] = src_quantization->zero_point()->Get(i); - scale_data[i] = src_quantization->scale()->Get(i); } // TODO(rocky): Need to add a micro_allocator test case that fails when // this is not copied: @@ -815,8 +813,10 @@ TfLiteStatus MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( } // Disregard const qualifier to workaround with existing API. + // TODO(b/159043126): Check for big endian before casting flatbuffer values. TfLiteIntArray* inputs_array = const_cast( reinterpret_cast(op->inputs())); + // TODO(b/159043126): Check for big endian before casting flatbuffer values. TfLiteIntArray* outputs_array = const_cast( reinterpret_cast(op->outputs())); diff --git a/tensorflow/lite/micro/recording_micro_allocator_test.cc b/tensorflow/lite/micro/recording_micro_allocator_test.cc index 9bbe0f405d4..775a2de2dfd 100644 --- a/tensorflow/lite/micro/recording_micro_allocator_test.cc +++ b/tensorflow/lite/micro/recording_micro_allocator_test.cc @@ -93,7 +93,6 @@ TF_LITE_MICRO_TEST(TestRecordsTensorArrayQuantizationData) { quantized_tensor_count++; size_t num_channels = quantization_params->scale()->size(); quantized_channel_bytes += TfLiteIntArrayGetSizeInBytes(num_channels); - quantized_channel_bytes += TfLiteFloatArrayGetSizeInBytes(num_channels); } } @@ -106,10 +105,9 @@ TF_LITE_MICRO_TEST(TestRecordsTensorArrayQuantizationData) { micro_allocator->GetRecordedAllocation( tflite::RecordedAllocationType::kTfLiteTensorArrayQuantizationData); - // Each quantized tensors has 3 mallocs (quant struct, scale dimensions, zero - // point dimensions): + // Each quantized tensors has 2 mallocs (quant struct, zero point dimensions): TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.count, - quantized_tensor_count * 3); + quantized_tensor_count * 2); TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.requested_bytes, expected_requested_bytes); TF_LITE_MICRO_EXPECT_GE(recorded_allocation.used_bytes, From 5d4c6e105f775cd60c376a28eb540b2d286c4605 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 15:23:29 -0700 Subject: [PATCH 0217/1390] Added new ways of weights uploading. 
PiperOrigin-RevId: 316556676 Change-Id: I343e4f6461a26a7d921699b23ee3ccf65ecb3bee --- .../delegates/gpu/cl/kernels/conv_powervr.cc | 101 +++++++++++++----- .../delegates/gpu/cl/kernels/conv_powervr.h | 5 + 2 files changed, 82 insertions(+), 24 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index 184e070202a..6ab22bf545d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -308,6 +308,41 @@ std::string GenerateConv( conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; + const int local_mem_size = + conv_params.block_size.z * 4 * conv_params.src_depth_loop_size; + + const bool use_simd_broadcast = + conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST || + conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST || + conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST || + conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST || + conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST; + + int simd_size = 1; + if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST) { + simd_size = 8; + } else if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST) { + simd_size = 16; + } else if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST) { + simd_size = 32; + } else if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST) { + simd_size = 64; + } else if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST) { + simd_size = 128; + } + + bool late_oob_check = need_local_mem || use_simd_broadcast; + const std::string weights_space = conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM @@ -328,6 +363,10 @@ std::string GenerateConv( std::to_string(work_group_size.y) + ", " + std::to_string(work_group_size.z) + ")))\n"; } + if (use_simd_broadcast && device.IsIntel()) { + c += "__attribute__((intel_reqd_work_group_size(" + + std::to_string(simd_size) + ")))\n"; + } c += "__kernel void main_function(\n"; c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; c += " " + weights_global_ptr + " filters_buffer, \n"; @@ -355,7 +394,7 @@ std::string GenerateConv( for (int y = 0; y < conv_params.block_size.y; ++y) { dst_y[y] = "(Y + " + std::to_string(y) + ")"; } - if (!need_local_mem) { + if (!late_oob_check) { c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) {\n"; c += " return;\n"; c += " }\n"; @@ -396,13 +435,8 @@ std::string GenerateConv( } if (need_local_mem) { c += " __local " + weights_data_type + " weights_cache[" + - std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) + - "];\n"; - } - if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::GLOBAL_MEM || - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::CONSTANT_MEM) { + std::to_string(local_mem_size) + "];\n"; + } else { c += " " + weights_global_ptr + " weights_cache;\n"; } if (is1x1) { @@ -521,9 +555,17 @@ std::string GenerateConv( for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < 
block_size.x; ++x) { std::string id = std::to_string(y) + std::to_string(x); - c += " r" + std::to_string(z) + id + " += weights_cache[" + - std::to_string(z * 4 + ch + shared_offset) + "] * src" + id + - "." + channels[ch] + ";\n"; + std::string w_val = "weights_cache[" + + std::to_string(z * 4 + ch + shared_offset) + + "]"; + if (use_simd_broadcast) { + int simd_id = (z * 4 + ch + shared_offset) / simd_size; + int thread_id = (z * 4 + ch + shared_offset) % simd_size; + w_val = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + + ", " + std::to_string(thread_id) + "u)"; + } + c += " r" + std::to_string(z) + id + " += " + w_val + + " * src" + id + "." + channels[ch] + ";\n"; } } } @@ -554,17 +596,30 @@ std::string GenerateConv( work_group_size.x * work_group_size.y * work_group_size.z; if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { - c += - GenerateAsyncUpload("weights_cache", "filters_loc", - /*global_offset_name*/ "", - block_size.z * 4 * conv_params.src_depth_loop_size); + c += GenerateAsyncUpload("weights_cache", "filters_loc", + /*global_offset_name*/ "", local_mem_size); } else if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) { c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += GenerateUploadByThreads( - "weights_cache", "filters_loc", - /*global_offset_name*/ "", "lid", total_work_items, - block_size.z * 4 * conv_params.src_depth_loop_size); + c += GenerateUploadByThreads("weights_cache", "filters_loc", + /*global_offset_name*/ "", "lid", + total_work_items, local_mem_size); + } else if (use_simd_broadcast) { + int parts = local_mem_size / simd_size; + int reminder = local_mem_size % simd_size; + for (int i = 0; i < parts; ++i) { + c += " FLT4 simd_w" + std::to_string(i) + + " = filters_loc[get_sub_group_local_id() + " + + std::to_string(i * simd_size) + "];\n"; + } + if (reminder) { + c += " FLT4 simd_w" + std::to_string(parts) + ";\n"; + c += " if (simd_id < " + std::to_string(reminder) + ") {\n"; + c += " simd_w" + std::to_string(parts) + + " = filters_loc[get_sub_group_local_id() + " + + std::to_string(parts * simd_size) + "];\n"; + c += " }\n"; + } } else { // GLOBAL_MEM/CONSTANT_MEM c += " weights_cache = filters_loc;\n"; } @@ -580,9 +635,7 @@ std::string GenerateConv( conv_core(i * block_size.z * 4); c += " s += 1;\n"; } - c += " filters_loc += " + - std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) + - ";\n"; + c += " filters_loc += " + std::to_string(local_mem_size) + ";\n"; c += " } while (s < src_size.z);\n"; if (!is1x1) { c += " };\n"; @@ -597,10 +650,10 @@ std::string GenerateConv( c += GenerateUploadByThreads("weights_cache", "biases", "Z", "lid", total_work_items, block_size.z); c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } else { // GLOBAL_MEM/CONSTANT_MEM + } else { c += " weights_cache = biases + Z;\n"; } - if (need_local_mem) { + if (late_oob_check) { c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) {\n"; c += " return;\n"; c += " }\n"; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h index 5eff4b36053..a729098bded 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h @@ -64,6 +64,11 @@ class ConvPowerVR : public GPUOperation { LOCAL_MEM_BY_THREADS, GLOBAL_MEM, CONSTANT_MEM, + PRIVATE_MEM_SIMD8_BROADCAST, + PRIVATE_MEM_SIMD16_BROADCAST, + PRIVATE_MEM_SIMD32_BROADCAST, + 
PRIVATE_MEM_SIMD64_BROADCAST, + PRIVATE_MEM_SIMD128_BROADCAST, }; struct ConvParams { From 67fb07ba9fe4587d65ed2cfe83a364797f9c97b4 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Mon, 15 Jun 2020 15:28:17 -0700 Subject: [PATCH 0218/1390] Add load option for loading SavedModel from specific io_device for distributed training. A new class LoadOptions is created similar to the existing SavedOptions. The option experimental_io_device is the only option added at this time and usd to set the io_device when loading a SavedModel for distributed training. PiperOrigin-RevId: 316557681 Change-Id: If3f1eae18b09085ff11dc8a6882fabcb18f5f48e --- tensorflow/python/keras/engine/training.py | 10 +++- tensorflow/python/keras/saving/save.py | 6 +- .../python/keras/saving/saved_model/load.py | 9 ++- .../python/keras/saving/saved_model/save.py | 2 +- tensorflow/python/saved_model/BUILD | 8 +++ tensorflow/python/saved_model/load.py | 28 ++++++--- tensorflow/python/saved_model/load_options.py | 57 +++++++++++++++++++ tensorflow/python/saved_model/load_test.py | 7 +++ .../golden/v1/tensorflow.keras.-model.pbtxt | 2 +- .../v1/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 2 +- ....keras.experimental.-wide-deep-model.pbtxt | 2 +- .../v1/tensorflow.keras.models.-model.pbtxt | 2 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- .../golden/v1/tensorflow.keras.models.pbtxt | 2 +- .../golden/v1/tensorflow.saved_model.pbtxt | 2 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 2 +- .../v2/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 2 +- ....keras.experimental.-wide-deep-model.pbtxt | 2 +- .../v2/tensorflow.keras.models.-model.pbtxt | 2 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- .../golden/v2/tensorflow.keras.models.pbtxt | 2 +- ...tensorflow.saved_model.-load-options.pbtxt | 13 +++++ .../golden/v2/tensorflow.saved_model.pbtxt | 6 +- 25 files changed, 144 insertions(+), 32 deletions(-) create mode 100644 tensorflow/python/saved_model/load_options.py create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.saved_model.-load-options.pbtxt diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 87782adff46..b7a4795d768 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -2078,7 +2078,11 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): save_relative_paths=True, all_model_checkpoint_paths=[filepath]) - def load_weights(self, filepath, by_name=False, skip_mismatch=False): + def load_weights(self, + filepath, + by_name=False, + skip_mismatch=False, + options=None): """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. If `by_name` is False weights are loaded based on the network's @@ -2108,6 +2112,8 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): skip_mismatch: Boolean, whether to skip loading of layers where there is a mismatch in the number of weights, or a mismatch in the shape of the weight (only valid when `by_name=True`). + options: Optional `tf.train.CheckpointOptions` object that specifies + options for loading weights. Returns: When loading a weight file in TensorFlow format, returns the same status @@ -2145,7 +2151,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): # The checkpoint is not readable in TensorFlow format. Try HDF5. 
save_format = 'h5' if save_format == 'tf': - status = self._trackable_saver.restore(filepath) + status = self._trackable_saver.restore(filepath, options) if by_name: raise NotImplementedError( 'Weights may only be loaded based on topology into Models when ' diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 7f725d3978e..9c83914d380 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -135,7 +135,7 @@ def save_model(model, @keras_export('keras.models.load_model') -def load_model(filepath, custom_objects=None, compile=True): # pylint: disable=redefined-builtin +def load_model(filepath, custom_objects=None, compile=True, options=None): # pylint: disable=redefined-builtin """Loads a model saved via `model.save()`. Usage: @@ -162,6 +162,8 @@ def load_model(filepath, custom_objects=None, compile=True): # pylint: disable= considered during deserialization. compile: Boolean, whether to compile the model after loading. + options: Optional `tf.saved_model.LoadOptions` object that specifies + options for loading from SavedModel. Returns: A Keras model instance. If the original model was compiled, and saved with @@ -182,7 +184,7 @@ def load_model(filepath, custom_objects=None, compile=True): # pylint: disable= filepath = path_to_string(filepath) if isinstance(filepath, six.string_types): loader_impl.parse_saved_model(filepath) - return saved_model_load.load(filepath, compile) + return saved_model_load.load(filepath, compile, options) raise IOError( 'Unable to load model. Filepath is not an hdf5 file (or h5py is not ' diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index ca8164c9407..7e67bf6305c 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -90,7 +90,7 @@ KERAS_OBJECT_IDENTIFIERS = ( '_tf_keras_rnn_layer') -def load(path, compile=True): # pylint: disable=redefined-builtin +def load(path, compile=True, options=None): # pylint: disable=redefined-builtin """Loads Keras objects from a SavedModel. Any Keras layer or model saved to the SavedModel will be loaded back @@ -107,13 +107,18 @@ def load(path, compile=True): # pylint: disable=redefined-builtin Args: path: Path to SavedModel. compile: If true, compile the model after loading it. + options: Optional `tf.saved_model.LoadOptions` object that specifies + options for loading from SavedModel. + Returns: Object loaded from SavedModel. """ # TODO(kathywu): Add saving/loading of optimizer, compiled losses and metrics. # TODO(kathywu): Add code to load from objects that contain all endpoints - model = tf_load.load_internal(path, loader_cls=KerasObjectLoader) + + model = tf_load.load_internal( + path, options=options, loader_cls=KerasObjectLoader) # pylint: disable=protected-access if isinstance(model, training_lib.Model) and compile: diff --git a/tensorflow/python/keras/saving/saved_model/save.py b/tensorflow/python/keras/saving/saved_model/save.py index 9d4ca5e2c59..7d6bc120758 100644 --- a/tensorflow/python/keras/saving/saved_model/save.py +++ b/tensorflow/python/keras/saving/saved_model/save.py @@ -49,7 +49,7 @@ def save(model, filepath, overwrite, include_optimizer, signatures=None, signatures: Signatures to save with the SavedModel. Applicable to the 'tf' format only. Please see the `signatures` argument in `tf.saved_model.save` for details. 
- options: Optional`tf.saved_model.SaveOptions` object that specifies + options: Optional `tf.saved_model.SaveOptions` object that specifies options for saving to SavedModel. Raises: diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 6e17b8af206..240b60f43f6 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -348,6 +348,7 @@ py_library( deps = [ ":constants", ":function_deserialization", + ":load_options", ":load_v1_in_v2", ":loader", ":nested_structure_coder", @@ -522,6 +523,13 @@ py_library( ], ) +py_library( + name = "load_options", + srcs = ["load_options.py"], + deps = [ + ], +) + py_library( name = "method_name_updater", srcs = ["method_name_updater.py"], diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py index fe2919c88dc..74b030a3797 100644 --- a/tensorflow/python/saved_model/load.py +++ b/tensorflow/python/saved_model/load.py @@ -37,11 +37,13 @@ from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.saved_model import function_deserialization +from tensorflow.python.saved_model import load_options from tensorflow.python.saved_model import load_v1_in_v2 from tensorflow.python.saved_model import loader_impl from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.saved_model import revived_types from tensorflow.python.saved_model import utils_impl as saved_model_utils +from tensorflow.python.training.saving import checkpoint_options from tensorflow.python.training.tracking import base from tensorflow.python.training.tracking import graph_view from tensorflow.python.training.tracking import tracking @@ -105,7 +107,8 @@ class _WrapperFunction(function.ConcreteFunction): class Loader(object): """Helper class to load an object-based SavedModel.""" - def __init__(self, object_graph_proto, saved_model_proto, export_dir): + def __init__(self, object_graph_proto, saved_model_proto, export_dir, + ckpt_options): meta_graph = saved_model_proto.meta_graphs[0] self._asset_file_def = meta_graph.asset_file_def self._operation_attributes = { @@ -115,6 +118,7 @@ class Loader(object): self._concrete_functions = ( function_deserialization.load_function_def_library( meta_graph.graph_def.library)) + self._checkpoint_options = ckpt_options for name, concrete_function in self._concrete_functions.items(): # Wrap all the concrete function so that they are capable of dealing with @@ -306,9 +310,10 @@ class Loader(object): with ops.device("CPU"): saver._file_prefix_placeholder = constant_op.constant(variables_path) if self._expect_partial_checkpoint: - load_status = saver.restore(variables_path).expect_partial() + load_status = saver.restore(variables_path, + self._checkpoint_options).expect_partial() else: - load_status = saver.restore(variables_path) + load_status = saver.restore(variables_path, self._checkpoint_options) load_status.assert_existing_objects_matched() checkpoint = load_status._checkpoint @@ -491,7 +496,7 @@ def _call_attribute(instance, *args, **kwargs): @tf_export("saved_model.load", v1=["saved_model.load_v2"]) -def load(export_dir, tags=None): +def load(export_dir, tags=None, options=None): """Load a SavedModel from `export_dir`. Signatures associated with the SavedModel are available as functions: @@ -569,6 +574,8 @@ def load(export_dir, tags=None): tags: A tag or sequence of tags identifying the MetaGraph to load. 
Optional if the SavedModel contains a single MetaGraph, as for those exported from `tf.saved_model.save`. + options: Optional, `tf.saved_model.LoadOptions` object that specifies + options for loading. Returns: A trackable object with a `signatures` attribute mapping from signature @@ -579,11 +586,12 @@ def load(export_dir, tags=None): Raises: ValueError: If `tags` don't match a MetaGraph in the SavedModel. """ - return load_internal(export_dir, tags) + return load_internal(export_dir, tags, options) -def load_internal(export_dir, tags=None, loader_cls=Loader): +def load_internal(export_dir, tags=None, options=None, loader_cls=Loader): """Loader implementation.""" + options = options or load_options.LoadOptions() if tags is not None and not isinstance(tags, set): # Supports e.g. tags=SERVING and tags=[SERVING]. Sets aren't considered # sequences for nest.flatten, so we put those through as-is. @@ -602,10 +610,12 @@ def load_internal(export_dir, tags=None, loader_cls=Loader): "it, pass 'None', or pass matching tags.") .format(export_dir, meta_graph_def.meta_info_def.tags, tags)) object_graph_proto = meta_graph_def.object_graph_def + + ckpt_options = checkpoint_options.CheckpointOptions( + experimental_io_device=options.experimental_io_device) with ops.init_scope(): - loader = loader_cls(object_graph_proto, - saved_model_proto, - export_dir) + loader = loader_cls(object_graph_proto, saved_model_proto, export_dir, + ckpt_options) root = loader.get(0) if isinstance(loader, Loader): root.graph_debug_info = loader.adjust_debug_info_func_names(debug_info) diff --git a/tensorflow/python/saved_model/load_options.py b/tensorflow/python/saved_model/load_options.py new file mode 100644 index 00000000000..9718d8ffed9 --- /dev/null +++ b/tensorflow/python/saved_model/load_options.py @@ -0,0 +1,57 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Options for saving SavedModels.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.util.tf_export import tf_export + + +@tf_export("saved_model.LoadOptions", v1=[]) +class LoadOptions(object): + """Options for loading a SavedModel. + + This function may be used in the `options` argument in functions that + load a SavedModel (`tf.saved_model.load`, `tf.keras.models.load_model`). + """ + + # Define object attributes in __slots__ for improved memory and performance. + __slots__ = ("experimental_io_device",) + + def __init__(self, + experimental_io_device=None): + """Creates an object that stores options for SavedModel loading. + + Args: + experimental_io_device: string. Applies in a distributed setting. + Tensorflow device to use to access the filesystem. If `None` (default) + then for each variable the filesystem is accessed from the CPU:0 device + of the host where that variable is assigned. 
If specified, the + filesystem is instead accessed from that device for all variables. + This is for example useful if you want to load from a local directory, + such as "/tmp" when running in a distributed setting. In that case + pass a device for the host where the "/tmp" directory is accessible. + + Example: + + load_options = tf.saved_model.LoadOptions(experimental_io_device= + '/job:localhost') + restoredmodel = tf.keras.models.load_model(saved_model_path, + options=load_options) + + """ + self.experimental_io_device = experimental_io_device diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py index 7bd2e87c739..5449cc1c9a2 100644 --- a/tensorflow/python/saved_model/load_test.py +++ b/tensorflow/python/saved_model/load_test.py @@ -56,6 +56,7 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import load_options from tensorflow.python.saved_model import save from tensorflow.python.saved_model import tag_constants from tensorflow.python.training import monitored_session @@ -1788,6 +1789,12 @@ class LoadTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(imported2.f(rt, 2), [[3, 4], [5]]) self.assertAllEqual(imported2.f(rt, 3), [[4, 5], [6]]) + def test_accepts_io_device(self, cycles): + options = load_options.LoadOptions() + self.assertIsNone(options.experimental_io_device) + options = load_options.LoadOptions(experimental_io_device="/job:localhost") + self.assertEqual("/job:localhost", options.experimental_io_device) + class SingleCycleTests(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index ea2945b5bf6..b62814e81cb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -258,7 +258,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index c1dea9335c0..7485a0b3c62 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -264,7 +264,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index ba87c0a2a7a..bf980e5d116 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -259,7 +259,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 37fb2051f81..c214a5c3419 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -259,7 +259,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 7439fc2dd6b..86868c9d17f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -258,7 +258,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 24e8bf57611..05aa19a915a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -264,7 +264,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt index 6f85b3c2150..ac80126aaa3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "load_model" - argspec: "args=[\'filepath\', \'custom_objects\', \'compile\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "model_from_config" 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt index 8833f02b0db..2dde9c495cc 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt @@ -182,7 +182,7 @@ tf_module { } member_method { name: "load_v2" - argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'export_dir\', \'tags\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "main_op_with_restore" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index ea2945b5bf6..b62814e81cb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -258,7 +258,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index c1dea9335c0..7485a0b3c62 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -264,7 +264,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index ba87c0a2a7a..bf980e5d116 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -259,7 +259,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 37fb2051f81..c214a5c3419 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -259,7 +259,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: 
"args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 7439fc2dd6b..86868c9d17f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -258,7 +258,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 24e8bf57611..05aa19a915a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -264,7 +264,7 @@ tf_class { } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\'], varargs=None, keywords=None, defaults=[\'False\', \'False\'], " + argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt index 6f85b3c2150..ac80126aaa3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "load_model" - argspec: "args=[\'filepath\', \'custom_objects\', \'compile\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "model_from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-load-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-load-options.pbtxt new file mode 100644 index 00000000000..20216d93c3d --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-load-options.pbtxt @@ -0,0 +1,13 @@ +path: "tensorflow.saved_model.LoadOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "experimental_io_device" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'experimental_io_device\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt index 0a82cfd0873..0adfbd30102 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "GPU" mtype: "" } + member { + name: "LoadOptions" + mtype: "" + } member { name: "PREDICT_INPUTS" mtype: "" @@ -110,7 +114,7 @@ tf_module { } 
member_method { name: "load" - argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'export_dir\', \'tags\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "save" From 1158611838055b81804e3e208334544048a16366 Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 15 Jun 2020 15:42:09 -0700 Subject: [PATCH 0219/1390] LSTM: do projection to output_state instead of output. Because the two arrays are only different in stride (state has no stride), this allows us to do the projection in a batched manner. Copy the result to the strided output after projection. PiperOrigin-RevId: 316560275 Change-Id: I60c544d10a64437ece1fa75eea891af4b97df231 --- tensorflow/lite/kernels/lstm_eval.cc | 85 +++++++------------ .../calibration/builtin_logging_ops/lstm.cc | 43 ++++------ 2 files changed, 46 insertions(+), 82 deletions(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 0a2c381ebf1..7fa3d85687c 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -391,40 +391,29 @@ inline void LstmStepFloat( const bool use_projection_weight = (projection_weights_ptr != nullptr); const bool use_projection_bias = (projection_bias_ptr != nullptr); - // For each batch: update the projection and output_state. Note that since - // the output batch rows may not be contiguous (output_batch_leading_dim != - // n_output), we unroll batched operations. + // For each batch: update output_state. if (use_projection_weight) { if (use_projection_bias) { - for (int b = 0; b < n_batch; b++) { - std::copy_n(projection_bias_ptr, n_output, - output_ptr + b * output_batch_leading_dim); - } + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_state_ptr); } else { - for (int b = 0; b < n_batch; b++) { - std::fill_n(output_ptr + b * output_batch_leading_dim, n_output, 0.0f); - } + std::fill_n(output_state_ptr, n_batch * n_output, 0.0f); } - for (int b = 0; b < n_batch; b++) { - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - projection_weights_ptr, n_output, n_cell, - output_gate_scratch + b * n_cell, - /*n_batch=*/1, output_ptr + b * output_batch_leading_dim); - if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_ptr + b * output_batch_leading_dim, - n_output, params->proj_clip, - output_ptr + b * output_batch_leading_dim); - } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, + output_state_ptr); + if (params->proj_clip > 0.0) { + tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, + params->proj_clip, output_state_ptr); } } else { - for (int b = 0; b < n_batch; b++) { - std::copy_n(output_gate_scratch + b * n_output, n_output, - output_ptr + b * output_batch_leading_dim); - } + std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); } + // Copy output_state to the output. Note that the output batch rows may not be + // contiguous (output_batch_leading_dim != n_output). 
for (int b = 0; b < n_batch; b++) { - std::copy_n(output_ptr + b * output_batch_leading_dim, n_output, - output_state_ptr + b * n_output); + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); } } // LINT.ThenChange(//tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc) @@ -863,14 +852,10 @@ inline void LstmStepHybrid( // n_output), we unroll the batched operations. if (use_projection_weight) { if (use_projection_bias) { - for (int b = 0; b < n_batch; b++) { - std::copy_n(projection_bias_ptr, n_output, - output_ptr + b * output_batch_leading_dim); - } + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_state_ptr); } else { - for (int b = 0; b < n_batch; b++) { - std::fill_n(output_ptr + b * output_batch_leading_dim, n_output, 0.0f); - } + std::fill_n(output_state_ptr, n_batch * n_output, 0.0f); } if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) { // Save quantization and matmul computation for all zero input. @@ -881,35 +866,25 @@ inline void LstmStepHybrid( scaling_factors_scratch[b] = scaling_factors[b] * projection_weights_scale; } - for (int b = 0; b < n_batch; b++) { - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - projection_weights_ptr, n_output, n_cell, - quantized_cell_state_ptr + b * n_cell, &scaling_factors_scratch[b], - /*n_batch=*/1, output_ptr + b * output_batch_leading_dim, - /*per_channel_scale=*/nullptr, - asymmetric_quantize_inputs ? &zero_points[b] : nullptr, - accum_scratch_ptr, projection_weights_row_sums, compute_row_sums, - context); - } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr, + scaling_factors_scratch, n_batch, output_state_ptr, + /*per_channel_scale=*/nullptr, + asymmetric_quantize_inputs ? zero_points : nullptr, accum_scratch_ptr, + projection_weights_row_sums, compute_row_sums, context); } if (params->proj_clip > 0.0) { - for (int b = 0; b < n_batch; b++) { - tensor_utils::ClipVector(output_ptr + b * output_batch_leading_dim, - n_output, params->proj_clip, - output_ptr + b * output_batch_leading_dim); - } + tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, + params->proj_clip, output_state_ptr); } } else { - for (int b = 0; b < n_batch; b++) { - std::copy_n(output_gate_scratch + b * n_output, n_output, - output_ptr + b * output_batch_leading_dim); - } + std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); } for (int b = 0; b < n_batch; b++) { - std::copy_n(output_ptr + b * output_batch_leading_dim, n_output, - output_state_ptr + b * n_output); + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); } -} +} // namespace // Fully quantized lstm kernel for 16 bit gate matmul output. // diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc index 41a03f16d63..b58900c0bc6 100644 --- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc +++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc @@ -249,40 +249,29 @@ inline void LstmStepWithAuxInput( const bool use_projection_weight = (projection_weights_ptr != nullptr); const bool use_projection_bias = (projection_bias_ptr != nullptr); - // For each batch: update the projection and output_state. 
Note that since - // the output batch rows may not be contiguous (output_batch_leading_dim != - // n_output), we unroll batched operations. + // For each batch: update output_state. if (use_projection_weight) { if (use_projection_bias) { - for (int k = 0; k < n_batch; k++) { - std::copy_n(projection_bias_ptr, n_output, - output_ptr + k * output_batch_leading_dim); - } + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_state_ptr); } else { - for (int k = 0; k < n_batch; k++) { - std::fill_n(output_ptr + k * output_batch_leading_dim, n_output, 0.0f); - } + std::fill_n(output_state_ptr, n_batch * n_output, 0.0f); } - for (int k = 0; k < n_batch; k++) { - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - projection_weights_ptr, n_output, n_cell, - output_gate_scratch + k * n_cell, - /*n_batch=*/1, output_ptr + k * output_batch_leading_dim); - if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_ptr + k * output_batch_leading_dim, - n_output, params->proj_clip, - output_ptr + k * output_batch_leading_dim); - } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, + output_state_ptr); + if (params->proj_clip > 0.0) { + tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, + params->proj_clip, output_state_ptr); } } else { - for (int k = 0; k < n_batch; k++) { - std::copy_n(output_gate_scratch + k * n_output, n_output, - output_ptr + k * output_batch_leading_dim); - } + std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); } - for (int k = 0; k < n_batch; k++) { - std::copy_n(output_ptr + k * output_batch_leading_dim, n_output, - output_state_ptr + k * n_output); + // Copy output_state to the output. Note that the output batch rows may not be + // contiguous (output_batch_leading_dim != n_output). + for (int b = 0; b < n_batch; b++) { + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); } } From 7292433984f91ddaec03cb07fc3749b781199984 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Mon, 15 Jun 2020 15:57:32 -0700 Subject: [PATCH 0220/1390] Support flex ops in calibration optimization This CL makes the tool generate a user-friendly error message as well. In order to use the correct logger for mobile, it uses the error_reporter. 
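A rough sketch of the user-level conversion path this change is meant to unblock, assuming the standard TF 2.x converter API; the saved-model path and the representative dataset below are placeholders, not part of this change:

import tensorflow as tf

def representative_data_gen():
  # A few typical input batches so calibration can collect tensor statistics.
  for _ in range(100):
    yield [tf.random.uniform([1, 224, 224, 3])]

converter = tf.lite.TFLiteConverter.from_saved_model('/path/to/saved_model')
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Keep TF ops that have no TFLite builtin as flex (select TF) ops instead of
# failing; the calibrator now skips these rather than treating them as
# unresolved custom ops.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
converter.representative_dataset = representative_data_gen
tflite_model = converter.convert()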
PiperOrigin-RevId: 316563081 Change-Id: Ib56f80330087750777725ed6ad3c97f54b1fa80b --- .../lite/tools/optimize/calibration/BUILD | 2 + .../tools/optimize/calibration/calibrator.cc | 4 +- .../calibration/logging_op_resolver.cc | 33 ++++++++++- .../calibration/logging_op_resolver.h | 3 +- .../calibration/logging_op_resolver_test.cc | 55 +++++++++++++++++-- 5 files changed, 89 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/tools/optimize/calibration/BUILD b/tensorflow/lite/tools/optimize/calibration/BUILD index a394156786f..11d00efe103 100644 --- a/tensorflow/lite/tools/optimize/calibration/BUILD +++ b/tensorflow/lite/tools/optimize/calibration/BUILD @@ -89,6 +89,8 @@ cc_library( deps = [ ":calibration_common", "//tensorflow/lite:framework", + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite:util", "//tensorflow/lite/core/api", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator.cc b/tensorflow/lite/tools/optimize/calibration/calibrator.cc index 579fb8cd52f..fb1677fda99 100644 --- a/tensorflow/lite/tools/optimize/calibration/calibrator.cc +++ b/tensorflow/lite/tools/optimize/calibration/calibrator.cc @@ -378,8 +378,8 @@ TfLiteStatus BuildLoggingInterpreter( // Prepare the logging op resolver to use |LoggingEval| for kernel // invocations. auto logging_op_resolver = absl::make_unique( - builtin_op_and_versions, custom_op_and_versions, op_resolver, - LoggingEval); + builtin_op_and_versions, custom_op_and_versions, op_resolver, LoggingEval, + error_reporter); tflite::InterpreterBuilder(tflite_model, *logging_op_resolver, error_reporter)(interpreter); diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc index 634b2a76a3a..92601b2a459 100644 --- a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc +++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc @@ -15,6 +15,10 @@ limitations under the License. 
#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h" #include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/lite/minimal_logging.h" +#include "tensorflow/lite/util.h" namespace tflite { namespace optimize { @@ -23,10 +27,18 @@ namespace calibration { LoggingOpResolver::LoggingOpResolver( const BuiltinOpsSet& builtin_ops_to_replace, const CustomOpsSet& custom_ops_to_replace, const OpResolver& base_resolver, - KernelEvalFuncPtr logging_eval_fn) { + KernelEvalFuncPtr logging_eval_fn, ErrorReporter* error_reporter) { + std::vector unresolved_builtin_ops; + std::vector unresolved_custom_ops; + for (const auto& op_and_version : builtin_ops_to_replace) { const TfLiteRegistration* base_registration = base_resolver.FindOp(op_and_version.first, op_and_version.second); + if (!base_registration) { + unresolved_builtin_ops.push_back( + EnumNameBuiltinOperator(op_and_version.first)); + continue; + } BuiltinOperatorKey key = op_and_version; builtin_op_evalfn_map_[key] = base_registration->invoke; auto logging_registration = @@ -37,6 +49,11 @@ LoggingOpResolver::LoggingOpResolver( for (const auto& op_and_version : custom_ops_to_replace) { const TfLiteRegistration* base_registration = base_resolver.FindOp( op_and_version.first.c_str(), op_and_version.second); + if (!base_registration) { + if (!IsFlexOp(op_and_version.first.c_str())) + unresolved_custom_ops.push_back(op_and_version.first.c_str()); + continue; + } CustomOperatorKey key = op_and_version; custom_op_evalfn_map_[key] = base_registration->invoke; auto logging_registration = @@ -44,6 +61,20 @@ LoggingOpResolver::LoggingOpResolver( logging_registration->invoke = logging_eval_fn; custom_op_registration_map_[key] = std::move(logging_registration); } + + if (!unresolved_builtin_ops.empty() || !unresolved_custom_ops.empty()) { + if (!error_reporter) return; + std::string error_message = + "Failed to initialize op resolver for calibration:"; + if (!unresolved_builtin_ops.empty()) + absl::StrAppend(&error_message, "\nThere are unresolved builtin ops: [", + absl::StrJoin(unresolved_builtin_ops, ", "), "]"); + if (!unresolved_custom_ops.empty()) { + absl::StrAppend(&error_message, "\nThere are unresolved custom ops: [", + absl::StrJoin(unresolved_builtin_ops, ", "), "]"); + } + TF_LITE_REPORT_ERROR(error_reporter, error_message.c_str()); + } } const TfLiteRegistration* LoggingOpResolver::FindOp(BuiltinOperator op, diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h index bbdfef60d92..25138c38098 100644 --- a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h +++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h @@ -39,7 +39,8 @@ class LoggingOpResolver : public OpResolver { LoggingOpResolver(const BuiltinOpsSet& builtin_ops_to_replace, const CustomOpsSet& custom_ops_to_replace, const OpResolver& base_resolver, - KernelEvalFuncPtr logging_eval_fn); + KernelEvalFuncPtr logging_eval_fn, + ErrorReporter* error_reporter); const TfLiteRegistration* FindOp(BuiltinOperator op, int version) const override; diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc index 511e4d0288d..9513e1f144d 100644 --- a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc +++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc @@ 
-70,7 +70,7 @@ TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) { }; LoggingOpResolver resolver(ops_to_replace, CustomOpsSet(), base_resolver, - WrappingInvoke); + WrappingInvoke, /*error_reporter=*/nullptr); auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1); @@ -104,7 +104,7 @@ TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) { }; LoggingOpResolver resolver(ops_to_replace, CustomOpsSet(), base_resolver, - WrappingInvoke); + WrappingInvoke, /*error_reporter=*/nullptr); auto kernel_invoke = resolver.GetWrappedKernelInvoke(BuiltinOperator_CONV_2D, 1); EXPECT_TRUE(kernel_invoke == ConvEval); @@ -131,7 +131,7 @@ TEST(LoggingOpResolverTest, OnlyOpsInReplacementSetAreReplaces) { }; LoggingOpResolver resolver(ops_to_replace, CustomOpsSet(), base_resolver, - WrappingInvoke); + WrappingInvoke, /*error_reporter=*/nullptr); auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1); EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D); EXPECT_TRUE(reg->prepare == ConvPrepare); @@ -155,7 +155,7 @@ TEST(LoggingOpResolverTest, CustomOps) { }; LoggingOpResolver resolver(BuiltinOpsSet(), ops_to_replace, base_resolver, - WrappingInvoke); + WrappingInvoke, /*error_reporter=*/nullptr); auto reg = resolver.FindOp(custom_op_name.c_str(), 1); @@ -165,6 +165,53 @@ TEST(LoggingOpResolverTest, CustomOps) { EXPECT_TRUE(reg->invoke == WrappingInvoke); } +TEST(LoggingOpResolverTest, UnresolvedCustomOps) { + // No custom op registration. + MutableOpResolver base_resolver; + + std::string custom_op_name = "unresolved_custom_op"; + + CustomOpsSet ops_to_replace = { + {custom_op_name, /*version*/ 1}, + }; + + // Expect no death. + LoggingOpResolver(BuiltinOpsSet(), ops_to_replace, base_resolver, + WrappingInvoke, /*error_reporter=*/nullptr); +} + +TEST(LoggingOpResolverTest, UnresolvedBuiltinOps) { + // No builtin op registration. + MutableOpResolver base_resolver; + + BuiltinOpsSet ops_to_replace = { + {BuiltinOperator_CONV_2D, /*version*/ 1}, + {BuiltinOperator_ADD, /*version*/ 1}, + }; + + // Expect no death. + LoggingOpResolver resolver(ops_to_replace, CustomOpsSet(), base_resolver, + WrappingInvoke, /*error_reporter=*/nullptr); +} + +TEST(LoggingOpResolverTest, FlexOps) { + // No flex op registration. + MutableOpResolver base_resolver; + + std::string custom_op_name = "FlexAdd"; + + CustomOpsSet ops_to_replace = { + {custom_op_name, /*version*/ 1}, + }; + + LoggingOpResolver resolver(BuiltinOpsSet(), ops_to_replace, base_resolver, + WrappingInvoke, /*error_reporter=*/nullptr); + + auto reg = resolver.FindOp(custom_op_name.c_str(), 1); + + EXPECT_TRUE(!reg); +} + } // namespace } // namespace calibration } // namespace optimize From d29d8af754f281844dfc6870da52faa571c8b948 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Mon, 15 Jun 2020 16:01:40 -0700 Subject: [PATCH 0221/1390] Add InputOptions to experimental_distribute_dataset(s_from_function). 
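A minimal usage sketch of the new argument (it assumes `strategy` is an already constructed `tf.distribute` strategy and `dataset` is an existing `tf.data.Dataset`; both are placeholders here):

import tensorflow as tf

options = tf.distribute.InputOptions(experimental_prefetch_to_device=False)

# Distribute an existing dataset, keeping elements in host memory.
dist_dataset = strategy.experimental_distribute_dataset(dataset, options=options)

# Or build one dataset per input pipeline from a function.
def dataset_fn(input_context):
  batch_size = input_context.get_per_replica_batch_size(global_batch_size=64)
  return tf.data.Dataset.range(1024).batch(batch_size)

dist_from_fn = strategy.experimental_distribute_datasets_from_function(
    dataset_fn, options=options)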
PiperOrigin-RevId: 316563848 Change-Id: I00d54d309395754a6182829725f42e1f968f14c4 --- .../collective_all_reduce_strategy.py | 5 +- .../python/distribute/distribute_lib.py | 60 +++++++++++++--- .../python/distribute/distribute_lib_test.py | 3 +- .../python/distribute/mirrored_strategy.py | 5 +- .../python/distribute/one_device_strategy.py | 5 +- .../distribute/parameter_server_strategy.py | 5 +- tensorflow/python/distribute/tpu_strategy.py | 70 +++++++++---------- .../python/distribute/tpu_strategy_test.py | 42 +++++++++++ ...orflow.distribute.-mirrored-strategy.pbtxt | 4 +- ...flow.distribute.-one-device-strategy.pbtxt | 4 +- .../v1/tensorflow.distribute.-strategy.pbtxt | 4 +- ...perimental.-central-storage-strategy.pbtxt | 4 +- ...ntal.-multi-worker-mirrored-strategy.pbtxt | 4 +- ...erimental.-parameter-server-strategy.pbtxt | 4 +- ...tribute.experimental.-t-p-u-strategy.pbtxt | 4 +- ...tensorflow.distribute.-input-options.pbtxt | 19 +++++ ...orflow.distribute.-mirrored-strategy.pbtxt | 4 +- .../v2/tensorflow.distribute.-strategy.pbtxt | 4 +- ...ntal.-multi-worker-mirrored-strategy.pbtxt | 4 +- ...tribute.experimental.-t-p-u-strategy.pbtxt | 4 +- .../api/golden/v2/tensorflow.distribute.pbtxt | 4 ++ 21 files changed, 186 insertions(+), 76 deletions(-) create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-options.pbtxt diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 40c60241ac0..23ed16c5cfd 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -409,7 +409,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): num_replicas_in_sync=self._num_replicas_in_sync) return input_context - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): input_context = self._make_input_context() return input_lib.get_distributed_dataset( dataset, @@ -418,7 +418,8 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): split_batch_by=self._num_replicas_in_sync, input_context=input_context) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): input_context = self._make_input_context() return input_lib.get_distributed_datasets_from_function( dataset_fn=dataset_fn, diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 109cb03ca88..a6dc35507e9 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -602,6 +602,43 @@ class RunOptions( cls).__new__(cls, experimental_enable_dynamic_batch_size, experimental_bucketizing_dynamic_shape) + +@tf_export("distribute.InputOptions", v1=[]) +class InputOptions( + collections.namedtuple("InputOptions", [ + "experimental_prefetch_to_device", + ])): + """Run options for `experimental_distribute_dataset(s_from_function)`. + + This can be used to hold some strategy specific configs. 
+ + ```python + # Setup TPUStrategy + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.experimental.TPUStrategy(resolver) + + dataset = tf.data.Dataset.range(16) + distributed_dataset_on_host = ( + strategy.experimental_distribute_dataset( + dataset, + tf.distribute.InputOptions( + experimental_prefetch_to_device=False))) + ``` + + Attributes: + experimental_prefetch_to_device: Boolean. Currently only applies to + TPUStrategy. Defaults to True. If True, dataset elements will be + prefetched to accelerator device memory. When False, dataset elements are + prefetched to host device memory. Must be False when using TPUEmbedding + API. + """ + + def __new__(cls, experimental_prefetch_to_device=True): + return super(InputOptions, cls).__new__(cls, + experimental_prefetch_to_device) + # ------------------------------------------------------------------------------ # Base classes for all distribution strategies. @@ -821,7 +858,7 @@ class StrategyBase(object): args = (input_iterator.get_next(),) if input_iterator is not None else () return self.run(fn, args=args) - def experimental_distribute_dataset(self, dataset): + def experimental_distribute_dataset(self, dataset, options=None): """Distributes a tf.data.Dataset instance provided via `dataset`. The returned distributed dataset can be iterated over similar to how @@ -910,14 +947,17 @@ class StrategyBase(object): Args: dataset: `tf.data.Dataset` that will be sharded across all replicas using the rules stated above. + options: `tf.distribute.InputOptions` used to control options on how this + dataset is distributed. Returns: A "distributed `Dataset`", which acts like a `tf.data.Dataset` except it produces "per-replica" values. """ - return self._extended._experimental_distribute_dataset(dataset) # pylint: disable=protected-access + return self._extended._experimental_distribute_dataset(dataset, options) # pylint: disable=protected-access - def experimental_distribute_datasets_from_function(self, dataset_fn): + def experimental_distribute_datasets_from_function(self, dataset_fn, + options=None): """Distributes `tf.data.Dataset` instances created by calls to `dataset_fn`. `dataset_fn` will be called once for each worker in the strategy. Each @@ -973,13 +1013,15 @@ class StrategyBase(object): Args: dataset_fn: A function taking a `tf.distribute.InputContext` instance and returning a `tf.data.Dataset`. + options: `tf.distribute.InputOptions` used to control options on how this + dataset is distributed. Returns: A "distributed `Dataset`", which acts like a `tf.data.Dataset` except it produces "per-replica" values. """ return self._extended._experimental_distribute_datasets_from_function( # pylint: disable=protected-access - dataset_fn) + dataset_fn, options) def run(self, fn, args=(), kwargs=None, options=None): """Run `fn` on each replica, with the given arguments. 
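The extended (implementation-facing) hooks gain the same extra parameter, as the following hunks show. A minimal sketch of an implementation that simply ignores the options, mirroring the default and test extended classes updated below (the class name is illustrative; a real implementation subclasses `StrategyExtendedV1` and performs actual input distribution):

from tensorflow.python.distribute import distribute_lib

class _PassthroughExtended(object):
  """Illustrative only: forwards datasets unchanged and ignores InputOptions."""

  def _experimental_distribute_dataset(self, dataset, options):
    del options  # This trivial implementation makes no use of InputOptions.
    return dataset

  def _experimental_distribute_datasets_from_function(self, dataset_fn, options):
    del options  # This trivial implementation makes no use of InputOptions.
    return dataset_fn(distribute_lib.InputContext())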
@@ -1943,10 +1985,11 @@ class StrategyExtendedV2(object): def _make_input_fn_iterator(self, input_fn, replication_mode): raise NotImplementedError("must be implemented in descendants") - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): raise NotImplementedError("must be implemented in descendants") - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): raise NotImplementedError("must be implemented in descendants") def _experimental_distribute_values_from_function(self, value_fn): @@ -2693,10 +2736,11 @@ class _DefaultDistributionExtended(StrategyExtendedV1): def variable_created_in_scope(self, v): return v._distribute_strategy is None # pylint: disable=protected-access - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): return dataset - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): return dataset_fn(InputContext()) def _experimental_distribute_values_from_function(self, value_fn): diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py index 828e7a1aed9..8ea1cac6f02 100644 --- a/tensorflow/python/distribute/distribute_lib_test.py +++ b/tensorflow/python/distribute/distribute_lib_test.py @@ -89,7 +89,8 @@ class _TestExtended(distribute_lib.StrategyExtendedV1): [distribute_lib.InputContext()], self._container_strategy()) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): return dataset_fn(distribute_lib.InputContext()) def _local_results(self, value): diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index fe565261f16..ac9045d2322 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -476,7 +476,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): input_contexts, self._container_strategy()) - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): return input_lib.get_distributed_dataset( dataset, self._input_workers, @@ -487,7 +487,8 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): return numpy_dataset.one_host_numpy_dataset( numpy_input, self._host_input_device, session) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): input_contexts = [] num_workers = self._input_workers.num_workers for i in range(num_workers): diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index 9a74832cd9d..e2bb28ac96f 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -297,13 +297,14 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): del destinations return tensor - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): # Note that split_batch_by argument is not passed because it is always 1 in # this strategy, and adding it adds unnecessary overhead to the dataset. 
return input_lib.get_distributed_dataset(dataset, self._input_workers, self._container_strategy()) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers, diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index 42fc327351c..9675b7002c5 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -337,7 +337,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _validate_colocate_with_variable(self, colocate_with_variable): distribute_utils.validate_colocate(colocate_with_variable, self) - def _experimental_distribute_dataset(self, dataset): + def _experimental_distribute_dataset(self, dataset, options): return input_lib.get_distributed_dataset( dataset, self._input_workers, @@ -376,7 +376,8 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): return numpy_dataset.one_host_numpy_dataset( numpy_input, self._input_host_device, session) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): if self._cluster_spec: input_pipeline_id = multi_worker_util.id_in_cluster( self._cluster_spec, self._task_type, self._task_id) diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index c605abd9eae..9493ecce767 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -308,13 +308,14 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): # device 0 for each replica. # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the # input onto a different logical device? - input_worker_devices = collections.OrderedDict() + self._device_input_worker_devices = collections.OrderedDict() + self._host_input_worker_devices = collections.OrderedDict() for tpu_device in self._tpu_devices[:, 0]: host_device = device_util.get_host_for_device(tpu_device) - input_worker_devices.setdefault(host_device, []) - input_worker_devices[host_device].append(tpu_device) - self._input_worker_devices = tuple(input_worker_devices.items()) - self._input_workers_obj = None + self._device_input_worker_devices.setdefault(host_device, []) + self._device_input_worker_devices[host_device].append(tpu_device) + self._host_input_worker_devices.setdefault(host_device, []) + self._host_input_worker_devices[host_device].append(host_device) # TODO(sourabhbajaj): Remove this once performance of running one step # at a time is comparable to multiple steps. 
@@ -322,7 +323,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): self._require_static_shapes = True self.experimental_enable_get_next_as_optional = True - self._prefetch_on_host = False + self._prefetch_to_device = True self._logical_device_stack = [0] @@ -339,38 +340,18 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): # memory and b) TPU Embedding enqueue operation are CPU ops and this avoids # a copy back to the host for dense tensors def _set_prefetch_on_host(self, value): - if self._prefetch_on_host == value: - return - if self._input_workers_obj is not None: - raise RuntimeError("Unable to change prefetch on host behavior as " - "InputWorkers are already created.") - self._prefetch_on_host = value - if value: - # To prefetch on the host, we must set all the input worker devices to the - # corresponding host devices. - self._input_worker_devices = tuple([ - tuple([host, - [device_util.get_host_for_device(d) for d in devices]]) - for host, devices in self._input_worker_devices]) - # Force creation of the workers. - workers = self._input_workers - del workers - - @property - def _input_workers(self): - if self._input_workers_obj is None: - self._input_workers_obj = input_lib.InputWorkers( - self._input_worker_devices) - return self._input_workers_obj + self._prefetch_to_device = not value def _validate_colocate_with_variable(self, colocate_with_variable): distribute_utils. validate_colocate(colocate_with_variable, self) def _make_dataset_iterator(self, dataset): """Make iterators for each of the TPU hosts.""" + input_workers = input_lib.InputWorkers( + tuple(self._device_input_worker_devices.items())) return input_lib.DatasetIterator( dataset, - self._input_workers, + input_workers, self._container_strategy(), split_batch_by=self._num_replicas_in_sync) @@ -379,7 +360,9 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): input_fn, replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): input_contexts = [] - num_workers = self._input_workers.num_workers + input_workers = input_lib.InputWorkers( + tuple(self._device_input_worker_devices.items())) + num_workers = input_workers.num_workers for i in range(num_workers): input_contexts.append(distribute_lib.InputContext( num_input_pipelines=num_workers, @@ -387,7 +370,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): num_replicas_in_sync=self._num_replicas_in_sync)) return input_lib.InputFunctionIterator( input_fn, - self._input_workers, + input_workers, input_contexts, self._container_strategy()) @@ -396,16 +379,29 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): numpy_input, numpy_dataset.SingleDevice(self._host_device), session) - def _experimental_distribute_dataset(self, dataset): + def _get_input_workers(self, options): + prefetch_to_device = self._prefetch_to_device + if options: + prefetch_to_device = options.experimental_prefetch_to_device + if prefetch_to_device: + return input_lib.InputWorkers( + tuple(self._device_input_worker_devices.items())) + else: + return input_lib.InputWorkers( + tuple(self._host_input_worker_devices.items())) + + def _experimental_distribute_dataset(self, dataset, options): return input_lib.get_distributed_dataset( dataset, - self._input_workers, + self._get_input_workers(options), self._container_strategy(), split_batch_by=self._num_replicas_in_sync) - def _experimental_distribute_datasets_from_function(self, dataset_fn): + def _experimental_distribute_datasets_from_function(self, dataset_fn, + options): + input_workers = self._get_input_workers(options) 
input_contexts = [] - num_workers = self._input_workers.num_workers + num_workers = input_workers.num_workers for i in range(num_workers): input_contexts.append(distribute_lib.InputContext( num_input_pipelines=num_workers, @@ -414,7 +410,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): return input_lib.get_distributed_datasets_from_function( dataset_fn, - self._input_workers, + input_workers, input_contexts, self._container_strategy()) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 70a38af95aa..6dd7de500e4 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import tpu_strategy as tpu_lib @@ -30,6 +31,7 @@ from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.framework import config from tensorflow.python.framework import constant_op +from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -546,6 +548,46 @@ class TPUStrategyTest(test.TestCase): update_variable.get_concrete_function() self.assertEqual(trace_count[0], len(strategy.extended.worker_devices)) + def test_prefetch_to_device_default(self): + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.range( + strategy.num_replicas_in_sync * 2, + output_type=dtypes.float32).batch(strategy.num_replicas_in_sync) + + # Check default, should prefetch to TPU. + dataset_item = next(iter(strategy.experimental_distribute_dataset(dataset))) + dataset_location = tf_device.DeviceSpec.from_string( + dataset_item.values[0].device) + self.assertEqual(dataset_location.device_type, "TPU") + + def test_prefetch_to_device_tpu(self): + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.range( + strategy.num_replicas_in_sync * 2, + output_type=dtypes.float32).batch(strategy.num_replicas_in_sync) + + input_options = distribute_lib.InputOptions( + experimental_prefetch_to_device=True) + dataset_item = next(iter(strategy.experimental_distribute_dataset( + dataset, options=input_options))) + dataset_location = tf_device.DeviceSpec.from_string( + dataset_item.values[0].device) + self.assertEqual(dataset_location.device_type, "TPU") + + def test_prefetch_to_device_cpu(self): + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.range( + strategy.num_replicas_in_sync * 2, + output_type=dtypes.float32).batch(strategy.num_replicas_in_sync) + + # Should be CPU when prefetch_to_device is False. 
+ input_options = distribute_lib.InputOptions( + experimental_prefetch_to_device=False) + dataset_item = next(iter(strategy.experimental_distribute_dataset( + dataset, options=input_options))) + dataset_location = tf_device.DeviceSpec.from_string( + dataset_item.values[0].device) + self.assertEqual(dataset_location.device_type, "CPU") if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt index 0b74423ce62..36c78c406b7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt @@ -26,11 +26,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt index 67d6923e86c..09865ab02ee 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt @@ -26,11 +26,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt index d22b42d9098..0e6c10bd533 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt @@ -25,11 +25,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt index 03c5b2476b0..fbc4c107a1a 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt @@ -26,11 +26,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt index baee19e2a50..cd67e7d27c4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt @@ -26,11 +26,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt index d92dab8f5bf..0eff82474ff 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt @@ -26,11 +26,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt index c7c8c832764..2af9a5ad095 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt @@ -30,11 +30,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, 
defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_local_results" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-options.pbtxt new file mode 100644 index 00000000000..c3beabd938e --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-options.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.distribute.InputOptions" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "experimental_prefetch_to_device" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt index 20dfe7fe5a6..be4c841aed7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt @@ -30,11 +30,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_values_from_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt index 0844739c8eb..9f6a2ac32be 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt @@ -29,11 +29,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_values_from_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt index 0f722ecc8b9..500ae362e5f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt @@ -30,11 +30,11 @@ tf_class { } 
member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_values_from_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt index 6cefc4e7977..82a4362a597 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt @@ -30,11 +30,11 @@ tf_class { } member_method { name: "experimental_distribute_dataset" - argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_datasets_from_function" - argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'dataset_fn\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "experimental_distribute_values_from_function" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt index 3e226fd8e70..19d83909120 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt @@ -16,6 +16,10 @@ tf_module { name: "InputContext" mtype: "" } + member { + name: "InputOptions" + mtype: "" + } member { name: "InputReplicationMode" mtype: "" From 57eccc7bc29ddb105dcaa2f6a413163461ad9987 Mon Sep 17 00:00:00 2001 From: Dominic Jack Date: Tue, 16 Jun 2020 09:06:15 +1000 Subject: [PATCH 0222/1390] added test --- tensorflow/python/keras/engine/training_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 111833ba8b5..72cd5ad88a3 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -3328,6 +3328,16 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): outer_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) self.assertEqual([m.name for m in outer_model.metrics], ['loss', 'acc2', 'mean', 'mean1', 'mean2']) + + def test_subclassed_model_with_empty_list_attr(self): + class ModelSubclass(training_module.Model): + def __init__(self): + self.empty_list = [] + inputs = layers_module.Input(shape=()) + outputs = inputs + 1 + super(ModelSubclass, self).__init__(inputs, outputs) + + ModelSubclass() # empty_list attr assignment should not raise class BareUpdateLayer(layers_module.Layer): From f4e20ec5ae4792a3b595a598479ebe8f5a1760ea Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 15 Jun 2020 16:04:21 -0700 Subject: [PATCH 0223/1390] XPlane schema cleanup PiperOrigin-RevId: 316564434 Change-Id: Icabe3400c82cb4aff84c2bdae7ecf59424b36fa4 --- tensorflow/core/profiler/convert/BUILD | 
2 ++ .../convert/op_stats_to_tf_stats_test.cc | 5 ++-- .../convert/xplane_to_memory_profile_test.cc | 3 +- .../convert/xplane_to_op_metrics_db_test.cc | 24 ++++++--------- .../profiler/convert/xplane_to_op_stats.cc | 2 +- .../convert/xplane_to_op_stats_test.cc | 22 +++++++------- .../convert/xplane_to_profile_response.cc | 3 +- .../convert/xplane_to_step_events_test.cc | 3 +- .../convert/xplane_to_tf_functions_test.cc | 12 ++++---- .../convert/xplane_to_trace_events.cc | 2 +- .../core/profiler/internal/cpu/host_tracer.cc | 2 +- .../profiler/internal/cpu/host_tracer_test.cc | 2 +- .../internal/cpu/metadata_collector.cc | 2 +- .../profiler/internal/gpu/device_tracer.cc | 7 +++-- .../core/profiler/lib/profiler_session.cc | 5 ++-- tensorflow/core/profiler/utils/BUILD | 1 + .../core/profiler/utils/derived_timeline.cc | 7 +++-- .../profiler/utils/derived_timeline_test.cc | 8 ++--- .../core/profiler/utils/group_events.cc | 19 +++++++----- .../core/profiler/utils/group_events_test.cc | 18 ++++-------- .../core/profiler/utils/xplane_schema.cc | 6 ++-- .../core/profiler/utils/xplane_schema.h | 16 ++++++++-- .../core/profiler/utils/xplane_test_utils.cc | 10 +++++++ .../core/profiler/utils/xplane_test_utils.h | 4 +++ .../core/profiler/utils/xplane_utils.cc | 29 +++++++------------ tensorflow/core/profiler/utils/xplane_utils.h | 15 ++++------ 26 files changed, 117 insertions(+), 112 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 2482698cdf0..5f287a14267 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -47,6 +47,7 @@ tf_cc_test( "//tensorflow/core/profiler/utils:time_utils", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_test_utils", "@com_google_absl//absl/strings", ], ) @@ -171,6 +172,7 @@ tf_cc_test( "//tensorflow/core/profiler/utils:time_utils", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_test_utils", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc index 9ca83b51a70..5a01bf3417b 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/time_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" namespace tensorflow { namespace profiler { @@ -59,8 +60,8 @@ TEST(OpStatsToTfStats, GpuTfStats) { constexpr int64 kKernel3DurationNs = 10000; XSpace space; - XPlaneBuilder device_plane(space.add_planes()); - device_plane.SetName(absl::StrCat(kGpuPlanePrefix, ":0")); + XPlaneBuilder device_plane( + GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0)); XLineBuilder stream1 = device_plane.GetOrCreateLine(/*line_id=*/10); AddTensorFlowOpEvent(absl::StrCat(kTfOp1, ":", kTfOp1), kKernel1StartNs, kKernel1DurationNs, /*on_device=*/true, kKernel1, diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc index 6766fd5f1b5..5ddcbcfc75d 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc @@ -32,9 +32,8 @@ namespace { // activities within one memory allocator captured in host trace. TEST(ConvertXPlaneToMemoryProfile, OneAllocatorMultiActivitiesTest) { XSpace space; - XPlane* host_plane = space.add_planes(); + XPlane* host_plane = GetOrCreateHostXPlane(&space); XPlaneBuilder host_plane_builder(host_plane); - host_plane_builder.SetName(kHostThreads); host_plane_builder.ReserveLines(1); auto tf_executor_thread = host_plane_builder.GetOrCreateLine(0); diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc index 8bd0443b8f6..bdac1129c81 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/profiler/utils/time_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" namespace tensorflow { namespace profiler { @@ -43,12 +44,6 @@ void AddTensorFlowOpEvent(absl::string_view tf_op_fullname, tf_op_fullname); } -void SetXPlaneNameAndId(absl::string_view name, int64 id, - XPlaneBuilder* plane) { - plane->SetName(name); - plane->SetId(id); -} - TEST(ConvertXPlaneToOpMetricsDb, HostOpMetricsDb) { static constexpr char kTfOp1[] = "TfOp1"; static constexpr char kTfOp2[] = "TfOp2"; @@ -57,9 +52,9 @@ TEST(ConvertXPlaneToOpMetricsDb, HostOpMetricsDb) { constexpr int64 kTfOp2StartNs = 110000; constexpr int64 kTfOp2DurationNs = 10000; - XPlane xplane; - XPlaneBuilder host_plane(&xplane); - SetXPlaneNameAndId(kHostThreads, /*id=*/0, &host_plane); + XSpace xspace; + XPlane* xplane = GetOrCreateHostXPlane(&xspace); + XPlaneBuilder host_plane(xplane); XLineBuilder thread1 = host_plane.GetOrCreateLine(/*line_id=*/10); AddTensorFlowOpEvent(absl::StrCat(kTfOp1, ":", kTfOp1), kTfOp1StartNs, kTfOp1DurationNs, /*on_device=*/false, @@ -72,7 +67,7 @@ TEST(ConvertXPlaneToOpMetricsDb, HostOpMetricsDb) { kTfOp2DurationNs, /*on_device=*/false, /*kernel_name=*/"", &host_plane, &thread2); - OpMetricsDb op_metrics = ConvertHostThreadsXPlaneToOpMetricsDb(xplane); + OpMetricsDb op_metrics = ConvertHostThreadsXPlaneToOpMetricsDb(*xplane); // Op1, Op2, Idle. 
EXPECT_EQ(3, op_metrics.metrics_db_size()); uint64 total_op_duration = @@ -115,10 +110,9 @@ TEST(ConvertXPlaneToOpMetricsDb, DeviceOpMetricsDb) { constexpr int64 kKernel3StartNs = 120000; constexpr int64 kKernel3DurationNs = 10000; - XPlane xplane; - XPlaneBuilder device_plane(&xplane); - SetXPlaneNameAndId(absl::StrCat(kGpuPlanePrefix, ":0"), /*id=*/1, - &device_plane); + XSpace xspace; + XPlane* xplane = GetOrCreateGpuXPlane(&xspace, /*device_ordinal=*/0); + XPlaneBuilder device_plane(xplane); XLineBuilder stream1 = device_plane.GetOrCreateLine(/*line_id=*/10); AddTensorFlowOpEvent(absl::StrCat(kTfOp1, ":", kTfOp1), kKernel1StartNs, kKernel1DurationNs, /*on_device=*/true, kKernel1, @@ -138,7 +132,7 @@ TEST(ConvertXPlaneToOpMetricsDb, DeviceOpMetricsDb) { &device_plane, &stream2); OpMetricsDb op_metrics = ConvertDeviceTraceXPlaneToOpMetricsDb( - xplane, /*peak_tera_flops_per_second=*/0, + *xplane, /*peak_tera_flops_per_second=*/0, /*peak_hbm_bw_giga_bytes_per_second=*/0); // kernel1, kernel2, kernel3, Idle. diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index eb2e13dbb4a..df050d16ede 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -129,7 +129,7 @@ void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space, } OpStats ConvertXSpaceToOpStats(const XSpace& space) { - const XPlane* host_plane = FindPlaneWithName(space, kHostThreads); + const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName); std::vector device_planes = FindPlanesWithPrefix(space, kGpuPlanePrefix); OpStats op_stats; diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index 138bcee72be..68bb8205f5e 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -43,8 +43,8 @@ TEST(ConvertXPlaneToOpStats, PerfEnv) { constexpr int kComputeCapMajor = 7; constexpr int kComputeCapMinor = 0; - XPlaneBuilder device_plane(space.add_planes()); - device_plane.SetName(absl::StrCat(kGpuPlanePrefix, ":0")); + XPlaneBuilder device_plane( + GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0)); device_plane.ParseAndAddStatValue( *device_plane.GetOrCreateStatMetadata("clock_rate"), absl::StrCat(kClockRateKHz)); @@ -71,10 +71,10 @@ TEST(ConvertXPlaneToOpStats, PerfEnv) { TEST(ConvertXPlaneToOpStats, RunEnvironment) { XSpace space; - XPlaneBuilder device_plane1(space.add_planes()); - device_plane1.SetName(absl::StrCat(kGpuPlanePrefix, ":0")); - XPlaneBuilder device_plane2(space.add_planes()); - device_plane2.SetName(absl::StrCat(kGpuPlanePrefix, ":1")); + XPlaneBuilder device_plane1( + GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0)); + XPlaneBuilder device_plane2( + GetOrCreateGpuXPlane(&space, /*device_ordinal=*/1)); GroupTfEvents(&space, /*event_group_name_map=*/nullptr); OpStats op_stats = ConvertXSpaceToOpStats(space); @@ -91,8 +91,7 @@ TEST(ConvertXPlaneToOpStats, CpuOnlyStepDbTest) { constexpr int64 kStepId = 0; XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(2); auto main_thread = host_plane_builder.GetOrCreateLine(0); @@ -120,8 +119,7 @@ TEST(ConvertXPlaneToOpStats, GpuStepDbTest) { constexpr int64 kCorrelationId = 100; XSpace space; - 
XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(2); auto main_thread = host_plane_builder.GetOrCreateLine(0); @@ -137,8 +135,8 @@ TEST(ConvertXPlaneToOpStats, GpuStepDbTest) { CreateXEvent(&host_plane_builder, &tf_executor_thread, "matmul", 30, 10, {{StatType::kCorrelationId, kCorrelationId}}); - XPlaneBuilder device_plane_builder(space.add_planes()); - device_plane_builder.SetName(absl::StrCat(kGpuPlanePrefix, ":0")); + XPlaneBuilder device_plane_builder( + GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0)); device_plane_builder.ReserveLines(1); auto stream = device_plane_builder.GetOrCreateLine(0); diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index 70a07171310..22af46c4380 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -139,7 +139,8 @@ Status ConvertXSpaceToProfileResponse(const XSpace& xspace, AddToolData(ToolName(kKernelStats), op_stats.kernel_stats_db(), response); } if (tools.contains(kMemoryProfile)) { - if (const XPlane* host_plane = FindPlaneWithName(xspace, kHostThreads)) { + if (const XPlane* host_plane = + FindPlaneWithName(xspace, kHostThreadsPlaneName)) { MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane); std::string json_output; TF_RETURN_IF_ERROR(ConvertProtoToJson(memory_profile, &json_output)); diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc index ff68f1817ed..1c6dfee7cc7 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc @@ -45,9 +45,8 @@ TEST(ConvertXPlaneToOpStats, CpuOnlyStepDbTest) { constexpr int64 kSecondCorrelationId = 200; XSpace space; - XPlane* host_plane = space.add_planes(); + XPlane* host_plane = GetOrCreateHostXPlane(&space); XPlaneBuilder host_plane_builder(host_plane); - host_plane_builder.SetName(kHostThreads); host_plane_builder.ReserveLines(2); auto main_thread = host_plane_builder.GetOrCreateLine(0); diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc index 12287217e04..a310313ff86 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc @@ -43,11 +43,12 @@ constexpr double kMaxError = 0.001; TfFunctionDb ConvertXSpaceToTfFunctionDb(const XSpace& space) { TfFunctionDb result; - const XPlane* host_plane = FindPlaneWithName(space, kHostThreads); + const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName); if (host_plane) { XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane); plane.ForEachLine([&result](const XLineVisitor& line) { - CombineTfFunctionDb(ConvertHostThreadsXLineToTfFunctionDb(line), &result); + TfFunctionDb tf_function_db = ConvertHostThreadsXLineToTfFunctionDb(line); + CombineTfFunctionDb(tf_function_db, &result); }); } return result; @@ -56,7 +57,7 @@ TfFunctionDb ConvertXSpaceToTfFunctionDb(const XSpace& space) { TEST(ConvertXPlaneToTfFunctions, CombineTwoThreads) { XSpace space; XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + 
host_plane_builder.SetName(kHostThreadsPlaneName); host_plane_builder.ReserveLines(2); std::string kFunctionName = "decrement"; @@ -100,7 +101,7 @@ TEST(ConvertXPlaneToTfFunctions, CombineTwoThreads) { TEST(ConvertXPlaneToTfFunctions, NestedFunctions) { XSpace space; XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + host_plane_builder.SetName(kHostThreadsPlaneName); host_plane_builder.ReserveLines(1); std::string kOuterFunctionName = "outer"; std::string kInnerFunctionName = "inner"; @@ -140,8 +141,7 @@ TEST(ConvertXPlaneToTfFunctions, NestedFunctions) { TEST(ConvertXPlaneToTfFunctions, EagerPlusConcrete) { XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(2); std::string kEagerFunctionName = "i_am_eager"; std::string kConcreteFunctionName = "i_am_concrete"; diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc index ffdad034337..f4a0145d8f6 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc @@ -41,7 +41,7 @@ Device BuildDeviceAndResource(const XPlaneVisitor& plane) { device.set_name(std::string(plane.Name())); device.set_device_id(plane.Id()); - bool sort_by_ordinal = plane.Name() == kHostThreads; + bool sort_by_ordinal = (plane.Name() == kHostThreadsPlaneName); int ordinal = 0; plane.ForEachLine([&](const XLineVisitor& line) { Resource resource; diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index be1a7a2777b..37f7baca1d3 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -146,7 +146,7 @@ Status HostTracer::CollectData(XSpace* space) { return errors::Internal("TraceMeRecorder not stopped"); } MakeCompleteEvents(&events_); - XPlane* plane = GetOrCreatePlane(space, kHostThreads); + XPlane* plane = FindOrAddMutablePlaneWithName(space, kHostThreadsPlaneName); plane->set_id(kHostPlaneId); ConvertCompleteEventsToXPlane(start_timestamp_ns_, events_, plane); events_.clear(); diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index 499b7b6b564..0e4c3dd7a9b 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -150,7 +150,7 @@ TEST(HostTracerTest, CollectsTraceMeEventsAsXSpace) { ASSERT_EQ(space.planes_size(), 1); const auto& plane = space.planes(0); XPlaneVisitor xplane(&plane); - ASSERT_EQ(plane.name(), kHostThreads); + ASSERT_EQ(plane.name(), kHostThreadsPlaneName); ASSERT_EQ(plane.lines_size(), 1); ASSERT_EQ(plane.event_metadata_size(), 7); ASSERT_EQ(plane.stat_metadata_size(), 2); diff --git a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc index da922d4b18b..58e6385a7ec 100644 --- a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc +++ b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc @@ -65,7 +65,7 @@ class MetadataCollector : public ProfilerInterface { Status CollectData(XSpace* space) override { if (!debug_info_.empty()) { - XPlane* plane = GetOrCreatePlane(space, kMetadataPlane); + XPlane* plane = 
FindOrAddMutablePlaneWithName(space, kMetadataPlaneName); plane->set_id(kMetadataPlaneId); XPlaneBuilder xplane(plane); const XStatMetadata& hlo_proto_stat = diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 5ddee687333..bc9952302e8 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -223,11 +223,12 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { << " callback api events and " << num_activity_events_ << " activity events. " << ReportDroppedEvents(); uint64 end_gpu_ns = CuptiTracer::GetTimestamp(); - XPlaneBuilder host_plane(GetOrCreatePlane(space, kCuptiDriverApiPlaneName)); + XPlaneBuilder host_plane( + FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName)); host_plane.SetId(kCuptiDriverApiPlaneId); for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) { - std::string name = absl::StrCat(kGpuPlanePrefix, device_ordinal); - XPlaneBuilder device_plane(GetOrCreatePlane(space, name)); + std::string name = GpuPlaneName(device_ordinal); + XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name)); device_plane.SetId(kGpuPlaneBaseId + device_ordinal); per_device_collector_[device_ordinal].Flush(start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane); diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 885c5e0ca4f..f7d97711da0 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -98,11 +98,10 @@ Status ProfilerSession::CollectData(profiler::XSpace* space) { const profiler::XPlane* cupti_driver_api_plane = profiler::FindPlaneWithName(*space, profiler::kCuptiDriverApiPlaneName); if (cupti_driver_api_plane) { - profiler::XPlane* host_plane = - profiler::GetOrCreatePlane(space, profiler::kHostThreads); + profiler::XPlane* host_plane = profiler::FindOrAddMutablePlaneWithName( + space, profiler::kHostThreadsPlaneName); profiler::MergePlanes(*cupti_driver_api_plane, host_plane); profiler::SortXLinesBy(host_plane, profiler::XLinesComparatorByName()); - // This might invalidate host_plane pointer. profiler::RemovePlaneWithName(space, profiler::kCuptiDriverApiPlaneName); } // 2. Normalize all timestamps by shifting timeline to profiling start time. diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index e0eaa5968c1..6942f3ea306 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -233,6 +233,7 @@ cc_library( deps = [ ":xplane_builder", ":xplane_schema", + ":xplane_utils", "//tensorflow/core/platform:types", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/container:flat_hash_map", diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index 112c0977763..f63a8e5c2d9 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -339,9 +339,10 @@ void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, XSpace* space, bool step_info_only) { for (XPlane& plane : *space->mutable_planes()) { // Derived timelines only generated for device traces. 
- if (plane.id() == kHostPlaneId) continue; - DeriveEventsFromAnnotations(DummySymbolResolver, event_group_name_map, - &plane, step_info_only); + if (IsGpuPlaneName(plane.name())) { + DeriveEventsFromAnnotations(DummySymbolResolver, event_group_name_map, + &plane, step_info_only); + } } } diff --git a/tensorflow/core/profiler/utils/derived_timeline_test.cc b/tensorflow/core/profiler/utils/derived_timeline_test.cc index 4ae558eb446..a75ba8ea085 100644 --- a/tensorflow/core/profiler/utils/derived_timeline_test.cc +++ b/tensorflow/core/profiler/utils/derived_timeline_test.cc @@ -44,7 +44,7 @@ TEST(DerivedTimelineTest, HloModuleNameTest) { const absl::string_view kKernelDetails = "kernel_details"; XSpace space; EventGroupNameMap event_group_name_map; - XPlane* plane = space.add_planes(); + XPlane* plane = GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, @@ -74,7 +74,7 @@ TEST(DerivedTimelineTest, TfOpLineTest) { const absl::string_view kKernelDetails = "kernel_details"; XSpace space; EventGroupNameMap event_group_name_map; - XPlane* plane = space.add_planes(); + XPlane* plane = GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, @@ -109,7 +109,7 @@ TEST(DerivedTimelineTest, DependencyTest) { const absl::string_view kKernelDetails = "kernel_details"; XSpace space; EventGroupNameMap event_group_name_map({{0, "train 0"}, {1, "train 1"}}); - XPlane* plane = space.add_planes(); + XPlane* plane = GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, @@ -138,7 +138,7 @@ TEST(DerivedTimelineTest, TfOpNameScopeTest) { const absl::string_view kKernelDetails = "kernel_details"; XSpace space; EventGroupNameMap event_group_name_map; - XPlane* plane = space.add_planes(); + XPlane* plane = GetOrCreateGpuXPlane(&space, /*device_ordinal=*/0); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index 2f4e3c8b3f1..be8dd506b0c 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -54,13 +54,13 @@ void CreateStatMetadata(XPlane* plane) { } // Returns event type if it is a KernelLaunch or KernelExecute event. -absl::optional GetKernelEventType(const XPlaneVisitor& visitor, +absl::optional GetKernelEventType(bool is_host_plane, + const XPlaneVisitor& visitor, const XEvent& event) { for (const auto& stat : event.stats()) { if (visitor.GetStatType(stat) == StatType::kCorrelationId) { - // TODO(b/149095099): avoid string comparison. - return visitor.Name() == kHostThreads ? HostEventType::kKernelLaunch - : HostEventType::kKernelExecute; + return is_host_plane ? 
HostEventType::kKernelLaunch + : HostEventType::kKernelExecute; } } return absl::nullopt; @@ -72,14 +72,15 @@ bool IsTfOpEvent(const XPlaneVisitor& visitor, const XEvent& event) { return tf_op.category == Category::kTensorFlow; } -int64 GetEventType(const XPlaneVisitor& visitor, const XEvent& event) { +int64 GetEventType(bool is_host_plane, const XPlaneVisitor& visitor, + const XEvent& event) { if (absl::optional event_type = visitor.GetEventType(event)) { return *event_type; } else if (absl::optional kernel_event_type = - GetKernelEventType(visitor, event)) { + GetKernelEventType(is_host_plane, visitor, event)) { // KernelLaunch and KernelExecute event types are not supported by // XPlaneVisitor and should be checked separately. - // TODO(148346217): Make XPlaneVisitor support KernelLaunch and + // TODO(b/148346217): Make XPlaneVisitor support KernelLaunch and // KernelExecute event types. return *kernel_event_type; } else if (IsTfOpEvent(visitor, event)) { @@ -396,6 +397,8 @@ bool EventNode::StartsBefore(const EventNode& other) const { void EventForest::ConnectIntraThread(const XPlaneVisitor& visitor, XPlane* plane, ContextGroupMap* context_groups) { + // TODO(b/149095099): avoid string comparison. + bool is_host_plane = (visitor.Name() == kHostThreadsPlaneName); for (auto& line : *plane->mutable_lines()) { std::vector parent_nodes; for (auto& event : *line.mutable_events()) { @@ -418,7 +421,7 @@ void EventForest::ConnectIntraThread(const XPlaneVisitor& visitor, } parent_nodes.push_back(cur_node.get()); // event_node_map_ keeps cur_node alive. - event_node_map_[GetEventType(visitor, event)].push_back( + event_node_map_[GetEventType(is_host_plane, visitor, event)].push_back( std::move(cur_node)); } } diff --git a/tensorflow/core/profiler/utils/group_events_test.cc b/tensorflow/core/profiler/utils/group_events_test.cc index eab9527de09..e9f5d58f8d5 100644 --- a/tensorflow/core/profiler/utils/group_events_test.cc +++ b/tensorflow/core/profiler/utils/group_events_test.cc @@ -36,8 +36,7 @@ TEST(GroupEventsTest, GroupGpuTraceTest) { constexpr int64 kCorrelationId = 100; XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(2); auto main_thread = host_plane_builder.GetOrCreateLine(0); @@ -78,8 +77,7 @@ TEST(GroupEventsTest, GroupTensorFlowLoopTest) { constexpr int64 kCorrelationId = 100; XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(1); auto tf_executor_thread = host_plane_builder.GetOrCreateLine(0); @@ -125,8 +123,7 @@ TEST(GroupEventsTest, GroupMultipleTensorFlowLoopsTest) { constexpr int64 kSecondIterNumStart = 0; XSpace space; - XPlaneBuilder host_plane_builder(space.add_planes()); - host_plane_builder.SetName(kHostThreads); + XPlaneBuilder host_plane_builder(GetOrCreateHostXPlane(&space)); host_plane_builder.ReserveLines(2); auto first_tf_executor_thread = host_plane_builder.GetOrCreateLine(0); @@ -163,9 +160,8 @@ TEST(GroupEventsTest, GroupFunctionalOp) { constexpr int64 kFunctionStepId = 1; XSpace space; - XPlane* host_plane = space.add_planes(); + XPlane* host_plane = GetOrCreateHostXPlane(&space); XPlaneBuilder host_plane_builder(host_plane); - host_plane_builder.SetName(kHostThreads); host_plane_builder.ReserveLines(2); auto main_thread = 
host_plane_builder.GetOrCreateLine(0); @@ -209,9 +205,8 @@ TEST(GroupEventsTest, EagerOpTest) { constexpr int64 kCorrelationId = 100; XSpace space; - XPlane* host_plane = space.add_planes(); + XPlane* host_plane = GetOrCreateHostXPlane(&space); XPlaneBuilder host_plane_builder(host_plane); - host_plane_builder.SetName(kHostThreads); host_plane_builder.ReserveLines(1); auto main_thread = host_plane_builder.GetOrCreateLine(0); @@ -255,9 +250,8 @@ TEST(GroupEventsTest, FunctionOpTest) { constexpr int64 kCorrelationId = 100; XSpace space; - XPlane* host_plane = space.add_planes(); + XPlane* host_plane = GetOrCreateHostXPlane(&space); XPlaneBuilder host_plane_builder(host_plane); - host_plane_builder.SetName(kHostThreads); host_plane_builder.ReserveLines(2); auto main_thread = host_plane_builder.GetOrCreateLine(0); diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index c7a2cb6e37e..2c79df7980f 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -26,11 +26,11 @@ limitations under the License. namespace tensorflow { namespace profiler { -const absl::string_view kHostThreads = "/host:CPU"; +const absl::string_view kHostThreadsPlaneName = "/host:CPU"; const absl::string_view kGpuPlanePrefix = "/device:GPU:"; const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI"; -const absl::string_view kMetadataPlane = "/host:metadata"; -const absl::string_view kTFStreamzPlane = "/host:tfstreamz"; +const absl::string_view kMetadataPlaneName = "/host:metadata"; +const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz"; const absl::string_view kStepLineName = "Steps"; const absl::string_view kTensorFlowNameScopeLineName = "TensorFlow Name Scope"; diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 84b374e488d..a045e20d8de 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/platform/logging.h" @@ -25,15 +27,15 @@ namespace tensorflow { namespace profiler { // Name of XPlane that contains TraceMe events. -ABSL_CONST_INIT extern const absl::string_view kHostThreads; +ABSL_CONST_INIT extern const absl::string_view kHostThreadsPlaneName; // Name prefix of XPlane that contains GPU events. ABSL_CONST_INIT extern const absl::string_view kGpuPlanePrefix; // Name of XPlane that contains CUPTI driver API generated events. ABSL_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName; // Name of XPlane that contains profile metadata such as XLA debug info. -ABSL_CONST_INIT extern const absl::string_view kMetadataPlane; +ABSL_CONST_INIT extern const absl::string_view kMetadataPlaneName; // Name of XPlane that contains kpi related metrics. -ABSL_CONST_INIT extern const absl::string_view kTFStreamzPlane; +ABSL_CONST_INIT extern const absl::string_view kTFStreamzPlaneName; // Names of XLines that contain ML-level events. 
ABSL_CONST_INIT extern const absl::string_view kStepLineName; @@ -184,6 +186,14 @@ enum StatType { kLastStatType = kDevCapComputeCapMinor, }; +inline std::string GpuPlaneName(int32 device_ordinal) { + return absl::StrCat(kGpuPlanePrefix, device_ordinal); +} + +inline bool IsGpuPlaneName(absl::string_view plane_name) { + return absl::StartsWith(plane_name, kGpuPlanePrefix); +} + absl::string_view GetHostEventTypeStr(HostEventType event_type); bool IsHostEventType(HostEventType event_type, absl::string_view event_name); diff --git a/tensorflow/core/profiler/utils/xplane_test_utils.cc b/tensorflow/core/profiler/utils/xplane_test_utils.cc index cd8821f05a8..a389d3619bd 100644 --- a/tensorflow/core/profiler/utils/xplane_test_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_test_utils.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_utils.h" namespace tensorflow { namespace profiler { @@ -44,6 +45,15 @@ class XStatValueVisitor { } // namespace +XPlane* GetOrCreateHostXPlane(XSpace* space) { + return FindOrAddMutablePlaneWithName(space, kHostThreadsPlaneName); +} + +XPlane* GetOrCreateGpuXPlane(XSpace* space, int32 device_ordinal) { + std::string name = GpuPlaneName(device_ordinal); + return FindOrAddMutablePlaneWithName(space, name); +} + void CreateXEvent( XPlaneBuilder* plane_builder, XLineBuilder* line_builder, absl::string_view event_name, int64 offset_ps, int64 duration_ps, diff --git a/tensorflow/core/profiler/utils/xplane_test_utils.h b/tensorflow/core/profiler/utils/xplane_test_utils.h index 9abf09fc695..89fda765771 100644 --- a/tensorflow/core/profiler/utils/xplane_test_utils.h +++ b/tensorflow/core/profiler/utils/xplane_test_utils.h @@ -28,6 +28,10 @@ namespace profiler { using XStatValue = absl::variant; +XPlane* GetOrCreateHostXPlane(XSpace* space); + +XPlane* GetOrCreateGpuXPlane(XSpace* space, int32 device_ordinal); + void CreateXEvent( XPlaneBuilder* plane_builder, XLineBuilder* line_builder, absl::string_view event_name, int64 offset_ps, int64 duration_ps, diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 10d40eee3b4..3fa421c3459 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -65,12 +65,19 @@ std::vector FindPlanesWithPrefix(const XSpace& space, return result; } -XPlane* GetOrCreatePlane(XSpace* space, absl::string_view name) { +XPlane* FindMutablePlaneWithName(XSpace* space, absl::string_view name) { for (XPlane& plane : *space->mutable_planes()) { if (plane.name() == name) return &plane; } - XPlane* plane = space->add_planes(); - plane->set_name(std::string(name)); + return nullptr; +} + +XPlane* FindOrAddMutablePlaneWithName(XSpace* space, absl::string_view name) { + XPlane* plane = FindMutablePlaneWithName(space, name); + if (plane == nullptr) { + plane = space->add_planes(); + plane->set_name(name.data(), name.size()); + } return plane; } @@ -128,22 +135,6 @@ void RemoveEmptyLines(XPlane* plane) { lines->end()); } -XPlane* FindMutablePlaneWithName(XSpace* space, absl::string_view name) { - for (XPlane& plane : *space->mutable_planes()) { - if (plane.name() == name) return &plane; - } - return nullptr; -} - -XPlane* FindOrAddMutablePlaneWithName(XSpace* space, absl::string_view name) { - XPlane* plane = 
FindMutablePlaneWithName(space, name); - if (plane == nullptr) { - plane = space->add_planes(); - plane->set_name(std::string(name)); - } - return plane; -} - void SortXPlane(XPlane* plane) { for (XLine& line : *plane->mutable_lines()) { auto& events = *line.mutable_events(); diff --git a/tensorflow/core/profiler/utils/xplane_utils.h b/tensorflow/core/profiler/utils/xplane_utils.h index 11a2c28f719..7575244e7bd 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.h +++ b/tensorflow/core/profiler/utils/xplane_utils.h @@ -32,8 +32,12 @@ const XPlane* FindPlaneWithName(const XSpace& space, absl::string_view name); std::vector FindPlanesWithPrefix(const XSpace& space, absl::string_view prefix); -// Returns the plane with the given name, create it if necessary. -XPlane* GetOrCreatePlane(XSpace* space, absl::string_view name); +// Returns the plane with the given name in the container or null if not found. +XPlane* FindMutablePlaneWithName(XSpace* space, absl::string_view name); + +// Returns the plane with the given name in the container. If necessary, adds a +// new plane to the container. +XPlane* FindOrAddMutablePlaneWithName(XSpace* space, absl::string_view name); // Returns true if event is nested by parent. bool IsNested(const tensorflow::profiler::XEvent& event, @@ -49,13 +53,6 @@ void RemovePlaneWithName(XSpace* space, absl::string_view name); void RemoveEmptyPlanes(XSpace* space); void RemoveEmptyLines(XPlane* plane); -// Returns the plane with the given name in the container or null if not found. -XPlane* FindMutablePlaneWithName(XSpace* space, absl::string_view name); - -// Returns the plane with the given name in the container. If necessary, adds a -// new plane to the container. -XPlane* FindOrAddMutablePlaneWithName(XSpace* space, absl::string_view name); - // Sort lines in plane with a provided comparator. template void SortXLinesBy(XPlane* plane, Compare comp) { From a428ec179e3037c164886df692236561114830c0 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 16:06:18 -0700 Subject: [PATCH 0224/1390] Added new compiler option. PiperOrigin-RevId: 316564867 Change-Id: I5e0b54b99395d322041207294cbdda5efc62ff76 --- tensorflow/lite/delegates/gpu/cl/cl_program.cc | 2 ++ tensorflow/lite/delegates/gpu/cl/cl_program.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.cc b/tensorflow/lite/delegates/gpu/cl/cl_program.cc index 690bc598777..3b821dc3a5d 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_program.cc +++ b/tensorflow/lite/delegates/gpu/cl/cl_program.cc @@ -93,6 +93,8 @@ std::string CompilerOptionToString(const CLDevice& device, return "-cl-fast-relaxed-math"; case CompilerOptions::CL_OPT_DISABLE: return "-cl-opt-disable"; + case CompilerOptions::CL_2_0: + return "-cl-std=CL2.0"; } } diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.h b/tensorflow/lite/delegates/gpu/cl/cl_program.h index fb2a7edb9c1..138b7d9fbd0 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_program.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_program.h @@ -40,7 +40,8 @@ enum class CompilerOptions { ADRENO_FULL_SIMD_LINE, ADRENO_MORE_WAVES, POWERVR_FP16, - CL_OPT_DISABLE + CL_OPT_DISABLE, + CL_2_0 }; std::string CompilerOptionsToString( From 0c68775ed3682849849113db4c957a0a16e84000 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 15 Jun 2020 16:13:08 -0700 Subject: [PATCH 0225/1390] Introduces a new experimental package that: - Defines a schema for configuring delegates - Defines a C++ plugin mechanism using the schema, so that code can support configuring arbitrary delegates without a build-time dependency PiperOrigin-RevId: 316566081 Change-Id: I4d36b4e155dd30fbdf57d60ef4b546304c033b1a --- .../acceleration/configuration/BUILD | 160 -------------- .../configuration/configuration.proto | 208 ------------------ .../configuration/delegate_registry.cc | 60 ----- .../configuration/delegate_registry.h | 95 -------- .../acceleration/configuration/gpu_plugin.cc | 62 ------ .../configuration/hexagon_plugin.cc | 73 ------ .../configuration/nnapi_plugin.cc | 93 -------- .../configuration/nnapi_plugin_test.cc | 175 --------------- .../configuration/proto_to_flatbuffer.cc | 58 ----- .../configuration/proto_to_flatbuffer.h | 32 --- 10 files changed, 1016 deletions(-) delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/BUILD delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/configuration.proto delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc delete mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h diff --git a/tensorflow/lite/experimental/acceleration/configuration/BUILD b/tensorflow/lite/experimental/acceleration/configuration/BUILD deleted file mode 100644 index 38d28d5cc2e..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/BUILD +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library", "flatbuffer_java_library", "flatc_path") - -package( - default_visibility = [ - "//visibility:public", - ], - licenses = ["notice"], # Apache 2.0 -) - -genrule( - name = "configuration_schema", - srcs = ["configuration.proto"], - outs = ["configuration.fbs"], - # We rename the namespace since otherwise the proto classes and flatbuffer - # classes would have the same names. 
- cmd = """ - $(location {}) --proto -o $(@D) $(location :configuration.proto) - perl -p -i -e 's/tflite.proto/tflite/' $(@D)/configuration.fbs - """.format(flatc_path), - tools = [ - flatc_path, - ], -) - -genrule( - name = "configuration_fbs_contents_cc", - srcs = ["configuration.fbs"], - outs = ["configuration_fbs_contents-inl.h"], - cmd = """ - echo 'constexpr char configuration_fbs_contents[] = R"Delimiter(' > $(@) - cat < $(<) >> $(@) - echo ')Delimiter";' >> $(@) - """, -) - -proto_library( - name = "configuration_proto", - srcs = [ - "configuration.proto", - ], -) - -cc_proto_library( - name = "configuration_cc_proto", - deps = [":configuration_proto"], -) - -java_lite_proto_library( - name = "configuration_java_proto_lite", - deps = [":configuration_proto"], -) - -flatbuffer_cc_library( - name = "configuration_fbs", - srcs = [":configuration.fbs"], -) - -flatbuffer_java_library( - name = "configuration_fbs_java", - srcs = [":configuration.fbs"], -) - -cc_library( - name = "proto_to_flatbuffer", - srcs = [ - "configuration_fbs_contents-inl.h", - "proto_to_flatbuffer.cc", - ], - hdrs = ["proto_to_flatbuffer.h"], - deps = [ - ":configuration_cc_proto", - ":configuration_fbs", - "//tensorflow/core/platform:protobuf", - "//tensorflow/lite:minimal_logging", - "@flatbuffers", - ], -) - -cc_library( - name = "delegate_registry", - srcs = ["delegate_registry.cc"], - hdrs = ["delegate_registry.h"], - deps = [ - ":configuration_fbs", - "//tensorflow/lite/c:common", - "@com_google_absl//absl/synchronization", - ], -) - -cc_library( - name = "nnapi_plugin", - srcs = ["nnapi_plugin.cc"], - deps = [ - ":configuration_fbs", - ":delegate_registry", - "//tensorflow/lite/delegates/nnapi:nnapi_delegate", - "@com_google_absl//absl/memory", - ], - alwayslink = 1, # For registration to always run. -) - -cc_test( - name = "nnapi_plugin_test", - srcs = ["nnapi_plugin_test.cc"], - deps = [ - ":configuration_fbs", - ":delegate_registry", - ":nnapi_plugin", - "//tensorflow/lite:framework", - "//tensorflow/lite/c:common", - "//tensorflow/lite/delegates/nnapi:nnapi_delegate", - "//tensorflow/lite/delegates/nnapi:nnapi_delegate_mock_test", - "//tensorflow/lite/kernels:test_util", - "@com_google_googletest//:gtest_main", - "@flatbuffers", - ], -) - -cc_library( - name = "hexagon_plugin", - srcs = ["hexagon_plugin.cc"], - deps = [ - ":configuration_fbs", - ":delegate_registry", - "@com_google_absl//absl/memory", - ] + select({ - "//tensorflow:android": [ - "//tensorflow/lite/delegates/hexagon:hexagon_delegate", - ], - "//conditions:default": [], - }), - alwayslink = 1, # For registration to always run. -) - -cc_library( - name = "gpu_plugin", - srcs = ["gpu_plugin.cc"], - deps = [ - ":configuration_fbs", - ":delegate_registry", - "//tensorflow/lite/delegates/gpu:delegate", - "@com_google_absl//absl/memory", - ], - alwayslink = 1, # For registration to always run. -) diff --git a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto deleted file mode 100644 index e1c49f02856..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright 2020 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This schema defines how to configure TFLite for delegation. These -// definitions can be used in multiple ways: as output of a compatibility list, -// in benchmarking tools and to decouple delegate instantiation from code. -// -// The schema is work-in-progress, covering the most broadly used delegates and -// options. - -syntax = "proto2"; - -package tflite.proto; - -// ExecutionPreference is used to match accelerators against the preferences of -// the current application or usecase. Some of the values here can appear both -// in the compatibility list and as input, some only as input. -// -// These are separate from NNAPIExecutionPreference - the compatibility list -// design doesn't assume a one-to-one mapping between which usecases -// compatibility list entries have been developed for and what settings are used -// for NNAPI. -enum ExecutionPreference { - // Match any selected preference. Whitelist (semantically - value is same as - // on input). - ANY = 0; - // Match low latency preference. Both compatibility list and input. - LOW_LATENCY = 1; - // Math low power preference. Both compatibility list and input. - LOW_POWER = 2; - // Never accelerate. Can be used for input to compatibility list or for - // standalone Acceleration configuration. - FORCE_CPU = 3; -} - -// TFLite delegate to use. -enum Delegate { - NONE = 0; - NNAPI = 1; - GPU = 2; - HEXAGON = 3; - XNNPACK = 4; - // TODO(b/157893534): Support exposing edgetpu tflite delegate creation - // options. - EDGETPU = 5; -} - -enum NNAPIExecutionPreference { - // Undefined. - UNDEFINED = 0; - // Prefer executing in a way that minimizes battery drain. - NNAPI_LOW_POWER = 1; - // Prefer returning a single answer as fast as possible, even if this causes - // more power consumption. - NNAPI_FAST_SINGLE_ANSWER = 2; - // Prefer maximizing the throughput of successive frames, for example when - // processing successive frames coming from the camera. - NNAPI_SUSTAINED_SPEED = 3; -} - -// One possible acceleration configuration. -message ComputeSettings { - // Which preference to use this accelerator for. - optional ExecutionPreference preference = 1; - // How to configure TFLite - optional TFLiteSettings tflite_settings = 2; - // Identifiers to use for instrumentation and telemetry. - optional string model_namespace_for_statistics = 3; - optional string model_identifier_for_statistics = 4; -} - -// NNAPI delegate settings. -message NNAPISettings { - // Which instance (NNAPI accelerator) to use. One driver may provide several - // accelerators (though a driver may also hide several back-ends behind one - // name, at the choice of the driver vendor). - // Note that driver introspection is only available in Android Q and later. - optional string accelerator_name = 1; - - // NNAPI model compilation caching settings to be passed to - // tflite::StatefulNnApiDelegate - optional string cache_directory = 2; - optional string model_token = 3; - - // NNAPI execution preference to pass. 
See - // https://developer.android.com/ndk/reference/group/neural-networks.html - optional NNAPIExecutionPreference execution_preference = 4; - - // Number of instances to cache for the same model (for input size - // changes). This is mandatory for getting reasonable performance in that - // case. - optional int32 no_of_nnapi_instances_to_cache = 5; - - // Whether to automatically fall back to TFLite CPU path. - optional FallbackSettings fallback_settings = 6; - - // Whether to allow use of NNAPI CPU (nnapi-reference accelerator) on Android - // 10+ when an accelerator name is not specified. The NNAPI CPU typically - // performs less well than the TfLite built-in kernels; but allowing allows a - // model to be partially accelerated which may be a win. - optional bool allow_nnapi_cpu_on_android_10_plus = 7; -} - -// Which GPU backend to select. Default behaviour on Android is to try OpenCL -// and if it's not available fall back to OpenGL. -enum GPUBackend { - UNSET = 0; - OPENCL = 1; - OPENGL = 2; - // Not yet supported. - // VULKAN = 3; - // METAL = 4; -} - -// GPU Delegate settings. -// -// See -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/gpu/delegate.h -message GPUSettings { - optional bool is_precision_loss_allowed = 1; - optional bool enable_quantized_inference = 2 [default = true]; - optional GPUBackend force_backend = 3; - // TODO(b/152019007): add remaining options. -} - -// Hexagon Delegate settings. -// -// See -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/hexagon/hexagon_delegate.h -message HexagonSettings { - optional int32 debug_level = 1; - optional int32 powersave_level = 2; - optional bool print_graph_profile = 3; - optional bool print_graph_debug = 4; -} - -// XNNPack Delegate settings. -// -// See -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h -message XNNPackSettings { - optional int32 num_threads = 1; -} - -message CPUSettings { - optional int32 num_threads = 1; -} - -// How to configure TFLite. -message TFLiteSettings { - // Which delegate to use. - optional Delegate delegate = 1; - - // How to configure the chosen delegate. - // (In principle we would like to use 'oneof', but flatc turns that into an - // nested anonymous table rather than a union. See - // https://github.com/google/flatbuffers/issues/4628). - optional NNAPISettings nnapi_settings = 2; - optional GPUSettings gpu_settings = 3; - optional HexagonSettings hexagon_settings = 4; - optional XNNPackSettings xnnpack_settings = 5; - - // How to configure CPU execution. - optional CPUSettings cpu_settings = 6; - - // Shared delegation settings. - optional int32 max_delegated_partitions = 7; -} - -// Whether to automatically fallback to TFLite CPU path on delegation errors. -// -// Typically fallback is enabled in production use but disabled in tests and -// benchmarks to ensure they test the intended path. -message FallbackSettings { - // Whether to allow automatically falling back to TfLite CPU path on - // compilation failure. Default is not allowing automatic fallback. - // - // This is useful in naive production usecases where the caller would prefer - // for the model to run even if it's not accelerated. More advanced users will - // implement fallback themselves; e.g., by using a different model on CPU. - // - // Note that compilation errors may occur either at initial - // ModifyGraphWithDelegate() time, or when calling AllocateTensors() after - // resizing. 
- optional bool allow_automatic_fallback_on_compilation_error = 7; - // Whether to allow automatically falling back to TfLite CPU path on - // execution error. Default is not allowing automatic fallback. - // - // Experimental, use with care (only when you have complete control over the - // client code). - // - // The caveat above for compilation error holds. Additionally, execution-time - // errors are harder to handle automatically as they require invalidating the - // TfLite interpreter which most client code has not been designed to deal - // with. - optional bool allow_automatic_fallback_on_execution_error = 8; -} diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc deleted file mode 100644 index b8d80342d5f..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" - -#include "absl/synchronization/mutex.h" - -namespace tflite { -namespace delegates { - -void DelegatePluginRegistry::RegisterImpl( - const std::string& name, - std::function< - std::unique_ptr(const TFLiteSettings&)> - creator_function) { - absl::MutexLock lock(&mutex_); - factories_[name] = creator_function; -} - -std::unique_ptr DelegatePluginRegistry::CreateImpl( - const std::string& name, const TFLiteSettings& settings) { - absl::MutexLock lock(&mutex_); - auto it = factories_.find(name); - if (it != factories_.end()) { - return it->second(settings); - } else { - return nullptr; - } -} - -DelegatePluginRegistry* DelegatePluginRegistry::GetSingleton() { - static auto* instance = new DelegatePluginRegistry(); - return instance; -} - -std::unique_ptr DelegatePluginRegistry::CreateByName( - const std::string& name, const TFLiteSettings& settings) { - auto* const instance = DelegatePluginRegistry::GetSingleton(); - return instance->CreateImpl(name, settings); -} - -DelegatePluginRegistry::Register::Register(const std::string& name, - CreatorFunction creator_function) { - auto* const instance = DelegatePluginRegistry::GetSingleton(); - instance->RegisterImpl(name, creator_function); -} - -} // namespace delegates -} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h deleted file mode 100644 index c86759dcc3f..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ - -#include -#include - -#include "absl/synchronization/mutex.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" - -// Defines an interface for TFLite delegate plugins. -// -// The acceleration library aims to support all TFLite delegates based on -// configuration expressed as data (flatbuffers). However, consumers tend to -// care about size and also use a subset of delegates. Hence we don't want to -// statically build against all delegates. -// -// This interface allows plugins to handle specific delegates. -// -// Goal of this interface is not to abstract away all the differences between -// delegates. The goal is only to avoid static linking. -// -// Note to implementers: this interface may change if new delegates don't fit -// into the same design. -namespace tflite { -namespace delegates { - -// Same w/ Interpreter::TfLiteDelegatePtr to avoid pulling -// tensorflow/lite/interpreter.h dependency -using TfLiteDelegatePtr = - std::unique_ptr; - -class DelegatePluginInterface { - public: - virtual TfLiteDelegatePtr Create() = 0; - virtual int GetDelegateErrno(TfLiteDelegate* from_delegate) = 0; - virtual ~DelegatePluginInterface() = default; -}; - -// A stripped-down registry that allows delegate plugins to be created by name. -// -// Limitations: -// - Doesn't allow deregistration. -// - Doesn't check for duplication registration. -// -class DelegatePluginRegistry { - public: - typedef std::function( - const TFLiteSettings&)> - CreatorFunction; - // Returns a DelegatePluginInterface registered with `name` or nullptr if no - // matching plugin found. - // TFLiteSettings is per-plugin, so that the corresponding delegate options - // data lifetime is maintained. - static std::unique_ptr CreateByName( - const std::string& name, const TFLiteSettings& settings); - - // Struct to be statically allocated for registration. 
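  // Registration is normally spelled with the
  // TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION macro defined at the bottom of
  // this file. A minimal sketch, assuming a hypothetical MyPlugin class with a
  // static New(const TFLiteSettings&) factory:
  //
  //   TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(MyPlugin, MyPlugin::New);
  //   ...
  //   auto plugin = DelegatePluginRegistry::CreateByName("MyPlugin", settings);
  //   if (plugin) {
  //     TfLiteDelegatePtr delegate = plugin->Create();
  //   }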
- struct Register { - Register(const std::string& name, CreatorFunction creator_function); - }; - - private: - void RegisterImpl(const std::string& name, CreatorFunction creator_function); - std::unique_ptr CreateImpl( - const std::string& name, const TFLiteSettings& settings); - static DelegatePluginRegistry* GetSingleton(); - std::unordered_map factories_; - absl::Mutex mutex_; -}; - -} // namespace delegates -} // namespace tflite - -#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f) \ - static auto* g_delegate_plugin_##name##_ = \ - new DelegatePluginRegistry::Register(#name, f); -#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(name, f) \ - TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f); - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ diff --git a/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc deleted file mode 100644 index 25b8171c5ea..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include - -#include "absl/memory/memory.h" -#include "tensorflow/lite/delegates/gpu/delegate.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" -#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" - -namespace tflite { -namespace delegates { -class GpuPlugin : public DelegatePluginInterface { - public: - TfLiteDelegatePtr Create() override { - return TfLiteDelegatePtr(TfLiteGpuDelegateV2Create(&options_), - TfLiteGpuDelegateV2Delete); - } - int GetDelegateErrno(TfLiteDelegate* from_delegate) override { return 0; } - static std::unique_ptr New( - const TFLiteSettings& acceleration) { - return absl::make_unique(acceleration); - } - explicit GpuPlugin(const TFLiteSettings& tflite_settings) - : options_(TfLiteGpuDelegateOptionsV2Default()) { - const auto* gpu_settings = tflite_settings.gpu_settings(); - if (gpu_settings) { - options_.inference_priority1 = - gpu_settings->is_precision_loss_allowed() - ? 
TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY - : TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; - if (gpu_settings->enable_quantized_inference()) { - options_.experimental_flags |= - TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; - } - if (gpu_settings->force_backend() == GPUBackend_OPENCL) { - options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY; - } else if (gpu_settings->force_backend() == GPUBackend_OPENGL) { - options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY; - } - } - } - - private: - TfLiteGpuDelegateOptionsV2 options_; -}; - -TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(GpuPlugin, GpuPlugin::New); - -} // namespace delegates -} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc deleted file mode 100644 index 7f2674604b0..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include - -#include "absl/memory/memory.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" -#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" - -#if defined(__ARM_ARCH) -#include "tensorflow/lite/delegates/hexagon/hexagon_delegate.h" -#endif - -namespace tflite { -namespace delegates { -class HexagonPlugin : public DelegatePluginInterface { - public: - TfLiteDelegatePtr Create() override { -#if defined(__ARM_ARCH) - TfLiteHexagonInit(); - auto* delegate_ptr = TfLiteHexagonDelegateCreate(&options_); - TfLiteDelegatePtr delegate(delegate_ptr, [](TfLiteDelegate* delegate) { - TfLiteHexagonDelegateDelete(delegate); - TfLiteHexagonTearDown(); - }); - return delegate; -#else // !defined(__ARM_ARCH) - return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); -#endif // defined(__ARM_ARCH) - } - int GetDelegateErrno(TfLiteDelegate* /* from_delegate */) override { - return 0; - } - static std::unique_ptr New( - const TFLiteSettings& tflite_settings) { - return absl::make_unique(tflite_settings); - } - explicit HexagonPlugin(const TFLiteSettings& tflite_settings) { - const HexagonSettings* settings = tflite_settings.hexagon_settings(); -#if defined(__ARM_ARCH) - options_ = TfLiteHexagonDelegateOptions({0}); - if (settings) { - options_.debug_level = settings->debug_level(); - options_.powersave_level = settings->powersave_level(); - options_.print_graph_profile = settings->print_graph_profile(); - options_.print_graph_debug = settings->print_graph_debug(); - } -#else - (void)settings; -#endif - } - - private: -#if defined(__ARM_ARCH) - TfLiteHexagonDelegateOptions options_; -#endif -}; - -TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(HexagonPlugin, HexagonPlugin::New); - -} // namespace delegates -} // namespace tflite diff --git 
a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc deleted file mode 100644 index 7301983a815..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include - -#include "absl/memory/memory.h" -#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" -#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" - -namespace tflite { -namespace delegates { - -inline tflite::StatefulNnApiDelegate::Options::ExecutionPreference -ConvertExecutionPrefence( - NNAPIExecutionPreference from_compatibility_preference) { - using TflitePreference = - tflite::StatefulNnApiDelegate::Options::ExecutionPreference; - switch (from_compatibility_preference) { - case NNAPIExecutionPreference_NNAPI_LOW_POWER: - return TflitePreference::kLowPower; - case NNAPIExecutionPreference_NNAPI_FAST_SINGLE_ANSWER: - return TflitePreference::kFastSingleAnswer; - case NNAPIExecutionPreference_NNAPI_SUSTAINED_SPEED: - return TflitePreference::kSustainedSpeed; - default: - return TflitePreference::kUndefined; - } -} - -class NnapiPlugin : public DelegatePluginInterface { - public: - TfLiteDelegatePtr Create() override { - auto nnapi_delegate = - absl::make_unique(options_); - return TfLiteDelegatePtr( - nnapi_delegate.release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); - } - int GetDelegateErrno(TfLiteDelegate* from_delegate) override { - auto nnapi_delegate = - reinterpret_cast(from_delegate); - return nnapi_delegate->GetNnApiErrno(); - } - static std::unique_ptr New( - const TFLiteSettings& tflite_settings) { - return absl::make_unique(tflite_settings); - } - explicit NnapiPlugin(const TFLiteSettings& tflite_settings) { - const NNAPISettings* nnapi_settings = tflite_settings.nnapi_settings(); - if (!nnapi_settings) return; - if (nnapi_settings->accelerator_name() && - nnapi_settings->accelerator_name()->Length() != 0) { - accelerator_ = nnapi_settings->accelerator_name()->str(); - options_.accelerator_name = accelerator_.c_str(); - } - if (nnapi_settings->cache_directory() && - nnapi_settings->cache_directory()->Length() != 0) { - cache_dir_ = nnapi_settings->cache_directory()->str(); - options_.cache_dir = cache_dir_.c_str(); - } - if (nnapi_settings->model_token() && - nnapi_settings->model_token()->Length() != 0) { - model_token_ = nnapi_settings->model_token()->str(); - options_.model_token = model_token_.c_str(); - } - options_.execution_preference = - ConvertExecutionPrefence(nnapi_settings->execution_preference()); - options_.disallow_nnapi_cpu = - !nnapi_settings->allow_nnapi_cpu_on_android_10_plus(); - } - - private: - std::string 
accelerator_, cache_dir_, model_token_; - tflite::StatefulNnApiDelegate::Options options_; -}; - -TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(NnapiPlugin, NnapiPlugin::New); - -} // namespace delegates -} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc deleted file mode 100644 index 4f9f5dd08c1..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include - -#include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" -#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" -#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/test_util.h" - -// Tests for checking that the NNAPI Delegate plugin correctly handles all the -// options from the flatbuffer. -// -// Checking done at NNAPI call level, as that is where we have a mockable -// layer. -namespace tflite { -namespace { - -using delegate::nnapi::NnApiMock; - -class SingleAddOpModel : tflite::SingleOpModel { - public: - void Build() { - int input = AddInput({tflite::TensorType_FLOAT32, {1, 2, 2}}); - int constant = AddConstInput({tflite::TensorType_FLOAT32, {1, 2, 2}}, - {1.0f, 1.0f, 1.0f, 1.0f}); - AddOutput({tflite::TensorType_FLOAT32, {}}); - - SetBuiltinOp(tflite::BuiltinOperator_ADD, tflite::BuiltinOptions_AddOptions, - tflite::CreateAddOptions(builder_).Union()); - BuildInterpreter({GetShape(input), GetShape(constant)}); - } - - tflite::Interpreter* Interpreter() const { return interpreter_.get(); } -}; - -class NNAPIPluginTest : public ::testing::Test { - protected: - NNAPIPluginTest() : delegate_(nullptr, [](TfLiteDelegate*) {}) {} - void SetUp() override { - nnapi_ = const_cast(NnApiImplementation()); - nnapi_mock_ = absl::make_unique(nnapi_); - nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = - [](const ANeuralNetworksModel* model, - const ANeuralNetworksDevice* const* devices, uint32_t numDevices, - bool* supportedOps) -> int { - supportedOps[0] = true; - return 0; - }; - model_.Build(); - } - template - void CheckExecutionPreference() { - // Note - this uses a template since the NNAPI functions are C function - // pointers rather than lambdas so can't capture variables. 
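      // A hypothetical instantiation (the two template parameters sketched
      // here are the flatbuffer NNAPIExecutionPreference value placed into the
      // settings and the value expected to reach
      // ANeuralNetworksCompilation_setPreference after conversion):
      //   CheckExecutionPreference<NNAPIExecutionPreference_NNAPI_LOW_POWER,
      //                            StatefulNnApiDelegate::Options::kLowPower>();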
- nnapi_->ANeuralNetworksCompilation_setPreference = - [](ANeuralNetworksCompilation* compilation, int32_t preference) { - return preference - output; - }; - CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, input)); - // Since delegation succeeds, the model becomes immutable and hence can't - // reuse it. - SingleAddOpModel model; - model.Build(); - EXPECT_EQ(model.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteOk) - << " given input: " << input << " expected output: " << output; - } - - void CreateDelegate(flatbuffers::Offset settings) { - settings_ = flatbuffers::GetTemporaryPointer( - fbb_, CreateTFLiteSettings(fbb_, tflite::Delegate_NNAPI, settings)); - - plugin_ = delegates::DelegatePluginRegistry::CreateByName("NnapiPlugin", - *settings_); - delegate_ = plugin_->Create(); - } - - NnApi* nnapi_; - std::unique_ptr nnapi_mock_; - SingleAddOpModel model_; - flatbuffers::FlatBufferBuilder fbb_; - const TFLiteSettings* settings_ = nullptr; - delegates::TfLiteDelegatePtr delegate_; - std::unique_ptr plugin_; -}; - -TEST_F(NNAPIPluginTest, PassesAcceleratorName) { - // Fails with non-existent "foo". - CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("foo"))); - EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteDelegateError); - - // Succeeds with "test-device" supported by the mock. - CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("test-device"))); - EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteOk); -} - -TEST_F(NNAPIPluginTest, PassesExecutionPreference) { - CheckExecutionPreference(); - CheckExecutionPreference(); - CheckExecutionPreference(); - CheckExecutionPreference(); -} - -TEST_F(NNAPIPluginTest, PassesCachingParameters) { - nnapi_->ANeuralNetworksCompilation_setCaching = - [](ANeuralNetworksCompilation* compilation, const char* cacheDir, - const uint8_t* token) -> int { - if (std::string(cacheDir) != "d") return 1; - // Token is hashed with other bits, just check that it's not empty. - if (std::string(reinterpret_cast(token)).empty()) return 2; - return 0; - }; - CreateDelegate(CreateNNAPISettings(fbb_, 0, fbb_.CreateString("d"), - fbb_.CreateString("t"))); - EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteOk); -} - -TEST_F(NNAPIPluginTest, PassesFalseNNAPICpuFlag) { - CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, - NNAPIExecutionPreference_UNDEFINED, 0, 0, - /* allow CPU */ false)); - nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = - [](const ANeuralNetworksModel* model, - const ANeuralNetworksDevice* const* devices, uint32_t numDevices, - bool* supportedOps) -> int { - supportedOps[0] = true; - // Since no CPU, should only pass one device. - return numDevices - 1; - }; - EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteOk); -} - -TEST_F(NNAPIPluginTest, PassesTrueNNAPICpuFlag) { - CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, - NNAPIExecutionPreference_UNDEFINED, 0, 0, - /* allow CPU */ true)); - nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = - [](const ANeuralNetworksModel* model, - const ANeuralNetworksDevice* const* devices, uint32_t numDevices, - bool* supportedOps) -> int { - supportedOps[0] = true; - // With CPU allowed, should pass two devices. 
- return numDevices - 2; - }; - EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), - kTfLiteOk); -} - -} // namespace -} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc deleted file mode 100644 index 709bb70ca70..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h" - -#include - -#include "flatbuffers/idl.h" // from @flatbuffers -#include "flatbuffers/util.h" // from @flatbuffers -#include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/lite/minimal_logging.h" - -namespace tflite { - -namespace { -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_fbs_contents-inl.h" -} - -const ComputeSettings* ConvertFromProto( - flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings) { - std::string json; - tensorflow::protobuf::util::JsonPrintOptions options; - options.preserve_proto_field_names = true; - options.always_print_primitive_fields = true; // For catching problems. - auto status = tensorflow::protobuf::util::MessageToJsonString(proto_settings, - &json, options); - if (!status.ok()) { - TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to convert to Json: %s", - status.ToString().c_str()); - return nullptr; - } - if (!parser->Parse(configuration_fbs_contents)) { - TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse schema: %s", - parser->error_.c_str()); - return nullptr; - } - parser->SetRootType("tflite.ComputeSettings"); - if (!parser->Parse(json.c_str())) { - TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse json: %s", - parser->error_.c_str()); - return nullptr; - } - return flatbuffers::GetRoot( - parser->builder_.GetBufferPointer()); -} - -} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h deleted file mode 100644 index 3b69e8465a5..00000000000 --- a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ - -#include "flatbuffers/idl.h" // from @flatbuffers -#include "tensorflow/lite/experimental/acceleration/configuration/configuration.pb.h" -#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" - -namespace tflite { - -// Converts the protobuf version ComputeSettings to the flatbuffer version, via -// json. The parser is used for state - the returned pointer is valid only as -// long as the parser is kept alive and unmutated. -const ComputeSettings* ConvertFromProto( - flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings); - -} // namespace tflite - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ From f8410051b094aa686640f4325ebb134ceb13ca2d Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 15 Jun 2020 16:14:45 -0700 Subject: [PATCH 0226/1390] Remove a test of Keras implementation details that are subject to change. Specifically, the test checks that nested layers in a functional model have their _keras_history updated when that model is used to construct a bigger functional model. In the future this incidental internal-implementation-specific-behavior will break. PiperOrigin-RevId: 316566350 Change-Id: Ifb1e5ab04aafbbaa45e420ed34238059b16bcbdc --- tensorflow/python/keras/engine/functional_test.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tensorflow/python/keras/engine/functional_test.py b/tensorflow/python/keras/engine/functional_test.py index 25b433ce582..a7e314d4a49 100644 --- a/tensorflow/python/keras/engine/functional_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -2073,18 +2073,6 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): # `None` value passed during construction is overridden. self.assertAllEqual(network(x, training=False), x * 0.0) - def test_keras_history_propagation_(self): - for input_shape in [(1,), (1, 1)]: - sub_in = input_layer_lib.Input((1,)) - relu_layer = layers.ReLU() - sub_out = relu_layer(sub_in) - submodel = functional.Functional(sub_in, sub_out) - self.assertLen(relu_layer._inbound_nodes, 1) - - inp = input_layer_lib.Input(input_shape) - submodel(inp) - self.assertLen(relu_layer._inbound_nodes, 2) - if __name__ == '__main__': test.main() From 2b7baa3ba36f0e346e60086232631a41db83a224 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 15 Jun 2020 16:21:27 -0700 Subject: [PATCH 0227/1390] Change HLO importer to set visibility when importing. 
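With this change the entry computation ("main") is imported as a public
function and every other computation in the module is imported as private.
As a rough illustration (the function name and tensor type below are
placeholders, not taken from the change itself), a non-entry computation now
imports as something like

  func @helper(%arg0: tensor<f32>) -> tensor<f32> attributes {sym_visibility = "private"} { ... }

which is the form the updated [[PRIVATE]] FileCheck pattern in import.hlotxt
matches.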
PiperOrigin-RevId: 316567573 Change-Id: I3e04c34ce022563f03f52a3bc5d24d20c4c90b0d --- .../mlir/xla/hlo_function_importer.cc | 3 + .../mlir/xla/tests/translate/import.hlotxt | 87 ++++++++++--------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index a3b6222a8af..0627f2587de 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -115,6 +115,9 @@ StatusOr HloFunctionImporter::ImportAsFunc( llvm::ArrayRef attrs; auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), computation_name, func_type, attrs); + auto visibility = computation_name == "main" ? FuncOp::Visibility::Public + : FuncOp::Visibility::Private; + function.setVisibility(visibility); module_.push_back(function); // Add to the map right away for function calls. diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 4565e1e4938..6336d6ed688 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1,4 +1,4 @@ -// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s -DPRIVATE="attributes {sym_visibility = \"private\"}" HloModule main @@ -8,6 +8,7 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { } // CHECK-LABEL: func @test_simple +// CHECK-SAME: [[PRIVATE]] %test_simple (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[] { %Arg_0.1 = f32[4]{0} parameter(0) %Arg_1.2 = f32[4]{0} parameter(1) @@ -21,7 +22,7 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { } // CHECK-LABEL: func @test_after_all -// CHECK-SAME: ([[VAL_0:%.*]]: !xla_hlo.token, [[VAL_1:%.*]]: !xla_hlo.token) -> !xla_hlo.token +// CHECK-SAME: ([[VAL_0:%.*]]: !xla_hlo.token, [[VAL_1:%.*]]: !xla_hlo.token) -> !xla_hlo.token [[PRIVATE]] %test_after_all (token0: token[], token1: token[] ) -> token[] { token0 = token[] parameter(0) token1 = token[] parameter(1) @@ -95,7 +96,7 @@ add { ROOT %batch-norm-grad = (f32[2,2,2,2], f32[2], f32[2]) batch-norm-grad(f32[2,2,2,2] %input, f32[2] %scale, f32[2] %mean, f32[2] %variance, f32[2,2,2,2] %grad_output), epsilon=0.001, feature_index=1 } -// CHECK-LABEL: func @call(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @call(%arg0: tensor) -> tensor %call (arg_1: s64[]) -> s64[] { %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"} ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"} @@ -136,7 +137,7 @@ add { } -// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<3xf32>) -> tensor<3xi1> { +// CHECK-LABEL: func @test_compare(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>, %arg2: tensor<3xf32>) -> tensor<3xi1> %test_compare (Arg_0.1: f32[3], Arg_1.2: f32[3], Arg_2.3: f32[3]) -> pred[3] { %Arg_0.1 = f32[3] parameter(0) %Arg_1.2 = f32[3] parameter(1) @@ -162,7 +163,7 @@ add { ROOT %complex.3 = c64[4] complex(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_concat(%arg0: tensor<4x1xf32>, %arg1: tensor<4x2xf32>) -> tensor<4x3xf32> { +// CHECK-LABEL: func @test_concat(%arg0: tensor<4x1xf32>, %arg1: tensor<4x2xf32>) -> tensor<4x3xf32> %test_concat (Arg_0.1: f32[4, 1], Arg_1.2: f32[4, 2]) -> f32[4, 3] { %Arg_0.1 = f32[4, 1] parameter(0) %Arg_1.2 = f32[4, 2] parameter(1) @@ -201,7 +202,7 @@ add { // 
TODO(b/129422361) Potentially update when copy, reshape, and conv have actual // implementations with attributes, etc. -// CHECK-LABEL: func @test_conv(%arg0: tensor<256x32x32x6xf32>) -> tuple> { +// CHECK-LABEL: func @test_conv(%arg0: tensor<256x32x32x6xf32>) -> tuple> %test_conv { %arg0.1 = f32[256,32,32,6]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} @@ -257,7 +258,7 @@ add { ROOT %convolution = f32[1,5,1] convolution(f32[1,2,1] %input, f32[1,1,1] %filter), feature_group_count=1, dim_labels=b0f_0io->b0f, window={pad=1_2 size=1} } -// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf64> { +// CHECK-LABEL: func @test_convert(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf64> %test_convert (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f64[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -272,7 +273,7 @@ add { ROOT %add.5 = f64[4] add(f64[4] %convert.3, f64[4] %convert.4) } -// CHECK-LABEL: func @test_cosine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { +// CHECK-LABEL: func @test_cosine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> %test_cosine (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} @@ -289,7 +290,7 @@ add { ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(f32[2,3] %arg1, f32[5,5] %arg2), custom_call_target="foo", backend_config="bar", custom_call_has_side_effect=true } -// CHECK-LABEL: func @test_div(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_div(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %test_div (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -298,7 +299,7 @@ add { ROOT %divide.3 = f32[4] divide(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_dot(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor { +// CHECK-LABEL: func @test_dot(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor %test_dot (Arg_0.1: f32[1, 4], Arg_1.2: f32[4, 1]) -> f32[] { %Arg_0.1 = f32[1, 4] parameter(0) %Arg_1.2 = f32[4, 1] parameter(1) @@ -350,7 +351,7 @@ add { ROOT %dynamic-slice = s32[1,1,32] dynamic-slice(s32[2,2,258] %operand, s32[] %start_idx_1, s32[] %start_idx_2, s32[] %start_idx_3), dynamic_slice_sizes={1,1,32} } -// CHECK-LABEL: func @test_dynamic_update_slice_1(%arg0: tensor<4x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor, %arg3: tensor) -> tensor<4x4xf32> { +// CHECK-LABEL: func @test_dynamic_update_slice_1(%arg0: tensor<4x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor, %arg3: tensor) -> tensor<4x4xf32> %test_dynamic_update_slice_1 (Arg_0.1: f32[4, 4], Arg_1.2: f32[1, 4], Arg_2.3: f32[], Arg_3.4: f32[]) -> f32[4, 4] { %Arg_0.1 = f32[4, 4] parameter(0) %Arg_1.2 = f32[1, 4] parameter(1) @@ -371,7 +372,7 @@ add { ROOT %dynamic-update-slice.5 = f32[4] dynamic-update-slice(%Arg_0.1, %Arg_1.2, %Arg_2.3) } -// CHECK-LABEL: func @test_exponential(%arg0: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-LABEL: func @test_exponential(%arg0: tensor<16xf32>) -> tensor<16xf32> %test_exponential (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -379,7 +380,7 @@ add { ROOT %exp.2 = f32[16] exponential(f32[16] %arg0.1) } -// CHECK-LABEL: func @test_expm1(%arg0: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-LABEL: func @test_expm1(%arg0: tensor<16xf32>) -> tensor<16xf32> %test_expm1 (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -387,7 +388,7 @@ add { ROOT 
%expm1.2 = f32[16] exponential-minus-one(f32[16] %arg0.1) } -// CHECK-LABEL: func @test_fft(%arg0: tensor<3x9xf32>) -> tensor<3x5xcomplex> { +// CHECK-LABEL: func @test_fft(%arg0: tensor<3x9xf32>) -> tensor<3x5xcomplex> %test_fft { %arg0.1 = f32[3,9]{1,0} parameter(0), parameter_replication={false}, metadata={op_name="XLA_Args"} // CHECK: "xla_hlo.fft"(%arg0) {fft_length = dense<9> : tensor<1xi64>, fft_type = "RFFT" @@ -395,7 +396,7 @@ add { } // CHECK-LABEL: func @test_floor( -// CHECK-SAME: [[A0:%.+]]: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-SAME: [[A0:%.+]]: tensor<16xf32>) -> tensor<16xf32> %test_floor (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -404,7 +405,7 @@ add { } // CHECK-LABEL: func @test_gather( -// CHECK-SAME: [[ARG0:%.+]]: tensor<200x100x300xf32>, [[ARG1:%.+]]: tensor<10x2xi32>) -> tensor<10x300xf32> { +// CHECK-SAME: [[ARG0:%.+]]: tensor<200x100x300xf32>, [[ARG1:%.+]]: tensor<10x2xi32>) -> tensor<10x300xf32> %test_gather (arg.0: f32[200,100,300], arg.1: s32[10,2]) -> f32[10,300] { %arg.0 = f32[200,100,300] parameter(0) %arg.1 = s32[10,2] parameter(1) @@ -442,7 +443,7 @@ add { } // CHECK-LABEL: func @test_infeed -// CHECK-SAME: ([[TOKEN:%.*]]: !xla_hlo.token) -> tuple, !xla_hlo.token> { +// CHECK-SAME: ([[TOKEN:%.*]]: !xla_hlo.token) -> tuple, !xla_hlo.token> %test_infeed (token0: token[]) -> (s32[3], token[]) { %token0 = token[] parameter(0) // CHECK-NEXT: "xla_hlo.infeed"([[TOKEN]]) @@ -451,19 +452,19 @@ add { } -// CHECK-LABEL: func @test_iota_1() -> tensor<4xf32> { +// CHECK-LABEL: func @test_iota_1() -> tensor<4xf32> %test_iota_1 () -> f32[4] { // CHECK-NEXT: "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xf32> ROOT %iota.0 = f32[4] iota(), iota_dimension=0 } -// CHECK-LABEL: func @test_iota_2() -> tensor<4x5xf32> { +// CHECK-LABEL: func @test_iota_2() -> tensor<4x5xf32> %test_iota_2 () -> f32[4, 5] { // CHECK-NEXT: "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<4x5xf32> ROOT %iota.0 = f32[4, 5] iota(), iota_dimension=1 } -// CHECK-LABEL: func @test_log(%arg0: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-LABEL: func @test_log(%arg0: tensor<16xf32>) -> tensor<16xf32> %test_log (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -471,7 +472,7 @@ add { ROOT %log.2 = f32[16] log(f32[16] %arg0.1) } -// CHECK-LABEL: func @test_log1p(%arg0: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-LABEL: func @test_log1p(%arg0: tensor<16xf32>) -> tensor<16xf32> %test_log1p (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -501,7 +502,7 @@ add { -// CHECK-LABEL: func @test_maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %test_maximum (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -510,7 +511,7 @@ add { ROOT %maximum.3 = f32[4] maximum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %test_minimum (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -519,7 +520,7 @@ add { ROOT %minimum.3 = f32[4] minimum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_multiply(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_multiply(%arg0: 
tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %test_multiply (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -528,7 +529,7 @@ add { ROOT %multiply.3 = f32[4] multiply(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_negate(%arg0: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-LABEL: func @test_negate(%arg0: tensor<16xf32>) -> tensor<16xf32> %test_negate (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -536,7 +537,7 @@ add { ROOT %negate.2 = f32[16] negate(f32[16] %arg0.1) } -// CHECK-LABEL: func @test_not(%arg0: tensor<16xi1>) -> tensor<16xi1> { +// CHECK-LABEL: func @test_not(%arg0: tensor<16xi1>) -> tensor<16xi1> %test_not (arg0.1: pred[16]) -> pred[16] { %arg0.1 = pred[16] parameter(0) @@ -554,7 +555,7 @@ add { } // CHECK-LABEL: func @test_outfeed -// CHECK-SAME: ([[DATA:%.*]]: tensor<3xi32>, [[TOKEN:%.*]]: !xla_hlo.token) -> !xla_hlo.token { +// CHECK-SAME: ([[DATA:%.*]]: tensor<3xi32>, [[TOKEN:%.*]]: !xla_hlo.token) -> !xla_hlo.token %test_outfeed (Arg_0.1: s32[3], Arg_1.2: token[]) -> token[] { %Arg_0.1 = s32[3] parameter(0) %Arg_1.2 = token[] parameter(1) @@ -563,7 +564,7 @@ add { ROOT %outfeed.3 = token[] outfeed(s32[3] %Arg_0.1, token[] %Arg_1.2), outfeed_config="foobar" } -// CHECK-LABEL: func @test_pad(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf32> { +// CHECK-LABEL: func @test_pad(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<4xf32> %test_pad (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[] parameter(1) @@ -572,7 +573,7 @@ add { ROOT %pad.3 = f32[4] pad(%Arg_0.1, %Arg_1.2), padding=0_0_0 } -// CHECK-LABEL: func @test_pad_edge(%arg0: tensor<4x4x4xf32>, %arg1: tensor) -> tensor<7x11x15xf32> { +// CHECK-LABEL: func @test_pad_edge(%arg0: tensor<4x4x4xf32>, %arg1: tensor) -> tensor<7x11x15xf32> %test_pad_edge (Arg_0.1: f32[4, 4, 4], Arg_1.2: f32[]) -> f32[7, 11, 15] { %Arg_0.1 = f32[4, 4, 4] parameter(0) %Arg_1.2 = f32[] parameter(1) @@ -581,7 +582,7 @@ add { ROOT %pad.3 = f32[7, 11, 15] pad(%Arg_0.1, %Arg_1.2), padding=1_2x3_4x5_6 } -// CHECK-LABEL: func @test_pad_interior(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<10xf32> { +// CHECK-LABEL: func @test_pad_interior(%arg0: tensor<4xf32>, %arg1: tensor) -> tensor<10xf32> %test_pad_interior (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f32[10] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[] parameter(1) @@ -590,7 +591,7 @@ add { ROOT %pad.3 = f32[10] pad(%Arg_0.1, %Arg_1.2), padding=0_0_2 } -// CHECK-LABEL: func @test_popcnt(%arg0: tensor<16xi32>) -> tensor<16xi32> { +// CHECK-LABEL: func @test_popcnt(%arg0: tensor<16xi32>) -> tensor<16xi32> %test_popcnt (arg0.1: s32[16]) -> s32[16] { %arg0.1 = s32[16] parameter(0) @@ -598,7 +599,7 @@ add { ROOT %popcnt.2 = s32[16] popcnt(s32[16] %arg0.1) } -// CHECK-LABEL: func @test_pow(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_pow(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %test_pow (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -659,7 +660,7 @@ add { } // CHECK-LABEL: func @test_reduce -// CHECK-SAME: ([[ARG0:%.*]]: tensor<4x4xf32>, [[ARG1:%.*]]: tensor<4xf32>, [[ARG2:%.*]]: tensor) -> tuple, tensor>, tensor> { +// CHECK-SAME: ([[ARG0:%.*]]: tensor<4x4xf32>, [[ARG1:%.*]]: tensor<4xf32>, [[ARG2:%.*]]: tensor) -> tuple, tensor>, tensor> %test_reduce (Arg_0.1: f32[4, 4], Arg_1.2: f32[4], Arg_2.3: f32[]) -> ((f32[], 
f32[]), f32[]) { %Arg_0.1 = f32[4, 4] parameter(0) %Arg_1.2 = f32[4] parameter(1) @@ -719,7 +720,7 @@ add { ROOT %remainder.3 = f32[4] remainder(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_reverse_1d(%arg0: tensor<4xf32>) -> tensor<4xf32> { +// CHECK-LABEL: func @test_reverse_1d(%arg0: tensor<4xf32>) -> tensor<4xf32> %test_reverse_1d (Arg_0.1: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) @@ -727,7 +728,7 @@ add { ROOT reverse.2 = f32[4] reverse(%Arg_0.1), dimensions={0} } -// CHECK-LABEL: func @test_reverse_2d(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { +// CHECK-LABEL: func @test_reverse_2d(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32 %test_reverse_2d (Arg_0.1: f32[4, 4]) -> f32[4, 4] { %Arg_0.1 = f32[4, 4] parameter(0) @@ -736,7 +737,7 @@ add { } // CHECK-LABEL: func @test_rsqrt( -// CHECK-SAME: [[ARG0:%.+]]: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-SAME: [[ARG0:%.+]]: tensor<16xf32>) -> tensor<16xf32> %test_rsqrt (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) @@ -744,7 +745,7 @@ add { ROOT %rsqrt.2 = f32[16] rsqrt(f32[16] %arg0.1) } -// CHECK-LABEL: func @test_scalar(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @test_scalar(%arg0: tensor) -> tensor %test_scalar (Arg_0.1: f32[]) -> f32[] { // CHECK-NEXT: return %arg0 : tensor ROOT %Arg_0.1 = f32[] parameter(0) @@ -781,7 +782,7 @@ add { // CHECK-SAME: unique_indices = false -// CHECK-LABEL: func @test_select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { +// CHECK-LABEL: func @test_select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> %test_select { %Arg_0.1 = pred[2,3] parameter(0) %Arg_1.2 = s32[2,3] parameter(1) @@ -838,7 +839,7 @@ add { ROOT %set-dimension-size.2 = f32[4,<=4] set-dimension-size(f32[4,4] %Arg_0.1, s32[] %Arg_1.2), dimensions={1} } -// CHECK-LABEL: func @test_sine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { +// CHECK-LABEL: func @test_sine(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> %test_sine (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} @@ -874,7 +875,7 @@ add { ROOT %subtract.3 = f32[4] subtract(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } -// CHECK-LABEL: func @test_tanh(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> { +// CHECK-LABEL: func @test_tanh(%arg0: tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> %test_tanh (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} @@ -882,7 +883,7 @@ add { ROOT %tanh.3 = f32[1,16,16,3]{3,2,1,0} tanh(f32[1,16,16,3]{3,2,1,0} %arg0.1), metadata={op_type="Tanh" op_name="embedded_inference/tanh_model/Tanh"} } -// CHECK-LABEL: func @test_transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { +// CHECK-LABEL: func @test_transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> %test_transpose { %Arg_0.1 = s32[1,2,3,4] parameter(0) @@ -903,7 +904,7 @@ add { ROOT %triangular-solve.3 = f32[4,3] triangular-solve(f32[4,4] %Arg_0.1, f32[4,3] %Arg_1.2), left_side=true, lower=true, transpose_a=NO_TRANSPOSE, unit_diagonal=true } -// CHECK-LABEL: func @test_tuple(%arg0: tensor<1xi32>, %arg1: tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> { +// CHECK-LABEL: func @test_tuple(%arg0: tensor<1xi32>, %arg1: tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> %test_tuple(Arg_0.1: s32[1], Arg_1.2: f32[1, 2]) -> (s32[1], f32[1,2]) { %Arg_0.1 = s32[1] parameter(0) %Arg_1.2 = f32[1, 2] parameter(1) @@ 
-928,7 +929,7 @@ add { ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"} } -// CHECK-LABEL: func @test_while(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @test_while(%arg0: tensor) -> tensor %test_while (arg0.1: s64[]) -> s64[] { %arg0.1 = s64[] parameter(0), metadata={op_name="HLO_Args"} // CHECK-NEXT: "xla_hlo.while"(%arg0) ( { From caf465347e8e9520f1b810809f88f0ea229ba835 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Mon, 15 Jun 2020 23:39:29 +0000 Subject: [PATCH 0228/1390] segregation attempt 5 --- tensorflow/core/framework/tensor_shape.cc | 2 +- tensorflow/core/lib/io/inputbuffer.cc | 6 +++--- tensorflow/core/lib/io/random_inputstream.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc | 2 +- tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc | 6 +++--- tensorflow/core/lib/io/zlib_outputbuffer.cc | 6 +++--- tensorflow/core/platform/env.cc | 2 +- tensorflow/core/platform/file_system.cc | 2 +- tensorflow/core/platform/file_system_helper.cc | 2 +- tensorflow/core/platform/status.cc | 4 +++- tensorflow/core/profiler/internal/parse_annotation.cc | 2 +- 11 files changed, 19 insertions(+), 17 deletions(-) diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc index f4b440e9cd1..79d0cc0822d 100644 --- a/tensorflow/core/framework/tensor_shape.cc +++ b/tensorflow/core/framework/tensor_shape.cc @@ -187,7 +187,7 @@ void TensorShapeBase::InitDims(gtl::ArraySlice dim_sizes) { "bad overflow check"); bool large_size = false; for (auto s : dim_sizes) { - if (s > kMaxSmall) { + if (static_cast(s) > static_cast(kMaxSmall)) { large_size = true; break; } diff --git a/tensorflow/core/lib/io/inputbuffer.cc b/tensorflow/core/lib/io/inputbuffer.cc index 2b138b825e4..d005ee11d78 100644 --- a/tensorflow/core/lib/io/inputbuffer.cc +++ b/tensorflow/core/lib/io/inputbuffer.cc @@ -85,7 +85,7 @@ Status InputBuffer::ReadNBytes(int64 bytes_to_read, string* result) { result->resize(bytes_to_read); size_t bytes_read = 0; Status status = ReadNBytes(bytes_to_read, &(*result)[0], &bytes_read); - if (bytes_read < bytes_to_read) result->resize(bytes_read); + if (static_cast(bytes_read) < bytes_to_read) result->resize(bytes_read); return status; } @@ -204,7 +204,7 @@ Status InputBuffer::Hint(int64 bytes_to_read) { } // The internal buffer is too small. Do nothing. 
- if (bytes_to_read > size_) { + if (bytes_to_read > static_cast(size_)) { return Status::OK(); } @@ -230,7 +230,7 @@ Status InputBuffer::Hint(int64 bytes_to_read) { limit_ += data.size(); file_pos_ += data.size(); - if (errors::IsOutOfRange(s) && data.size() == bytes_to_read) { + if (errors::IsOutOfRange(s) && data.size() == static_cast(bytes_to_read)) { return Status::OK(); } else { return s; diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc index 10f734a5bae..bd0054ce753 100644 --- a/tensorflow/core/lib/io/random_inputstream.cc +++ b/tensorflow/core/lib/io/random_inputstream.cc @@ -92,7 +92,7 @@ Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) { } else { return s; } - if (data.size() < bytes_to_read) { + if (data.size() < static_cast(bytes_to_read)) { return errors::OutOfRange("reached end of file"); } bytes_to_skip -= bytes_to_read; diff --git a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc index a331d4173cf..53939f2d8a3 100644 --- a/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc @@ -134,7 +134,7 @@ Status SnappyInputBuffer::ReadCompressedBlockLength(uint32* length) { } size_t readable = std::min(bytes_to_read, avail_in_); - for (int i = 0; i < readable; i++) { + for (size_t i = 0; i < readable; i++) { // The "unsigned char" type cast is intentional to avoid implicit type // casting of the signed char to unsigned int during bitwise OR which // causes weird overflow errors. diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc index 563503a1319..fe3a53c6c25 100644 --- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc +++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc @@ -76,7 +76,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { // If there is sufficient free space in input_buffer_ to fit data we // add it there and return. - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -87,7 +87,7 @@ Status SnappyOutputBuffer::Write(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered()); // input_buffer_ should be empty at this point. 
- if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -144,7 +144,7 @@ void SnappyOutputBuffer::AddToInputBuffer(StringPiece data) { const int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (bytes_to_write > free_tail_bytes) { + if (static_cast(bytes_to_write) > free_tail_bytes) { memmove(input_buffer_.get(), next_in_, avail_in_); next_in_ = input_buffer_.get(); } diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc index 5840ca60242..d475d0eaa5c 100644 --- a/tensorflow/core/lib/io/zlib_outputbuffer.cc +++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc @@ -98,7 +98,7 @@ void ZlibOutputBuffer::AddToInputBuffer(StringPiece data) { int32 unread_bytes = z_stream_->avail_in; int32 free_tail_bytes = input_buffer_capacity_ - (read_bytes + unread_bytes); - if (bytes_to_write > free_tail_bytes) { + if (static_cast(bytes_to_write) > free_tail_bytes) { memmove(z_stream_input_.get(), z_stream_->next_in, z_stream_->avail_in); z_stream_->next_in = z_stream_input_.get(); } @@ -154,7 +154,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { size_t bytes_to_write = data.size(); - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } @@ -162,7 +162,7 @@ Status ZlibOutputBuffer::Append(StringPiece data) { TF_RETURN_IF_ERROR(DeflateBuffered(zlib_options_.flush_mode)); // At this point input stream should be empty. - if (bytes_to_write <= AvailableInputSpace()) { + if (static_cast(bytes_to_write) <= AvailableInputSpace()) { AddToInputBuffer(data); return Status::OK(); } diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index b29cad05459..05d95ba0425 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -214,7 +214,7 @@ bool Env::FilesExist(const std::vector& files, } if (fs_status) { result &= fs_result; - for (int i = 0; i < itr.second.size(); ++i) { + for (size_t i = 0; i < itr.second.size(); ++i) { per_file_status[itr.second[i]] = fs_status->at(i); } } else if (!fs_result) { diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 9e96ceedbdc..c9657e2339f 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -308,7 +308,7 @@ StringPiece FileSystem::Basename(StringPiece path) const { StringPiece FileSystem::Extension(StringPiece path) const { StringPiece basename = this->Basename(path); - int pos = basename.rfind('.'); + size_t pos = basename.rfind('.'); if (pos == StringPiece::npos) { return StringPiece(path.data() + path.size(), 0); } else { diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc index 64b175c4d17..909752389e1 100644 --- a/tensorflow/core/platform/file_system_helper.cc +++ b/tensorflow/core/platform/file_system_helper.cc @@ -103,7 +103,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern, children_dir_status[i] = fs->IsDirectory(child_path); } }); - for (int i = 0; i < children.size(); ++i) { + for (size_t i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. 
if (children_dir_status[i].code() == tensorflow::error::CANCELLED) { diff --git a/tensorflow/core/platform/status.cc b/tensorflow/core/platform/status.cc index 756b8314148..e303c18091c 100644 --- a/tensorflow/core/platform/status.cc +++ b/tensorflow/core/platform/status.cc @@ -74,7 +74,9 @@ class StatusLogSink : public TFLogSink { mutex_lock lock(mu_); messages_.emplace_back(entry.ToString()); - if (messages_.size() > num_messages_) messages_.pop_front(); + if (messages_.size() > static_cast(num_messages_)){ + messages_.pop_front(); + } } private: diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index 32c26befa3d..a4cdc09739d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -50,7 +50,7 @@ std::vector SplitNameAndMetadata( std::vector SplitPairs(absl::string_view metadata) { std::vector key_value_pairs; std::stack quotes; - int start = 0, end = 0; + size_t start = 0, end = 0; for (; end < metadata.size(); ++end) { char ch = metadata[end]; switch (ch) { From 67c32abcff8b5bb873a7ce02e5fe5a6653e55d6c Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 15 Jun 2020 16:22:20 -0700 Subject: [PATCH 0229/1390] More compatibility fixes for typing.Generic: * types.new_class is required in some distributions * avoid calling isinstance on some function objects in python 3.6 Required for #40132. PiperOrigin-RevId: 316567732 Change-Id: I6ef39c8de6dbdf2878d30a3a5860a1047a54a55d --- tensorflow/python/framework/test_util.py | 7 +++++++ tensorflow/python/util/tf_should_use.py | 18 +++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 2967bb3de84..950e17d0d8c 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -33,6 +33,7 @@ import tempfile import threading import time import unittest +import weakref from absl.testing import parameterized import numpy as np @@ -732,6 +733,12 @@ def assert_no_new_tensors(f): """Finds existing Tensors, runs the test, checks for new Tensors.""" def _is_tensorflow_object(obj): + if isinstance(obj, weakref.ReferenceType): + obj = obj() + if obj is None: + return False + if not hasattr(obj, "__class__"): + return False try: return isinstance(obj, (ops.Tensor, variables.Variable, diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 1671b078fa3..41c3220f5ca 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -21,15 +21,12 @@ import copy import sys import textwrap import traceback - -import six # pylint: disable=unused-import - +import types from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator -# pylint: enable=g-bad-import-order,g-import-not-at-top class _TFShouldUseHelper(object): @@ -154,7 +151,18 @@ def _get_wrapper(x, tf_should_use_helper): tx = copy.deepcopy(type_x) # Prefer using __orig_bases__, which preserve generic type arguments. bases = getattr(tx, '__orig_bases__', tx.__bases__) - copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) + + # Use types.new_class when available, which is preferred over plain type in + # some distributions. 
+ if sys.version_info >= (3, 5): + def set_body(ns): + ns.update(tx.__dict__) + return ns + + copy_tx = types.new_class(tx.__name__, bases, exec_body=set_body) + else: + copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) + copy_tx.__init__ = _new__init__ copy_tx.__getattribute__ = _new__getattribute__ copy_tx.mark_used = _new_mark_used From 609a60b44bbf934b31a1dce4f0aa84e731b83c35 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 16 Jun 2020 00:41:09 +0100 Subject: [PATCH 0230/1390] Refactor AutoCastVariable tests to rely on strategy_combinations --- .../keras/mixed_precision/experimental/BUILD | 3 +- .../experimental/autocast_variable_test.py | 130 +++++++++--------- 2 files changed, 64 insertions(+), 69 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD index 024b093c469..4060e455f84 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/BUILD +++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD @@ -144,9 +144,10 @@ py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:mirrored_strategy", + "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/eager:context", - "//tensorflow/python/keras:combinations", "//tensorflow/python/keras/optimizer_v2", "@absl_py//absl/testing:parameterized", ], diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 78041973cc1..95957f5634e 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -17,20 +17,23 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import contextlib import os from absl.testing import parameterized import numpy as np from tensorflow.python import tf2 +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.distribute import strategy_combinations from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import indexed_slices from tensorflow.python.framework import ops -from tensorflow.python.keras import combinations from tensorflow.python.keras.mixed_precision.experimental import autocast_variable from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2 from tensorflow.python.ops import array_ops @@ -40,30 +43,17 @@ from tensorflow.python.platform import test from tensorflow.python.training import gradient_descent as gradient_descent_v1 from tensorflow.python.training.tracking import util as trackable_utils -TESTCASES = ({ - 'testcase_name': 'base', - 'distribute': False -}, { - 'testcase_name': 'distribute', - 'distribute': True -}) - - -def get_distribute_scope(distribute): - - class DummyContextManager(object): - - def __enter__(self): - pass - - def __exit__(self, *args): - pass - - if distribute: - return mirrored_strategy.MirroredStrategy(['cpu:0']).scope() - else: - return 
DummyContextManager() +class DummyStrategy(object): + @contextlib.contextmanager + def scope(self): + yield +maybe_distribute = combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Dummy", lambda: DummyStrategy(), required_gpus=None), + strategy_combinations.mirrored_strategy_with_cpu_1_and_2 + ]) def get_var(val, dtype, name=None): return variables.VariableV1(val, use_resource=True, dtype=dtype, name=name) @@ -71,10 +61,13 @@ def get_var(val, dtype, name=None): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) class AutoCastVariableTest(test.TestCase, parameterized.TestCase): + def setUp(self): + strategy_combinations.set_virtual_cpus_to_at_least(3) + super(AutoCastVariableTest, self).setUp() - @parameterized.named_parameters(*TESTCASES) - def test_read(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_read(self, distribution): + with distribution.scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) @@ -116,9 +109,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.sparse_read([0]).dtype, dtypes.float16) self.assertEqual(x.gather_nd([0]).dtype, dtypes.float16) - @parameterized.named_parameters(*TESTCASES) - def test_read_nested_scopes(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_read_nested_scopes(self, distribution): + with distribution.scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) @@ -136,9 +129,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.dtype, dtypes.float16) self.assertEqual(x.read_value().dtype, dtypes.float16) - @parameterized.named_parameters(*TESTCASES) - def test_dtype_is_not_string(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_dtype_is_not_string(self, distribution): + with distribution.scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.assertEqual(x.dtype, dtypes.float32) @@ -153,13 +146,13 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.true_dtype, dtypes.float32) self.assertIsInstance(x.true_dtype, dtypes.DType) - @parameterized.named_parameters(*TESTCASES) - def test_method_delegations(self, distribute): + @combinations.generate(maybe_distribute) + def test_method_delegations(self, distribution): # Test AutoCastVariable correctly delegates Variable methods to the # underlying variable. - with self.test_session(), get_distribute_scope(distribute): + with self.test_session(), distribution.scope(): for read_dtype in (dtypes.float32, dtypes.float16): - if distribute: + if ds_context.has_strategy(): # MirroredVariable.assign will (incorrectly) return a Mirrored value # instead of a MirroredVariable. So we cannot properly wrap it in an # AutoCastVariable. 
@@ -183,14 +176,14 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.aggregation, x._variable.aggregation) self.assertEqual(self.evaluate(x.initialized_value()), 7) if not context.executing_eagerly(): - if not distribute: + if not ds_context.has_strategy(): # These functions are not supported for DistributedVariables x.load(9) self.assertEqual(x.eval(), 9) self.assertEqual(self.evaluate(x.initial_value), 7) self.assertEqual(x.op, x._variable.op) self.assertEqual(x.graph, x._variable.graph) - if not distribute: + if not ds_context.has_strategy(): # These attributes are not supported for DistributedVariables self.assertIsNone(x.constraint) self.assertEqual(x.initializer, x._variable.initializer) @@ -202,7 +195,7 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.shape, ()) self.assertEqual(x.get_shape(), ()) - if not distribute: + if not ds_context.has_strategy(): # Test scatter_* methods. These are not supported for # DistributedVariables x = get_var([7, 8], dtypes.float32) @@ -233,9 +226,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllEqual( evaluate(x.scatter_nd_update([[0], [1]], [1., 2.])), [1, 2]) - @parameterized.named_parameters(*TESTCASES) - def test_operator_overloads(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_operator_overloads(self, distribution): + with distribution.scope(): for read_dtype in (dtypes.float32, dtypes.float16): x = get_var(7., dtypes.float32) x = autocast_variable.create_autocast_variable(x) @@ -280,9 +273,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(x == [7., 8., 10.], [True, True, False]) self.assertAllEqual(x != [7., 8., 10.], [False, False, True]) - @parameterized.named_parameters(*TESTCASES) - def test_assign(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_assign(self, distribution): + with distribution.scope(): x = get_var(0., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) @@ -318,18 +311,19 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllClose(3., self.evaluate(x.assign_sub(3.))) # Assign multiple times - assign = x.assign(1.) - self.assertAllClose(1., self.evaluate(assign)) - self.assertAllClose(0., self.evaluate(assign.assign(0.))) - assign_add = x.assign_add(3.) - self.assertAllClose(3., self.evaluate(assign_add)) - self.assertAllClose(3. * 3, - self.evaluate(x.assign_add(3.).assign_add(3.))) - self.assertAllClose(3. * 3, x) - assign_sub = x.assign_sub(3.) - self.assertAllClose(3. * 2, self.evaluate(assign_sub)) - self.assertAllClose(0., - self.evaluate(x.assign_sub(3.).assign_sub(3.))) + if not ds_context.has_strategy(): + assign = x.assign(1.) + self.assertAllClose(1., self.evaluate(assign)) + self.assertAllClose(0., self.evaluate(assign.assign(0.))) + assign_add = x.assign_add(3.) + self.assertAllClose(3., self.evaluate(assign_add)) + self.assertAllClose(3. * 3, + self.evaluate(x.assign_add(3.).assign_add(3.))) + self.assertAllClose(3. * 3, x) + assign_sub = x.assign_sub(3.) + self.assertAllClose(3. 
* 2, self.evaluate(assign_sub)) + self.assertAllClose(0., + self.evaluate(x.assign_sub(3.).assign_sub(3.))) # Assign with read_value=False self.assertIsNone(self.evaluate(x.assign(1., read_value=False))) @@ -355,9 +349,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): # assign still expect float32 value even if in float16 scope run_and_check() - @parameterized.named_parameters(*TESTCASES) - def test_assign_stays_in_true_dtype(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_assign_stays_in_true_dtype(self, distribution): + with distribution.scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) @@ -382,10 +376,10 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(1., self.evaluate(x.value())) self.assertEqual(1. + small_val, self.evaluate(x.value())) - @parameterized.named_parameters(*TESTCASES) - def test_checkpoint(self, distribute): + @combinations.generate(maybe_distribute) + def test_checkpoint(self, distribution): with self.test_session(): - with get_distribute_scope(distribute): + with distribution.scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) @@ -398,9 +392,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): checkpoint.restore(save_path).assert_consumed().run_restore_ops() self.assertEqual(self.evaluate(x), 123.) - @parameterized.named_parameters(*TESTCASES) - def test_invalid_wrapped_variable(self, distribute): - with get_distribute_scope(distribute): + @combinations.generate(maybe_distribute) + def test_invalid_wrapped_variable(self, distribution): + with distribution.scope(): # Wrap a non-variable with self.assertRaisesRegexp(ValueError, 'variable must be of type'): x = constant_op.constant([1.], dtype=dtypes.float32) @@ -443,7 +437,7 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): ) def test_repr_distributed(self): - with get_distribute_scope(distribute=True): + with mirrored_strategy.MirroredStrategy(["/cpu:1", "/cpu:2"]).scope(): x = get_var(1., dtypes.float32) x = autocast_variable.create_autocast_variable(x) self.assertRegexpMatches( From 0e3a53269fe3d02d114a54430471116a400f4207 Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 15 Jun 2020 16:25:46 -0700 Subject: [PATCH 0231/1390] Rename SVDF's kInputActivationStateTensor to kStateTensor, as there is no other state in SVDF, and making variable names consistent with the corresponding parameter names of reference_ops::Eval..Svdf functions. PiperOrigin-RevId: 316568280 Change-Id: I61dacf43e7a97437df531372b91ccba0876663ff --- tensorflow/lite/kernels/svdf.cc | 35 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tensorflow/lite/kernels/svdf.cc b/tensorflow/lite/kernels/svdf.cc index 82b7b7e4ee5..57eedb6b204 100644 --- a/tensorflow/lite/kernels/svdf.cc +++ b/tensorflow/lite/kernels/svdf.cc @@ -54,7 +54,7 @@ constexpr int kWeightsFeatureTensor = 1; constexpr int kWeightsTimeTensor = 2; constexpr int kBiasTensor = 3; // This is a variable tensor, and will be modified by this op. -constexpr int kInputActivationStateTensor = 4; +constexpr int kStateTensor = 4; // Output tensor. 
constexpr int kOutputTensor = 0; @@ -107,14 +107,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units); } - const TfLiteTensor* activation_state = - GetInput(context, node, kInputActivationStateTensor); + const TfLiteTensor* state = GetInput(context, node, kStateTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); // Check the shape of input state tensors. - TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2); - TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 0), batch_size); - TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 1), + TF_LITE_ENSURE_EQ(context, NumDimensions(state), 2); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(state, 0), batch_size); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(state, 1), memory_size * num_filters); // Resize output. @@ -184,7 +183,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } // Used to store dequantized weights_time matrix for hybrid computation of - // matmul(activation_state, weights_time), which occurs in floating point. + // matmul(state, weights_time), which occurs in floating point. node->temporaries->data[3] = scratch_tensor_index + 3; TfLiteTensor* float_weights_time = GetTemporary(context, node, /*index=*/3); float_weights_time->type = kTfLiteFloat32; @@ -239,8 +238,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { reinterpret_cast(input->quantization.params); auto* weights_feature_params = reinterpret_cast( weights_feature->quantization.params); - auto* state_params = reinterpret_cast( - activation_state->quantization.params); + auto* state_params = + reinterpret_cast(state->quantization.params); auto* weight_time_params = reinterpret_cast( weights_time->quantization.params); auto* output_params = reinterpret_cast( @@ -272,15 +271,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0); - TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); + TfLiteTensor* state = GetVariableInput(context, node, kStateTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); switch (weights_feature->type) { case kTfLiteFloat32: { reference_ops::EvalFloatSVDF(context, node, input, weights_feature, - weights_time, bias, params, scratch, - activation_state, output); + weights_time, bias, params, scratch, state, + output); return kTfLiteOk; break; } @@ -315,8 +313,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { reference_ops::EvalHybridSVDF( context, node, input, weights_feature, float_weights_time, bias, - params, scratch, scaling_factors, input_quantized, activation_state, - output, zero_points, row_sums, &op_data->compute_row_sums); + params, scratch, scaling_factors, input_quantized, state, output, + zero_points, row_sums, &op_data->compute_row_sums); return kTfLiteOk; } else { auto* input_params = reinterpret_cast( @@ -330,10 +328,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu); reference_ops::EvalIntegerSVDF( context, node, input, weights_feature, weights_time, bias, params, - activation_state, output, scratch, output_temp, - op_data->effective_scale_1_a, op_data->effective_scale_1_b, - op_data->effective_scale_2_a, op_data->effective_scale_2_b, - input_params->zero_point->data[0], + state, output, scratch, output_temp, op_data->effective_scale_1_a, + 
op_data->effective_scale_1_b, op_data->effective_scale_2_a, + op_data->effective_scale_2_b, input_params->zero_point->data[0], output_params->zero_point->data[0]); return kTfLiteOk; } From e2e974a383c06288664b8276eab1bd25659ab0c0 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 15 Jun 2020 16:35:58 -0700 Subject: [PATCH 0232/1390] Arguments declaration moved inside TransformToCLCode. PiperOrigin-RevId: 316570056 Change-Id: Ia8096e251ac78ad68d2439b8ea91c03bc1dc35bf --- tensorflow/lite/delegates/gpu/cl/arguments.cc | 2 ++ .../lite/delegates/gpu/cl/kernels/BUILD | 1 - .../lite/delegates/gpu/cl/kernels/softmax.cc | 24 +++++---------- .../delegates/gpu/cl/kernels/transpose.cc | 5 +--- .../lite/delegates/gpu/cl/kernels/winograd.cc | 29 +++++-------------- 5 files changed, 19 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index afb58ba46ad..53303eab079 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" #include "absl/strings/str_split.h" +#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -445,6 +446,7 @@ absl::Status Arguments::TransformToCLCode( RETURN_IF_ERROR(AddObjectArgs()); RETURN_IF_ERROR(ResolveSelectorsPass(linkables, code)); ResolveArgsPass(device_info, code); + *code = absl::Substitute(*code, GetListOfArgs()); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index d1f6d4014fe..24a9a962296 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -1130,7 +1130,6 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl:tensor", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:types", - "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc index e5f0933401a..fda7dbba6dd 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc @@ -17,7 +17,6 @@ limitations under the License. 
#include -#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -29,7 +28,6 @@ namespace { std::string GetSoftmaxKernelCode( const OperationDef& op_def, - const std::vector& linked_operations, Arguments* args) { auto src_desc = absl::make_unique(op_def.src_tensors[0]); if (op_def.IsBatchSupported()) { @@ -43,12 +41,7 @@ std::string GetSoftmaxKernelCode( args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); std::string c = GetCommonDefines(op_def.precision); - std::string linked_args = GetArgsDeclaration(linked_operations); - if (linked_args[0] == ',') { - linked_args[0] = ' '; - } c += "__kernel void main_function(\n"; - c += linked_args; c += "$0) {\n"; c += " int X = get_global_id(0);\n"; c += " int Y = get_global_id(1);\n"; @@ -66,7 +59,6 @@ std::string GetSoftmaxKernelCode( c += " float4 t = args.src_tensor.Read(X, Y, d);\n"; c += " t = exp(t) / sum;\n"; c += " FLT4 result = TO_FLT4(t);\n"; - c += PostProcess(linked_operations, {"result", "X", "Y", "d"}); c += " args.dst_tensor.Write(result, X, Y, d);\n"; c += " }\n"; c += "}\n"; @@ -89,11 +81,13 @@ Softmax& Softmax::operator=(Softmax&& kernel) { } absl::Status Softmax::Compile(const CreationContext& creation_context) { - std::string code = - GetSoftmaxKernelCode(definition_, linked_operations_, &args_); + std::string code = GetSoftmaxKernelCode(definition_, &args_); + std::string element_wise_code; RETURN_IF_ERROR( - args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code)); - code = absl::Substitute(code, args_.GetListOfArgs()); + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); @@ -102,10 +96,8 @@ absl::Status Softmax::Compile(const CreationContext& creation_context) { absl::Status Softmax::BindArguments() { RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); - return absl::OkStatus(); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Softmax::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index 8c5af6e9785..e12c44566b7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -18,7 +18,6 @@ limitations under the License. 
#include #include "absl/strings/str_cat.h" -#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" @@ -127,7 +126,6 @@ absl::Status Transpose::Compile(const CreationContext& creation_context) { RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), {{"dst_tensor", element_wise_code}}, &code)); - code = absl::Substitute(code, args_.GetListOfArgs()); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); @@ -137,8 +135,7 @@ absl::Status Transpose::BindArguments() { RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); - RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); - return absl::OkStatus(); + return args_.Bind(kernel_.kernel()); } int3 Transpose::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index aa47e3a1c24..d71513e4de4 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include "absl/strings/str_format.h" -#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" @@ -36,7 +35,6 @@ namespace { std::string GetWinograd4x4To36Code( const OperationDef& op_def, - const std::vector& linked_operations, Arguments* args) { std::string c = GetCommonDefines(op_def.precision); @@ -82,12 +80,7 @@ std::string GetWinograd4x4To36Code( args->AddInt("tiles_total"); args->AddInt("tiles_x"); - std::string linked_args = GetArgsDeclaration(linked_operations); - if (linked_args[0] == ',') { - linked_args[0] = ' '; - } c += "__kernel void main_function(\n"; - c += linked_args; c += "$0) {\n"; c += " int DST_X = get_global_id(0);\n"; c += " int DST_Y = get_global_id(1);\n"; @@ -181,14 +174,12 @@ std::string GetWinograd4x4To36Code( const LinkingContext context{"r0", "DST_X", "DST_Y", "DST_Z"}; c += " {\n"; c += " FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n"; - c += PostProcess(linked_operations, context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; c += " FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * " "I4);\n"; - c += PostProcess(linked_operations, context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; @@ -196,7 +187,6 @@ std::string GetWinograd4x4To36Code( c += " FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] " "* " "I4);\n"; - c += PostProcess(linked_operations, context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; @@ -204,7 +194,6 @@ std::string GetWinograd4x4To36Code( c += " FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] " "* " "I4);\n"; - c += PostProcess(linked_operations, context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; @@ -212,13 +201,11 @@ std::string GetWinograd4x4To36Code( c += " FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] " "* " "I4);\n"; - c += PostProcess(linked_operations, 
context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; c += " FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n"; - c += PostProcess(linked_operations, context); c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n"; c += " DST_Y++;\n"; c += " }\n"; @@ -389,11 +376,13 @@ absl::Status Winograd4x4To36::Compile(const CreationContext& creation_context) { options.push_back(CompilerOptions::POWERVR_FP16); } RETURN_IF_ERROR(UploadBt(creation_context.context)); - std::string code = - GetWinograd4x4To36Code(definition_, linked_operations_, &args_); + std::string code = GetWinograd4x4To36Code(definition_, &args_); + std::string element_wise_code; RETURN_IF_ERROR( - args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code)); - code = absl::Substitute(code, args_.GetListOfArgs()); + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code, "main_function", options, *creation_context.context, *creation_context.device, &kernel_)); @@ -445,10 +434,8 @@ absl::Status Winograd4x4To36::BindArguments() { RETURN_IF_ERROR(args_.SetInt("padding_y", -padding_.prepended.h)); RETURN_IF_ERROR(args_.SetInt("tiles_total", tiles_total)); RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x)); - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); - return absl::OkStatus(); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Winograd4x4To36::GetGridSize() const { From 4aef940d519fb79f4b4c0b44fa4adf268b396183 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 15 Jun 2020 16:38:27 -0700 Subject: [PATCH 0233/1390] Select default implementation via DeviceIndex For DeviceIndex that feeds into Case, simply select the default implementation for now. PiperOrigin-RevId: 316570448 Change-Id: Ic088cfe5a2da6953526a46e4f51e69ec42c47e0f --- tensorflow/compiler/mlir/lite/BUILD | 1 + .../lite/tests/tf_device_index_selector.mlir | 25 ++++++ .../compiler/mlir/lite/tf_tfl_passes.cc | 1 + .../lite/transforms/device_index_selector.cc | 85 +++++++++++++++++++ .../compiler/mlir/lite/transforms/passes.h | 3 + 5 files changed, 115 insertions(+) create mode 100644 tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir create mode 100644 tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8d4efeb3d60..8e9d615053c 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -314,6 +314,7 @@ tf_cc_test( cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ + "transforms/device_index_selector.cc", "transforms/dilated_conv.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", diff --git a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir b/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir new file mode 100644 index 00000000000..1ac7f30d644 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir @@ -0,0 +1,25 @@ +// Test DeviceIndex selector. 
+ +// RUN: tf-opt --tfl-device-index-selector %s | FileCheck %s + +// CHECK-LABEL: func @select +func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + // CHECK: %[[first:.*]] = "tf.DeviceIndex" + // CHECK: constant dense<2> + // CHECK: return %[[first]], + %0 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor + %1 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor + %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>]} : (tensor, tensor, tensor) -> tensor + + return %0, %4 : tensor, tensor +} + +func @add(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @sub(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Sub"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 06fe8684ce4..589515d6246 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -63,6 +63,7 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, standard_pipeline_options.enable_inliner = false; standard_pipeline_options.form_clusters = pass_config.form_clusters; mlir::TF::CreateTFStandardPipeline(*pass_manager, standard_pipeline_options); + pass_manager->addPass(mlir::TFL::CreateDeviceIndexSelectorPass()); if (pass_config.shape_inference) { pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); diff --git a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc new file mode 100644 index 00000000000..d4aed750dc8 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Converts DeviceIndex to constant device. + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { +namespace { + +// Folds the DeviceIndex op to a constant value. The DeviceIndex return the +// index of the device the op should run on. The user can use this to provide +// different op specializations. 
E.g., +// +// ```mlir +// %1 = "tf.DeviceIndex"() +// {device = "", device_names = ["CPU", "GPU"]} : () -> tensor +// %4 = "tf.Case"(%1, %arg0, %arg1) +// {branches = [@foo, @baz], output_shapes = [#tf.shape<>]} : +// (tensor, tensor, tensor) -> tensor +// ``` +// +// Shows an example where there are 2 different functions which could be +// executed to produce the same values but with different functions optimized +// for CPU or GPU. +struct DeviceIndexSelector + : public PassWrapper> { + void runOnOperation() override; +}; + +} // namespace + +void DeviceIndexSelector::runOnOperation() { + FuncOp func = getOperation(); + // Convert all the DeviceIndex ops to constant values. + func.getBody().walk([](TF::DeviceIndexOp op) { + // This just selects the default in all cases where DeviceIndex feeds into + // tf.Case. This could be enhanced based on explicit TFLite specification or + // TAC in future. + OpBuilder b(op); + RankedTensorType type = RankedTensorType::get({}, b.getIntegerType(32)); + int index = op.device_names().size(); + for (auto use : op.getOperation()->getUsers()) { + // Skip if it doesn't feed into case. Alternatively this could always + // return the CPU device index if it exists. + if (!isa(use)) return; + } + DenseElementsAttr attr = + DenseElementsAttr::get(type, b.getI32IntegerAttr(index)); + auto constant = b.create(op.getLoc(), type, attr); + op.replaceAllUsesWith(constant.getOperation()); + op.erase(); + }); +} + +// Creates an instance of the TensorFlow DeviceIndex selector pass. +std::unique_ptr> CreateDeviceIndexSelectorPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfl-device-index-selector", "Fold tf.DeviceIndex to constant"); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 105c9394fb4..01e5eb1cb68 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,6 +91,9 @@ std::unique_ptr> CreateWhileOutlinePass(); // Verifies runtime constraints. std::unique_ptr> CreateRuntimeVerifyPass(); +// Creates function pass to select device index/fold tf.DeviceIndex. +std::unique_ptr> CreateDeviceIndexSelectorPass(); + } // namespace TFL } // namespace mlir From bfbec5fb039001aaf9ae7bff05d3dee3bbf54d82 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 16:39:39 -0700 Subject: [PATCH 0234/1390] Add a configuration option to BaseLayer/V1 that will throw an exception if a user tries to reconstitute a model with these layer objects in it but does not provide the base class. This CL also changes SavedModel serialization code to use the Keras serialization name (which is different from the class name when using the register_keras_serializable decorator). This should fully enable custom layers in SavedModels. 
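As an illustrative aside, a minimal end-to-end sketch of the workflow this enables; the `Scale` layer, the 'Testing' package name, and the save path are invented here and are not part of the change (the real coverage is in the saved_model_test.py hunks further down).

    import tensorflow as tf

    @tf.keras.utils.register_keras_serializable(package='Testing')
    class Scale(tf.keras.layers.Layer):
      """Toy custom layer that doubles its input."""

      def call(self, inputs):
        return inputs * 2.0

    inputs = tf.keras.Input(shape=(4,))
    model = tf.keras.Model(inputs, Scale()(inputs))
    model.save('/tmp/scaled_model')  # TF SavedModel format

    # In a fresh process, importing the module that defines (and thereby
    # registers) Scale lets the loader revive the layer as a real Scale
    # instance; with this change the layer should be serialized under its
    # registered name ('Testing>Scale') rather than the bare class name.
    restored = tf.keras.models.load_model('/tmp/scaled_model')
    assert any(type(layer).__name__ == 'Scale' for layer in restored.layers)
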
PiperOrigin-RevId: 316570643 Change-Id: I5755a59cb74db8bbe6c3f607804abfb67c9ea8e0 --- tensorflow/python/keras/engine/base_layer.py | 29 +++++++++++++--- .../saving/saved_model/layer_serialization.py | 10 ++++-- .../python/keras/saving/saved_model/load.py | 14 +++++++- .../saving/saved_model/saved_model_test.py | 34 +++++++++++++++++++ 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 87f306b2879..a0ee25417c0 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -286,16 +286,34 @@ class Layer(module.Module, version_utils.LayerVersionSelector): module.Module._TF_MODULE_IGNORED_PROPERTIES )) + # When loading from a SavedModel, Layers typically can be revived into a + # generic Layer wrapper. Sometimes, however, layers may implement methods + # that go beyond this wrapper, as in the case of PreprocessingLayers' + # `adapt` method. When this is the case, layer implementers can override + # must_restore_from_config to return True; layers with this property must + # be restored into their actual objects (and will fail if the object is + # not available to the restoration code). + _must_restore_from_config = False + @trackable.no_automatic_dependency_tracking - def __init__(self, trainable=True, name=None, dtype=None, dynamic=False, + def __init__(self, + trainable=True, + name=None, + dtype=None, + dynamic=False, **kwargs): # These properties should be set by the user via keyword arguments. # note that 'dtype', 'input_shape' and 'batch_input_shape' # are only applicable to input layers: do not pass these keywords # to non-input layers. allowed_kwargs = { - 'input_dim', 'input_shape', 'batch_input_shape', 'batch_size', - 'weights', 'activity_regularizer', 'autocast' + 'input_dim', + 'input_shape', + 'batch_input_shape', + 'batch_size', + 'weights', + 'activity_regularizer', + 'autocast', } # Validate optional keyword arguments. generic_utils.validate_kwargs(kwargs, allowed_kwargs) @@ -637,7 +655,10 @@ class Layer(module.Module, version_utils.LayerVersionSelector): Python dictionary. """ all_args = tf_inspect.getfullargspec(self.__init__).args - config = {'name': self.name, 'trainable': self.trainable} + config = { + 'name': self.name, + 'trainable': self.trainable, + } if hasattr(self, '_batch_input_shape'): config['batch_input_shape'] = self._batch_input_shape config['dtype'] = policy.serialize(self._dtype_policy) diff --git a/tensorflow/python/keras/saving/saved_model/layer_serialization.py b/tensorflow/python/keras/saving/saved_model/layer_serialization.py index 0de38577dc0..559b6158d87 100644 --- a/tensorflow/python/keras/saving/saved_model/layer_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/layer_serialization.py @@ -46,13 +46,15 @@ class LayerSavedModelSaver(base_serialization.SavedModelSaver): # TODO(kathywu): Synchronize with the keras spec (go/keras-json-spec) once # the python config serialization has caught up. 
metadata = dict( - class_name=type(self.obj).__name__, + class_name=generic_utils.get_registered_name(type(self.obj)), name=self.obj.name, trainable=self.obj.trainable, expects_training_arg=self.obj._expects_training_arg, # pylint: disable=protected-access dtype=policy.serialize(self.obj._dtype_policy), # pylint: disable=protected-access batch_input_shape=getattr(self.obj, '_batch_input_shape', None), - stateful=self.obj.stateful) + stateful=self.obj.stateful, + must_restore_from_config=self.obj._must_restore_from_config, # pylint: disable=protected-access + ) metadata.update(get_config(self.obj)) if self.obj.input_spec is not None: @@ -85,7 +87,8 @@ class LayerSavedModelSaver(base_serialization.SavedModelSaver): serialized_attr = keras_cache[self.obj] = ( serialized_attributes.SerializedAttributes.new(self.obj)) - if save_impl.should_skip_serialization(self.obj): + if (save_impl.should_skip_serialization(self.obj) or + self.obj._must_restore_from_config): # pylint: disable=protected-access return serialized_attr object_dict, function_dict = self._get_serialized_attributes_internal( @@ -128,6 +131,7 @@ class InputLayerSavedModelSaver(base_serialization.SavedModelSaver): @property def python_properties(self): + return dict( class_name=type(self.obj).__name__, name=self.obj.name, diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 7e67bf6305c..313eea4342e 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -407,6 +407,7 @@ class KerasObjectLoader(tf_load.Loader): # found. class_name = metadata.get('class_name') config = metadata.get('config') + must_restore_from_config = metadata.get('must_restore_from_config') if not generic_utils.validate_config(config): return None @@ -414,7 +415,18 @@ class KerasObjectLoader(tf_load.Loader): obj = layers_module.deserialize( generic_utils.serialize_keras_class_and_config(class_name, config)) except ValueError: - return None + if must_restore_from_config: + raise RuntimeError( + 'Unable to restore a layer of class {cls}. Layers of ' + 'class {cls} require that the class be provided to ' + 'the model loading code, either by registering the ' + 'class using @keras.utils.register_keras_serializable ' + 'on the class def and including that file in your ' + 'program, or by passing the class in a ' + 'keras.utils.CustomObjectScope that wraps this load ' + 'call.'.format(cls=class_name)) + else: + return None # Use the dtype, name, and trainable status. Often times these are not # specified in custom configs, so retrieve their values from the metadata. diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index c208805686d..7eaa75b78e2 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -108,6 +108,11 @@ class LayerWithUpdate(keras.layers.Layer): return inputs * 2. 
+@generic_utils.register_keras_serializable('Testing') +class GlobalLayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + @keras_parameterized.run_all_keras_modes class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): @@ -331,6 +336,35 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.assertAllEqual([None, 2, 3], loaded.input_spec['b'].shape) self.assertEqual('float16', loaded.input_spec['b'].dtype) + def test_must_restore_from_config_fails_if_layer_is_not_in_scope(self): + + class LayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + layer = LayerThatShouldFailIfNotAdded() + saved_model_dir = self._save_model_dir() + tf_save.save(layer, saved_model_dir) + with self.assertRaisesRegex(RuntimeError, 'Unable to restore a layer of'): + _ = keras_load.load(saved_model_dir) + + def test_must_restore_from_config_custom_object_scope(self): + + class LayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + layer = LayerThatShouldFailIfNotAdded() + saved_model_dir = self._save_model_dir() + tf_save.save(layer, saved_model_dir) + with generic_utils.CustomObjectScope( + {'LayerThatShouldFailIfNotAdded': LayerThatShouldFailIfNotAdded}): + _ = keras_load.load(saved_model_dir) + + def test_must_restore_from_config_registration(self): + layer = GlobalLayerThatShouldFailIfNotAdded() + saved_model_dir = self._save_model_dir() + tf_save.save(layer, saved_model_dir) + _ = keras_load.load(saved_model_dir) + def test_multi_input_model(self): input_1 = keras.layers.Input(shape=(3,)) input_2 = keras.layers.Input(shape=(5,)) From 7a43680c0e69032e0c8025af45d23de4e5f6cc44 Mon Sep 17 00:00:00 2001 From: Jay Shi Date: Mon, 15 Jun 2020 16:44:00 -0700 Subject: [PATCH 0235/1390] [tf.data] Fix some small typo in `parallel_interleave_dataset_op`. 
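Context, inferred from the hunk below rather than spelled out above: the "typo" is that `buffer_output_elements` was validated but never parsed from the op's input, and that a value of 0 was accepted; the fix adds the missing ParseScalarArgument call and requires the value to be positive or autotune. For illustration only, a rough pipeline that exercises this parallel interleave kernel from the Python side, leaving those inputs at their autotune defaults, looks like:

    import tensorflow as tf

    # Parallel interleave; buffer_output_elements / prefetch_input_elements are
    # inputs of the underlying dataset op and are left to autotune here.
    ds = tf.data.Dataset.range(4).interleave(
        lambda x: tf.data.Dataset.range(x, x + 2),
        cycle_length=2,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    print(list(ds.as_numpy_iterator()))  # [0, 1, 1, 2, 2, 3, 3, 4]
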
PiperOrigin-RevId: 316571471 Change-Id: Ie4b11e5256b88086a7efb3c87077936889a3fe10 --- .../core/kernels/data/parallel_interleave_dataset_op.cc | 6 ++++-- .../kernels/data/parallel_interleave_dataset_op_test.cc | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index 8aca5005789..ac82666866d 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -1544,11 +1544,13 @@ void ParallelInterleaveDatasetOp::MakeDataset(OpKernelContext* ctx, int64 buffer_output_elements = model::kAutotune; int64 prefetch_input_elements = model::kAutotune; if (op_version_ >= 4) { + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kBufferOutputElements, + &buffer_output_elements)); OP_REQUIRES(ctx, buffer_output_elements == model::kAutotune || - buffer_output_elements >= 0, + buffer_output_elements > 0, errors::InvalidArgument("`buffer_output_elements` must be ", - model::kAutotune, " or >= 0 but is ", + model::kAutotune, " or > 0 but is ", buffer_output_elements)); OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kPrefetchInputElements, diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc index 95a0d121f92..489a6a2a4b6 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op_test.cc @@ -159,7 +159,7 @@ ParallelInterleaveDatasetParams ParallelInterleaveDatasetParams2() { /*other_arguments=*/{}, /*cycle_length=*/2, /*block_length=*/1, - /*buffer_output_elements=*/0, + /*buffer_output_elements=*/1, /*prefetch_input_elements=*/0, /*num_parallel_calls=*/2, /*func=*/ @@ -184,7 +184,7 @@ ParallelInterleaveDatasetParams ParallelInterleaveDatasetParams3() { /*other_arguments=*/{}, /*cycle_length=*/3, /*block_length=*/1, - /*buffer_output_elements=*/0, + /*buffer_output_elements=*/1, /*prefetch_input_elements=*/1, /*num_parallel_calls=*/2, /*func=*/ @@ -488,7 +488,7 @@ ParallelInterleaveDatasetParamsWithInvalidBufferOutputElements() { /*other_arguments=*/{}, /*cycle_length=*/1, /*block_length=*/1, - /*buffer_output_elements=*/-2, + /*buffer_output_elements=*/model::kAutotune, /*prefetch_input_elements=*/model::kAutotune, /*num_parallel_calls=*/-5, /*func=*/ @@ -514,7 +514,7 @@ ParallelInterleaveDatasetParamsWithInvalidPrefetchInputElements() { /*other_arguments=*/{}, /*cycle_length=*/1, /*block_length=*/1, - /*buffer_output_elements=*/-2, + /*buffer_output_elements=*/model::kAutotune, /*prefetch_input_elements=*/model::kAutotune, /*num_parallel_calls=*/-5, /*func=*/ From 5bc13320c9cab1dbca9348412ecc647ef62bebc7 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 15 Jun 2020 16:51:56 -0700 Subject: [PATCH 0236/1390] Re-enable depthwise_conv_2d_test on --config=msan in XNNPACK PiperOrigin-RevId: 316572700 Change-Id: I0fe943907f18aa7d851cfb0e7e66e7b140b3079d --- tensorflow/lite/delegates/xnnpack/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index efbaf0cfc42..5736a2995b1 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -318,7 +318,6 @@ cc_test( "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS, "//conditions:default": [], }), - tags = ["nomsan"], # b/145129478 deps = [ 
":depthwise_conv_2d_tester", ":test_main", From 6f0425cd06fd66c0686e376e17f3d9c23d773b86 Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 15 Jun 2020 16:58:43 -0700 Subject: [PATCH 0237/1390] Unidirectional sequence LSTM: Use constants for tensor indices in lstm_shared.h instead of redefining them. PiperOrigin-RevId: 316573775 Change-Id: I2de5264e4ab3375344dfe3132b6cde7b6cb0226c --- .../kernels/unidirectional_sequence_lstm.cc | 192 +++++++----------- 1 file changed, 76 insertions(+), 116 deletions(-) diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc index a6fe785ce53..0552885f720 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_utils.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/lstm_eval.h" +#include "tensorflow/lite/kernels/lstm_shared.h" namespace tflite { namespace ops { @@ -36,53 +37,6 @@ struct OpData { bool compute_row_sums = false; }; -// Input Tensors of size {max_time, n_batch, n_input} -constexpr int kInputTensor = 0; - -// Input weight tensors of size: {n_cell, n_input} -constexpr int kInputToInputWeightsTensor = 1; // Optional -constexpr int kInputToForgetWeightsTensor = 2; -constexpr int kInputToCellWeightsTensor = 3; -constexpr int kInputToOutputWeightsTensor = 4; - -// Recurrent weight tensors of size {n_cell, n_output} -constexpr int kRecurrentToInputWeightsTensor = 5; // Optional -constexpr int kRecurrentToForgetWeightsTensor = 6; -constexpr int kRecurrentToCellWeightsTensor = 7; -constexpr int kRecurrentToOutputWeightsTensor = 8; - -// Peephole weights tensors of size {n_cell}, representing a diagonal matrix. -constexpr int kCellToInputWeightsTensor = 9; // Optional -constexpr int kCellToForgetWeightsTensor = 10; // Optional -constexpr int kCellToOutputWeightsTensor = 11; // Optional - -// Gates bias tensors of size {n_cell} -constexpr int kInputGateBiasTensor = 12; // Optional -constexpr int kForgetGateBiasTensor = 13; -constexpr int kCellGateBiasTensor = 14; -constexpr int kOutputGateBiasTensor = 15; - -// Projection weight tensor of size {n_output, n_cell} -constexpr int kProjectionWeightsTensor = 16; // Optional -// Projection bias tensor of size {n_output} -constexpr int kProjectionBiasTensor = 17; // Optional - -// Stateful input tensors that are variables and will be modified by the Op. -// Activation state tensor of size {n_batch, n_output} -constexpr int kInputActivationStateTensor = 18; -// Cell state tensor of size {n_batch, n_cell} -constexpr int kInputCellStateTensor = 19; - -// Layer norm coefficient tensors of size {n_cell}, representing a diagonal -// matrix. -constexpr int kInputLayerNormCoefficientsTensor = 20; // Optional -constexpr int kForgetLayerNormCoefficientsTensor = 21; // Optional -constexpr int kCellLayerNormCoefficientsTensor = 22; // Optional -constexpr int kOutputLayerNormCoefficientsTensor = 23; // Optional - -// Output tensors. 
-constexpr int kOutputTensor = 0; - // Temporary tensors enum TemporaryTensor { kScratchBuffer = 0, @@ -122,8 +76,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE(context, params->cell_clip >= 0); TF_LITE_ENSURE(context, params->proj_clip >= 0); - const TfLiteTensor* input_to_input_weights = - GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kInputToInputWeightsTensor); if (input_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); @@ -131,19 +85,19 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, } const TfLiteTensor* input_to_forget_weights = - GetInput(context, node, kInputToForgetWeightsTensor); + GetInput(context, node, lstm::full::kInputToForgetWeightsTensor); TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input); const TfLiteTensor* input_to_cell_weights = - GetInput(context, node, kInputToCellWeightsTensor); + GetInput(context, node, lstm::full::kInputToCellWeightsTensor); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input); - const TfLiteTensor* recurrent_to_input_weights = - GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); + const TfLiteTensor* recurrent_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kRecurrentToInputWeightsTensor); if (recurrent_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0], @@ -153,7 +107,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, } const TfLiteTensor* recurrent_to_forget_weights = - GetInput(context, node, kRecurrentToForgetWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToForgetWeightsTensor); TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0], n_cell); @@ -161,7 +115,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, n_output); const TfLiteTensor* recurrent_to_cell_weights = - GetInput(context, node, kRecurrentToCellWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToCellWeightsTensor); TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1], @@ -176,22 +130,22 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, (recurrent_to_input_weights == nullptr)); TF_LITE_ENSURE(context, cifg_weights_all_or_none == true); - const TfLiteTensor* cell_to_input_weights = - GetOptionalInputTensor(context, node, kCellToInputWeightsTensor); + const TfLiteTensor* cell_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToInputWeightsTensor); if (cell_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell); } - const TfLiteTensor* 
cell_to_forget_weights = - GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor); + const TfLiteTensor* cell_to_forget_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToForgetWeightsTensor); if (cell_to_forget_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell); } - const TfLiteTensor* cell_to_output_weights = - GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor); + const TfLiteTensor* cell_to_output_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToOutputWeightsTensor); if (cell_to_output_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell); @@ -210,7 +164,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, // Make sure the input gate bias is present only when not a CIFG-LSTM. const TfLiteTensor* input_gate_bias = - GetOptionalInputTensor(context, node, kInputGateBiasTensor); + GetOptionalInputTensor(context, node, lstm::full::kInputGateBiasTensor); if (use_cifg) { TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr); } else { @@ -219,21 +173,22 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, } const TfLiteTensor* forget_gate_bias = - GetInput(context, node, kForgetGateBiasTensor); + GetInput(context, node, lstm::full::kForgetGateBiasTensor); TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell); - const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); + const TfLiteTensor* cell_bias = + GetInput(context, node, lstm::full::kCellGateBiasTensor); TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell); const TfLiteTensor* output_gate_bias = - GetInput(context, node, kOutputGateBiasTensor); + GetInput(context, node, lstm::full::kOutputGateBiasTensor); TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell); - const TfLiteTensor* projection_weights = - GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + const TfLiteTensor* projection_weights = GetOptionalInputTensor( + context, node, lstm::full::kProjectionWeightsTensor); if (projection_weights != nullptr) { TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output); @@ -241,7 +196,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, } const TfLiteTensor* projection_bias = - GetOptionalInputTensor(context, node, kProjectionBiasTensor); + GetOptionalInputTensor(context, node, lstm::full::kProjectionBiasTensor); if (projection_bias != nullptr) { TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); @@ -258,7 +213,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, if (is_layer_norm_lstm) { const TfLiteTensor* input_layer_norm_coefficients = GetOptionalInputTensor( - context, node, kInputLayerNormCoefficientsTensor); + context, node, lstm::full::kInputLayerNormCoefficientsTensor); if (use_cifg) { TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients, nullptr); } else { @@ -271,7 +226,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, } const TfLiteTensor* forget_layer_norm_coefficients 
= - GetInput(context, node, kForgetLayerNormCoefficientsTensor); + GetInput(context, node, lstm::full::kForgetLayerNormCoefficientsTensor); TF_LITE_ENSURE(context, forget_layer_norm_coefficients != nullptr); TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0], @@ -280,7 +235,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, kTfLiteFloat32); const TfLiteTensor* cell_layer_norm_coefficients = - GetInput(context, node, kCellLayerNormCoefficientsTensor); + GetInput(context, node, lstm::full::kCellLayerNormCoefficientsTensor); TF_LITE_ENSURE(context, cell_layer_norm_coefficients != nullptr); TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0], @@ -289,7 +244,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, kTfLiteFloat32); const TfLiteTensor* output_layer_norm_coefficients = - GetInput(context, node, kOutputLayerNormCoefficientsTensor); + GetInput(context, node, lstm::full::kOutputLayerNormCoefficientsTensor); TF_LITE_ENSURE(context, output_layer_norm_coefficients != nullptr); TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0], @@ -312,7 +267,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { bool is_layer_norm_lstm = false; if (node->inputs->size == 24) { const TfLiteTensor* forget_layer_norm_coefficients = GetOptionalInputTensor( - context, node, kForgetLayerNormCoefficientsTensor); + context, node, lstm::full::kForgetLayerNormCoefficientsTensor); if (forget_layer_norm_coefficients == nullptr) { is_layer_norm_lstm = false; } else { @@ -332,7 +287,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Inferring batch size, number of outputs and sequence length and // number of cells from the input tensors. - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* input = GetInput(context, node, lstm::full::kInputTensor); TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE(context, input->dims->size > 1); const auto* params = @@ -343,13 +298,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int n_input = input->dims->data[2]; const TfLiteTensor* input_to_output_weights = - GetInput(context, node, kInputToOutputWeightsTensor); + GetInput(context, node, lstm::full::kInputToOutputWeightsTensor); const int n_cell = input_to_output_weights->dims->data[0]; TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input); const TfLiteTensor* recurrent_to_output_weights = - GetInput(context, node, kRecurrentToOutputWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToOutputWeightsTensor); TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0], n_cell); @@ -361,13 +316,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { n_cell, is_layer_norm_lstm)); // Get the pointer to output, activation_state and cell_state buffer tensors. 
- TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = GetOutput(context, node, lstm::full::kOutputTensor); TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); + GetVariableInput(context, node, lstm::full::kInputActivationStateTensor); TF_LITE_ENSURE(context, activation_state != nullptr); TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); + GetVariableInput(context, node, lstm::full::kInputCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); // Check the shape of input state tensors. @@ -395,8 +350,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { scratch_buffer->type = input->type; scratch_buffer->allocation_type = kTfLiteArenaRw; - const TfLiteTensor* input_to_input_weights = - GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kInputToInputWeightsTensor); const bool use_cifg = (input_to_input_weights == nullptr); TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); scratch_buffer_size->data[0] = n_batch; @@ -534,8 +489,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { row_sums->type = kTfLiteInt32; row_sums->allocation_type = kTfLiteArenaRwPersistent; int row_sums_rows = use_cifg ? 6 : 8; - const TfLiteTensor* projection_weights = - GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + const TfLiteTensor* projection_weights = GetOptionalInputTensor( + context, node, lstm::full::kProjectionWeightsTensor); if (projection_weights != nullptr) { row_sums_rows += ceil(static_cast(n_output) / n_cell); } @@ -558,74 +513,79 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const OpData* op_data = reinterpret_cast(node->user_data); const bool is_layer_norm_lstm = op_data->is_layer_norm_lstm; const bool time_major = params->time_major; - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* input = GetInput(context, node, lstm::full::kInputTensor); - const TfLiteTensor* input_to_input_weights = - GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kInputToInputWeightsTensor); const TfLiteTensor* input_to_forget_weights = - GetInput(context, node, kInputToForgetWeightsTensor); + GetInput(context, node, lstm::full::kInputToForgetWeightsTensor); const TfLiteTensor* input_to_cell_weights = - GetInput(context, node, kInputToCellWeightsTensor); + GetInput(context, node, lstm::full::kInputToCellWeightsTensor); const TfLiteTensor* input_to_output_weights = - GetInput(context, node, kInputToOutputWeightsTensor); + GetInput(context, node, lstm::full::kInputToOutputWeightsTensor); - const TfLiteTensor* recurrent_to_input_weights = - GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); + const TfLiteTensor* recurrent_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kRecurrentToInputWeightsTensor); const TfLiteTensor* recurrent_to_forget_weights = - GetInput(context, node, kRecurrentToForgetWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToForgetWeightsTensor); const TfLiteTensor* recurrent_to_cell_weights = - GetInput(context, node, kRecurrentToCellWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToCellWeightsTensor); const TfLiteTensor* recurrent_to_output_weights = - 
GetInput(context, node, kRecurrentToOutputWeightsTensor); + GetInput(context, node, lstm::full::kRecurrentToOutputWeightsTensor); - const TfLiteTensor* cell_to_input_weights = - GetOptionalInputTensor(context, node, kCellToInputWeightsTensor); - const TfLiteTensor* cell_to_forget_weights = - GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor); - const TfLiteTensor* cell_to_output_weights = - GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor); + const TfLiteTensor* cell_to_input_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToInputWeightsTensor); + const TfLiteTensor* cell_to_forget_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToForgetWeightsTensor); + const TfLiteTensor* cell_to_output_weights = GetOptionalInputTensor( + context, node, lstm::full::kCellToOutputWeightsTensor); const TfLiteTensor* input_gate_bias = - GetOptionalInputTensor(context, node, kInputGateBiasTensor); + GetOptionalInputTensor(context, node, lstm::full::kInputGateBiasTensor); const TfLiteTensor* forget_gate_bias = - GetInput(context, node, kForgetGateBiasTensor); - const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); + GetInput(context, node, lstm::full::kForgetGateBiasTensor); + const TfLiteTensor* cell_bias = + GetInput(context, node, lstm::full::kCellGateBiasTensor); const TfLiteTensor* output_gate_bias = - GetInput(context, node, kOutputGateBiasTensor); + GetInput(context, node, lstm::full::kOutputGateBiasTensor); - const TfLiteTensor* projection_weights = - GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + const TfLiteTensor* projection_weights = GetOptionalInputTensor( + context, node, lstm::full::kProjectionWeightsTensor); const TfLiteTensor* projection_bias = - GetOptionalInputTensor(context, node, kProjectionBiasTensor); + GetOptionalInputTensor(context, node, lstm::full::kProjectionBiasTensor); // Index the scratch buffers pointers to the global scratch buffer. TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); + GetVariableInput(context, node, lstm::full::kInputActivationStateTensor); TF_LITE_ENSURE(context, activation_state != nullptr); TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); + GetVariableInput(context, node, lstm::full::kInputCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); const TfLiteTensor* input_layer_norm_coefficients = - is_layer_norm_lstm ? GetOptionalInputTensor( - context, node, kInputLayerNormCoefficientsTensor) - : nullptr; + is_layer_norm_lstm + ? GetOptionalInputTensor( + context, node, lstm::full::kInputLayerNormCoefficientsTensor) + : nullptr; const TfLiteTensor* forget_layer_norm_coefficients = is_layer_norm_lstm - ? GetInput(context, node, kForgetLayerNormCoefficientsTensor) + ? GetInput(context, node, + lstm::full::kForgetLayerNormCoefficientsTensor) : nullptr; const TfLiteTensor* cell_layer_norm_coefficients = is_layer_norm_lstm - ? GetInput(context, node, kCellLayerNormCoefficientsTensor) + ? GetInput(context, node, + lstm::full::kCellLayerNormCoefficientsTensor) : nullptr; const TfLiteTensor* output_layer_norm_coefficients = is_layer_norm_lstm - ? GetInput(context, node, kOutputLayerNormCoefficientsTensor) + ? 
GetInput(context, node, + lstm::full::kOutputLayerNormCoefficientsTensor) : nullptr; - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = GetOutput(context, node, lstm::full::kOutputTensor); // Copy out the LSTM specific params so they can be passed in the function. TfLiteLSTMParams lstm_params; From 304daeb37f56a12e0d0dd26654893e9e6cf840eb Mon Sep 17 00:00:00 2001 From: Jing Pu Date: Mon, 15 Jun 2020 17:03:42 -0700 Subject: [PATCH 0238/1390] Fix the tests that would be otherwise broken by https://reviews.llvm.org/D80258. Previously, the tests relied on the misbehavior that col zero pointing to the first column that will be fixed by the LLVM patch. PiperOrigin-RevId: 316574635 Change-Id: I6a66de318081bc5ebee3f888e1f1601fc3de9af3 --- .../aot/tests/test_error_message.lit.pbtxt.debug.pbtxt | 4 ++++ .../graphdef2mlir/error-message-with-source-info.pbtxt.debug | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt index 7acc8287950..0820ebfd040 100644 --- a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt +++ b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt @@ -4,6 +4,7 @@ traces: { value: { file_line_cols: { line: 1 + col: 1 } } } @@ -12,9 +13,11 @@ traces: { value: { file_line_cols: { line: 3 + col: 1 } file_line_cols: { line: 4 + col: 1 } } } @@ -23,6 +26,7 @@ traces: { value: { file_line_cols: { line: 2 + col: 1 } } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt.debug b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt.debug index 881a01f3a3a..2e1afeb2329 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt.debug +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/error-message-with-source-info.pbtxt.debug @@ -4,6 +4,7 @@ traces: { value: { file_line_cols: { line : 1 + col : 1 } } } @@ -12,9 +13,11 @@ traces: { value: { file_line_cols: { line : 3 + col : 1 } file_line_cols: { line : 4 + col : 1 } } } @@ -23,6 +26,7 @@ traces: { value: { file_line_cols: { line : 2 + col : 1 } } } From 0cc6210daa35247daf7f4cc98c115de611d2d05f Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Mon, 15 Jun 2020 17:13:45 -0700 Subject: [PATCH 0239/1390] Cache DataHandler in `model.evaluate` to avoid function retracing between epochs in `model.fit`. PiperOrigin-RevId: 316576844 Change-Id: Icf85ce6830b69a003c1f2ebf41f8c70258504afd --- tensorflow/python/keras/engine/training.py | 55 +++++++++++++------ .../python/keras/engine/training_test.py | 22 ++++++++ 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index b7a4795d768..5567e1733a7 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -1040,6 +1040,10 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): data_adapter.train_validation_split( (x, y, sample_weight), validation_split=validation_split)) + if validation_data: + val_x, val_y, val_sample_weight = ( + data_adapter.unpack_x_y_sample_weight(validation_data)) + with self.distribute_strategy.scope(), \ training_utils.RespectCompiledTrainableState(self): # Creates a `tf.data.Dataset` and handles batch and epoch iteration. 
@@ -1102,8 +1106,21 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): # Run validation. if validation_data and self._should_eval(epoch, validation_freq): - val_x, val_y, val_sample_weight = ( - data_adapter.unpack_x_y_sample_weight(validation_data)) + # Create data_handler for evaluation and cache it. + if getattr(self, '_eval_data_handler', None) is None: + self._eval_data_handler = data_adapter.DataHandler( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps_per_epoch=validation_steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution) val_logs = self.evaluate( x=val_x, y=val_y, @@ -1123,6 +1140,9 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): if self.stop_training: break + # If eval data_hanlder exists, delete it after all epochs are done. + if getattr(self, '_eval_data_handler', None) is not None: + del self._eval_data_handler callbacks.on_train_end(logs=training_logs) return self.history @@ -1318,20 +1338,23 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): _disallow_inside_tf_function('evaluate') with self.distribute_strategy.scope(): - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - data_handler = data_adapter.DataHandler( - x=x, - y=y, - sample_weight=sample_weight, - batch_size=batch_size, - steps_per_epoch=steps, - initial_epoch=0, - epochs=1, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - model=self, - steps_per_execution=self._steps_per_execution) + if getattr(self, '_eval_data_handler', None) is not None: + data_handler = self._eval_data_handler + else: + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.DataHandler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution) # Container that configures and calls `tf.keras.Callback`s. 
if not isinstance(callbacks, callbacks_module.CallbackList): diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index aa01463582c..5cf15926bfb 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -59,6 +59,7 @@ from tensorflow.python.ops import template from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.rmsprop import RMSPropOptimizer try: @@ -3538,5 +3539,26 @@ class TestAutoUpdates(keras_parameterized.TestCase): self.assertAllEqual(self.evaluate(bn.moving_variance), np.ones((10,))) +class TestFunctionTracing(keras_parameterized.TestCase): + + @keras_parameterized.run_all_keras_modes( + always_skip_v1=True, always_skip_eager=True) + def test_no_tracing_between_epoch(self): + if sys.version_info[0] < 3: + self.skipTest('self.assertLogs() call is not available in Python 2.') + + model = sequential.Sequential([layers_module.Dense(4, activation='relu')]) + model.compile(loss='mse', optimizer='rmsprop') + x = np.random.random((10, 6)) + y = np.random.random((10, 4)) + + logging.set_verbosity(1) + with self.assertLogs(level=1) as logs: + model.fit(x, y, epochs=10, batch_size=5, validation_data=(x, y)) + + new_func_graph = 'INFO:absl:Creating new FuncGraph for Python function' + self.assertEqual(sum(new_func_graph in log for log in logs.output), 9) + + if __name__ == '__main__': test.main() From ac695a31de1649a3d61e738b87e19ea6c558dada Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 15 Jun 2020 17:16:40 -0700 Subject: [PATCH 0240/1390] Add DT_BOOL support to GPU variable ops This is a follow-on to PR #38848 & PR #39172 and resolves remaining ask in Issue #35994. The original PR tried to add many variable ops on the GPU including DT_BOOL. However, this caused testCondModifyBoolPred to fail and thus the DT_BOOL type was removed. The reason for the test failure is once DT_BOOL variables are supported on the GPU, we need to ensure the switch ops are also updated to not have host memory requirement. Otherwise, a DT_BOOL ref variable is attempted to be copied to the GPU which fails since we should not be transfering ref types. PiperOrigin-RevId: 316577397 Change-Id: Ic0d96ed4cdf8a0ea4674889aaff3a8ecd50991dd --- tensorflow/core/kernels/control_flow_ops.cc | 10 +++++----- tensorflow/core/kernels/variable_ops.cc | 3 +-- .../debug/lib/debug_graph_reconstruction_test.py | 6 +++--- tensorflow/python/ops/control_flow_ops_test.py | 6 +++--- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index c8e83b6f672..435de3c5954 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -111,15 +111,17 @@ REGISTER_GPU_SWITCH(uint64); TF_CALL_variant(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_REF_SWITCH); +TF_CALL_bool(REGISTER_GPU_SWITCH); +TF_CALL_bool(REGISTER_GPU_REF_SWITCH); #undef REGISTER_CPU_SWITCH #undef REGISTER_CPU_REF_SWITCH #undef REGISTER_GPU_SWITCH #undef REGISTER_GPU_REF_SWITCH -// Special GPU kernels for int32 and string. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. 
+// Special GPU kernels for int32, string & resource handles. Requiring all +// inputs and outputs to be in host memory. +// TODO(b/25387198): Also enable int32 in device memory. #define REGISTER_GPU_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Switch") \ .Device(DEVICE_GPU) \ @@ -149,8 +151,6 @@ TF_CALL_uint32(REGISTER_GPU_REF_SWITCH); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_REF_KERNEL(int32); -REGISTER_GPU_HOST_KERNEL(bool); -REGISTER_GPU_HOST_REF_KERNEL(bool); REGISTER_GPU_HOST_KERNEL(tstring); REGISTER_GPU_HOST_REF_KERNEL(tstring); REGISTER_GPU_HOST_KERNEL(ResourceHandle); diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 6f5e0b94eca..ccd33e8c75a 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -252,8 +252,7 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); TF_CALL_int64(REGISTER_GPU_KERNELS); TF_CALL_uint32(REGISTER_GPU_KERNELS); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); -TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py index fb722efab4e..b3baa6e7bc2 100644 --- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py +++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py @@ -73,9 +73,9 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase): for attr_key in new_node.attr: if attr_key == "parallel_iterations": new_node.attr[attr_key].i = 1 - elif new_node.op == "Switch": - # We don't check the inputs to Switch ops as their inputs may be - # Send/Recv nodes. + elif new_node.op == "Switch" or new_node.op == "Identity": + # We don't check the inputs to Switch or Identity ops as their inputs + # may be Send/Recv nodes. del new_node.input[:] return output_graph_def diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index 9254695d988..3ca9bda82f2 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -396,10 +396,10 @@ class CondTest(test_util.TensorFlowTestCase): fn2=lambda: math_ops.add(y, 23)) self.assertEquals(self.evaluate(z), 24) - @test_util.run_deprecated_v1 + @test_util.run_v1_only("Exercises Ref variables") def testCondModifyBoolPred(self): - # This test in particular used to fail only when running in GPU, hence - # use_gpu=True. + # We want to use the GPU here because we want to ensure that we can update + # a boolean ref variable on the GPU. with test_util.use_gpu(): bool_var = variable_scope.get_variable( "bool_var", dtype=dtypes.bool, initializer=True) From 4ac4683f5ed3dca87afd799447a57ed81b5bfdcf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 17:17:28 -0700 Subject: [PATCH 0241/1390] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 316577541 Change-Id: I10368c7cc5267b32ac298011dd16abaaab671448 --- tensorflow/go/op/wrappers.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f118e2bd494..10acebc7965 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -9495,6 +9495,14 @@ func DebugIdentityV2CircularBufferSize(value int64) DebugIdentityV2Attr { } } +// DebugIdentityV2TfdbgRunId sets the optional tfdbg_run_id attribute to value. +// If not specified, defaults to "" +func DebugIdentityV2TfdbgRunId(value string) DebugIdentityV2Attr { + return func(m optionalAttr) { + m["tfdbg_run_id"] = value + } +} + // Debug Identity V2 Op. // // Provides an identity mapping from input to output, while writing the content of From c34265b348e2fccff165d33deceb91220d670dad Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 17:35:30 -0700 Subject: [PATCH 0242/1390] Integrate stackdriver support with tensorflow (roll forward cl/315012368) PiperOrigin-RevId: 316580813 Change-Id: I93a66821fd13dc24a45c47f3e67638aed81d59be --- .bazelrc | 2 ++ tensorflow/BUILD | 7 +++++++ tensorflow/core/BUILD | 4 ++-- tensorflow/python/BUILD | 4 ++++ tensorflow/tensorflow.bzl | 27 +++++++++++++++++++++++++-- tensorflow/workspace.bzl | 10 ++++++++++ 6 files changed, 50 insertions(+), 4 deletions(-) diff --git a/.bazelrc b/.bazelrc index 15d46120642..5ea8048d5d9 100644 --- a/.bazelrc +++ b/.bazelrc @@ -200,6 +200,8 @@ build:nogcp --define=no_gcp_support=true build:nohdfs --define=no_hdfs_support=true build:nonccl --define=no_nccl_support=true +build:stackdriver_support --define=stackdriver_support=true + build --define=use_fast_cpp_protos=true build --define=allow_oversize_protos=true diff --git a/tensorflow/BUILD b/tensorflow/BUILD index ce759634232..bd0619b0c05 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -298,6 +298,13 @@ config_setting( visibility = ["//visibility:public"], ) +# Experimental features +config_setting( + name = "stackdriver_support", + define_values = {"stackdriver_support": "true"}, + visibility = ["//visibility:public"], +) + # Crosses between platforms and file system libraries not supported on those # platforms due to limitations in nested select() statements. 
config_setting( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 2b16801f6ed..50f1f2527a5 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -100,7 +100,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") # buildifier: disable=same-origin-load -load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") +load("//tensorflow:tensorflow.bzl", "tf_monitoring_framework_deps") # For platform specific build config load( @@ -1950,7 +1950,7 @@ cc_library( "@zlib", "@double_conversion//:double-conversion", "@com_google_protobuf//:protobuf", - ] + tf_protos_all_impl() + tf_protos_grappler_impl() + tf_protos_profiler_impl() + tf_monitoring_deps(), + ] + tf_protos_all_impl() + tf_protos_grappler_impl() + tf_protos_profiler_impl() + tf_monitoring_framework_deps(), # Alwayslink causes a cc_binary to "always link" in the # srcs for a given cc_library, even if they are unreferenced, see: # https://docs.bazel.build/versions/master/be/c-cpp.html#cc_library.alwayslink diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a7338772589..87048ba9d40 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5,6 +5,9 @@ load("//tensorflow:tensorflow.bzl", "py_strict_library") load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "if_not_windows", "if_xla_available", "py_test", "py_tests", "tf_cc_shared_object", "tf_cuda_library", "tf_gen_op_wrapper_py") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_monitoring_python_deps") + # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") @@ -6048,6 +6051,7 @@ pywrap_tensorflow_macro( "//tensorflow/core/util/tensor_bundle", "//tensorflow/compiler/mlir/python:mlir", ] + (tf_additional_lib_deps() + + tf_monitoring_python_deps() + tf_additional_plugin_deps() + tf_additional_profiler_deps()) + if_ngraph([ "@ngraph_tf//:ngraph_tf", diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 2609f5a42cf..f97363a919e 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2864,8 +2864,31 @@ def if_cuda_or_rocm(if_true, if_false = []): "//conditions:default": if_false, }) -def tf_monitoring_deps(): - return [] +def tf_monitoring_framework_deps(link_to_tensorflow_framework = True): + """Get the monitoring libs that will be linked to the tensorflow framework. + + Currently in OSS, the protos must be statically linked to the tensorflow + framework, whereas the grpc should not be linked here. + """ + return select({ + "//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter_protos", + ], + "//conditions:default": [], + }) + +def tf_monitoring_python_deps(): + """Get the monitoring libs that will be linked to the python wrapper. + + Currently in OSS, the grpc must be statically linked to the python wrapper + whereas the protos should not be linked here. 
+ """ + return select({ + "//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter", + ], + "//conditions:default": [], + }) def tf_jit_compilation_passes_extra_deps(): return [] diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 6b0143e397f..60fef8c0cb9 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -351,6 +351,16 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "com_github_googlecloudplatform_tensorflow_gcp_tools", + sha256 = "5e9ebe17eaa2895eb7f77fefbf52deeda7c4b63f5a616916b823eb74f3a0c542", + strip_prefix = "tensorflow-gcp-tools-2643d8caeba6ca2a6a0b46bb123953cb95b7e7d5", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/GoogleCloudPlatform/tensorflow-gcp-tools/archive/2643d8caeba6ca2a6a0b46bb123953cb95b7e7d5.tar.gz", + "https://github.com/GoogleCloudPlatform/tensorflow-gcp-tools/archive/2643d8caeba6ca2a6a0b46bb123953cb95b7e7d5.tar.gz", + ], + ) + tf_http_archive( name = "com_google_googleapis", build_file = clean_dep("//third_party/googleapis:googleapis.BUILD"), From 2f45ee867dffb4dc8accb523d86663b7c9184635 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 17:47:21 -0700 Subject: [PATCH 0243/1390] Add decomposition for ResourceApplyCenteredRMSProp PiperOrigin-RevId: 316582464 Change-Id: Id61fb84cf6ce26398fcd870f331b51e0af8b9c3d --- .../mlir/tensorflow/ir/tf_generated_ops.td | 43 ++++++++++++ .../tests/decompose_resource_ops.mlir | 50 ++++++++++++++ .../transforms/decompose_resource_ops.td | 66 +++++++++++++++++++ 3 files changed, 159 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index df8dccb2163..6131a729441 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -7280,6 +7280,49 @@ $$\text{variable} := \text{variable} - \text{lr}_t * m_t / (\sqrt{v_t} + \epsilo TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; } +def TF_ResourceApplyCenteredRMSPropOp : TF_Op<"ResourceApplyCenteredRMSProp", []> { + let summary = "Update '*var' according to the centered RMSProp algorithm."; + + let description = [{ +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. 
+ +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + }]; + + let arguments = (ins + TF_ResourceTensor:$var, + TF_ResourceTensor:$mg, + TF_ResourceTensor:$ms, + TF_ResourceTensor:$mom, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<4>; +} + def TF_ResourceApplyGradientDescentOp : TF_Op<"ResourceApplyGradientDescent", []> { let summary = "Update '*var' by subtracting 'alpha' * 'delta' from it."; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index 7a2e5173247..25dfda25358 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -368,6 +368,56 @@ func @decompose_resource_gather_op(%indices : tensor<5xi32>) -> tensor<2x5x16xi3 // ----- +// Tests that composite tf.ResourceApplyCenteredRMSProp operation is decomposed. 
+ +// CHECK-LABEL: func @decompose_resource_apply_centered_RMS_prop +// CHECK-SAME: [[VAR:%.*]]: tensor, [[MG:%.*]]: tensor, [[MS:%.*]]: tensor, [[MOM:%.*]]: tensor, [[LR:%.*]]: tensor, [[RHO:%.*]]: tensor, [[MOMENTUM:%.*]]: tensor, [[EPSILON:%.*]]: tensor, [[GRAD:%.*]]: tensor +func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> () { + // CHECK: [[ONE:%.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: [[VAR_HANDLE:%.*]] = "tf.VarHandleOp" + // CHECK: [[MG_HANDLE:%.*]] = "tf.VarHandleOp" + // CHECK: [[MS_HANDLE:%.*]] = "tf.VarHandleOp" + // CHECK: [[MOM_HANDLE:%.*]] = "tf.VarHandleOp" + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %2 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %3 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + + // CHECK: [[GRADSQ:%.*]] = "tf.Mul"([[GRAD]], [[GRAD]]) + // CHECK: [[SB:%.*]] = "tf.Sub"([[ONE]], [[RHO]]) + // CHECK: [[GRAD_SUB:%.*]] = "tf.Mul"([[GRADSQ]], [[SB]]) + // CHECK: [[MS:%.*]] = "tf.ReadVariableOp"([[MS_HANDLE]]) + // CHECK: [[MS_RHO:%.*]] = "tf.Mul"([[MS]], [[RHO]]) + // CHECK: [[MS_NEW:%.*]] = "tf.Add"([[GRAD_SUB]], [[MS_RHO]]) + // CHECK: "tf.AssignVariableOp"([[MS_HANDLE]], [[MS_NEW]]) + + // CHECK: [[SUB_RHO:%.*]] = "tf.Sub"([[ONE]], [[RHO]]) + // CHECK: [[SUB_GRAD:%.*]] = "tf.Mul"([[GRAD]], [[SUB_RHO]]) + // CHECK: [[MG:%.*]] = "tf.ReadVariableOp"([[MG_HANDLE]]) + // CHECK: [[MG_RHO:%.*]] = "tf.Mul"([[MG]], [[RHO]]) + // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[SUB_GRAD]], [[MG_RHO]]) + // CHECK: "tf.AssignVariableOp"([[MG_HANDLE]], [[MG_NEW]]) + + // CHECK: [[MOM:%.*]] = "tf.ReadVariableOp"([[MOM_HANDLE]]) + // CHECK: [[MOM_MOM:%.*]] = "tf.Mul"([[MOMENTUM]], [[MOM]]) + // CHECK: [[LR_GRAD:%.*]] = "tf.Mul"([[LR]], [[GRAD]]) + + // CHECK: [[MG_MG:%.*]] = "tf.Mul"([[MG_NEW]], [[MG_NEW]]) + // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[MG_MG]], [[EPSILON]]) + // CHECK: [[MG_SUB:%.*]] = "tf.Sub"([[MS_NEW]], [[MG_NEW]]) + // CHECK: [[MG_SQRT:%.*]] = "tf.Sqrt"([[MG_SUB]]) + // CHECK: [[MOM_DIV:%.*]] = "tf.Div"([[LR_GRAD]], [[MG_SQRT]]) + // CHECK: [[MOM_NEW:%.*]] = "tf.Add"([[MOM_MOM]], [[MOM_DIV]]) + + // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) + // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[MOM_NEW]]) + + "tf.ResourceApplyCenteredRMSProp"(%0, %1, %2, %3, %arg4, %arg5, %arg6, %arg7, %arg8) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () + return +} + +// ----- + // Tests that composite tf.ResourceScatterUpdate operation is decomposed. 
// CHECK-LABEL: @decompose_resource_scatter_update_op diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index 3869a1a7fa3..0dd7d778e31 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -327,3 +327,69 @@ def DecomposeVariableShape : Pat< (TF_VariableShapeOp:$src_op $resource), (TF_ShapeOp (CreateTFReadVariableOpFromResourceHandle $src_op, $resource)), [(CheckHasResourceSubtype $resource)]>; + +// This decomposition is only correct inside XLA as it ignores use_locking +// attribute. +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mg = grad * (one - rho) + mg * rho; +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +// +def DecomposeResourceApplyCenteredRMSProp : + Pattern< + (TF_ResourceApplyCenteredRMSPropOp:$src_op + $var_resource, $mg_resource, $ms_resource, $mom_resource, $lr, $rho, $momentum, $epsilon, + $grad, ConstBoolAttrFalse:$use_locking + ), + [(TF_ConstOp:$one (GetScalarOfType<1> $grad)), + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + (TF_AddOp:$ms_new + (TF_MulOp + (TF_MulOp $grad, $grad), + (TF_SubOp $one, $rho) + ), + (TF_MulOp + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + $rho + ) + ), + (TF_AssignVariableOp $ms_resource, $ms_new), + // mg = grad * (one - rho) + mg * rho; + (TF_AddOp:$mg_new + (TF_MulOp + $grad, + (TF_SubOp $one, $rho) + ), + (TF_MulOp + (CreateTFReadVariableOp $src_op, $grad, $mg_resource), + $rho + ) + ), + (TF_AssignVariableOp $mg_resource, $mg_new), + // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) + (TF_AddOp:$mom_new + (TF_MulOp $momentum, + (CreateTFReadVariableOp $src_op, $grad, $mom_resource)), + (TF_DivOp + (TF_MulOp $lr, $grad), + (TF_SqrtOp + (TF_SubOp + $ms_new, + (TF_AddOp + (TF_MulOp + $mg_new, + $mg_new + ), + $epsilon + ) + ) + ) + ) + ), + (TF_AssignVariableOp $mom_resource, $mom_new), + // var <- var - mom + (TF_AssignSubVariableOp $var_resource, + (TF_SubOp (CreateTFReadVariableOp $src_op, $grad, $var_resource), + $mom_new) + ) + ] + >; From fe6580b4d85b12a2ce7b1a529b70fdbfeedd899e Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 16 Jun 2020 02:16:09 +0100 Subject: [PATCH 0244/1390] Add comment about ignoring distributed multi assignment --- .../keras/mixed_precision/experimental/autocast_variable_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 95957f5634e..14f26cdf953 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -311,6 +311,7 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllClose(3., self.evaluate(x.assign_sub(3.))) # Assign multiple times + # This currently only works if no strategy is used if not ds_context.has_strategy(): assign = x.assign(1.) 
self.assertAllClose(1., self.evaluate(assign)) From da2046b8a3b1bb79c77bf258aa8a52887bc3703a Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 16 Jun 2020 02:25:18 +0100 Subject: [PATCH 0245/1390] Use default_strategy instead of dummy scope --- .../experimental/autocast_variable_test.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 14f26cdf953..c45015b644e 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib import os from absl.testing import parameterized @@ -43,15 +42,9 @@ from tensorflow.python.platform import test from tensorflow.python.training import gradient_descent as gradient_descent_v1 from tensorflow.python.training.tracking import util as trackable_utils -class DummyStrategy(object): - @contextlib.contextmanager - def scope(self): - yield - maybe_distribute = combinations.combine( distribution=[ - combinations.NamedDistribution( - "Dummy", lambda: DummyStrategy(), required_gpus=None), + strategy_combinations.default_strategy, strategy_combinations.mirrored_strategy_with_cpu_1_and_2 ]) From 0292d384f84f55a78312ae0132f700caec245315 Mon Sep 17 00:00:00 2001 From: Liam Miller-Cushon Date: Mon, 15 Jun 2020 18:23:12 -0700 Subject: [PATCH 0246/1390] Suppress a deprecation warning for Object.finalize The API is deprecated starting in JDK 9: https://bugs.openjdk.java.net/browse/JDK-8165641 PiperOrigin-RevId: 316587198 Change-Id: Icafed5879fd50973e955522a176aa11a8773090b --- .../java/src/main/java/org/tensorflow/lite/Interpreter.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 0f8b7b5c2f2..7c9c5644f47 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -507,6 +507,8 @@ public final class Interpreter implements AutoCloseable { } } + // for Object.finalize, see https://bugs.openjdk.java.net/browse/JDK-8165641 + @SuppressWarnings("deprecation") @Override protected void finalize() throws Throwable { try { From 2a96849f478627ad03b8f20f8c71e85d6d7480e3 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 15 Jun 2020 18:38:58 -0700 Subject: [PATCH 0247/1390] Update source files with used includes. 
PiperOrigin-RevId: 316589177 Change-Id: I0aba0ed1cf9ff478e7890fa53a7749bf844bd26d --- tensorflow/lite/kernels/BUILD | 406 +++++++++--------- .../lite/kernels/acceleration_test_util.cc | 10 +- .../lite/kernels/acceleration_test_util.h | 2 +- .../acceleration_test_util_internal.cc | 8 + .../kernels/acceleration_test_util_internal.h | 4 +- .../acceleration_test_util_internal_test.cc | 2 + tensorflow/lite/kernels/activations.cc | 12 +- tensorflow/lite/kernels/activations_test.cc | 18 +- tensorflow/lite/kernels/add.cc | 12 +- tensorflow/lite/kernels/add_n.cc | 2 + tensorflow/lite/kernels/add_n_test.cc | 7 +- tensorflow/lite/kernels/add_test.cc | 10 +- tensorflow/lite/kernels/arg_min_max.cc | 8 +- tensorflow/lite/kernels/arg_min_max_test.cc | 11 +- tensorflow/lite/kernels/assign_variable.cc | 6 +- tensorflow/lite/kernels/audio_spectrogram.cc | 9 +- .../lite/kernels/audio_spectrogram_test.cc | 5 +- tensorflow/lite/kernels/basic_rnn_test.cc | 7 +- tensorflow/lite/kernels/batch_matmul.cc | 6 + tensorflow/lite/kernels/batch_matmul_test.cc | 12 +- tensorflow/lite/kernels/batch_to_space_nd.cc | 8 +- .../lite/kernels/batch_to_space_nd_test.cc | 9 +- .../kernels/bidirectional_sequence_lstm.cc | 3 + .../bidirectional_sequence_lstm_test.cc | 9 +- .../kernels/bidirectional_sequence_rnn.cc | 1 + .../bidirectional_sequence_rnn_test.cc | 11 +- tensorflow/lite/kernels/cast.cc | 6 +- tensorflow/lite/kernels/cast_test.cc | 11 +- tensorflow/lite/kernels/ceil.cc | 1 + tensorflow/lite/kernels/ceil_test.cc | 8 +- tensorflow/lite/kernels/comparisons.cc | 9 + tensorflow/lite/kernels/comparisons_test.cc | 13 +- tensorflow/lite/kernels/concatenation.cc | 13 +- tensorflow/lite/kernels/concatenation_test.cc | 13 +- tensorflow/lite/kernels/conv.cc | 13 +- tensorflow/lite/kernels/conv_test.cc | 15 +- .../lite/kernels/cpu_backend_context.cc | 4 + tensorflow/lite/kernels/cpu_backend_context.h | 1 + tensorflow/lite/kernels/cpu_backend_gemm.h | 1 + .../kernels/cpu_backend_gemm_custom_gemv.h | 5 + .../lite/kernels/cpu_backend_gemm_gemmlowp.h | 4 +- .../lite/kernels/cpu_backend_gemm_ruy.h | 2 +- .../lite/kernels/cpu_backend_gemm_test.cc | 9 +- .../kernels/cpu_backend_threadpool_test.cc | 2 + tensorflow/lite/kernels/custom_ops_register.h | 2 +- tensorflow/lite/kernels/densify.cc | 5 +- tensorflow/lite/kernels/densify_test.cc | 8 +- tensorflow/lite/kernels/depth_to_space.cc | 5 +- .../lite/kernels/depth_to_space_test.cc | 11 +- tensorflow/lite/kernels/depthwise_conv.cc | 15 +- .../kernels/depthwise_conv_hybrid_test.cc | 15 +- .../lite/kernels/depthwise_conv_test.cc | 14 +- tensorflow/lite/kernels/dequantize.cc | 8 +- tensorflow/lite/kernels/dequantize.h | 7 +- tensorflow/lite/kernels/dequantize_test.cc | 9 +- .../lite/kernels/detection_postprocess.cc | 10 +- .../kernels/detection_postprocess_test.cc | 9 +- tensorflow/lite/kernels/div.cc | 9 +- tensorflow/lite/kernels/div_test.cc | 10 +- tensorflow/lite/kernels/eigen_support.cc | 3 + tensorflow/lite/kernels/eigen_support_test.cc | 3 +- tensorflow/lite/kernels/elementwise.cc | 4 + tensorflow/lite/kernels/elementwise_test.cc | 8 +- tensorflow/lite/kernels/embedding_lookup.cc | 12 +- .../lite/kernels/embedding_lookup_sparse.cc | 3 +- .../kernels/embedding_lookup_sparse_test.cc | 7 +- .../lite/kernels/embedding_lookup_test.cc | 9 +- tensorflow/lite/kernels/exp.cc | 7 +- tensorflow/lite/kernels/exp_test.cc | 11 +- tensorflow/lite/kernels/expand_dims.cc | 6 +- tensorflow/lite/kernels/expand_dims_test.cc | 14 +- tensorflow/lite/kernels/fake_quant.cc | 7 +- 
tensorflow/lite/kernels/fake_quant_test.cc | 9 +- tensorflow/lite/kernels/fill.cc | 4 +- tensorflow/lite/kernels/fill_test.cc | 12 +- tensorflow/lite/kernels/floor.cc | 1 + tensorflow/lite/kernels/floor_div.cc | 9 +- tensorflow/lite/kernels/floor_div_test.cc | 9 +- tensorflow/lite/kernels/floor_mod.cc | 7 +- tensorflow/lite/kernels/floor_mod_test.cc | 9 +- tensorflow/lite/kernels/floor_test.cc | 8 +- tensorflow/lite/kernels/fully_connected.cc | 1 - .../lite/kernels/fully_connected_test.cc | 15 +- tensorflow/lite/kernels/gather.cc | 6 +- tensorflow/lite/kernels/gather_nd.cc | 6 +- tensorflow/lite/kernels/gather_nd_test.cc | 13 +- tensorflow/lite/kernels/gather_test.cc | 14 +- tensorflow/lite/kernels/hashtable/BUILD | 2 +- tensorflow/lite/kernels/hashtable_lookup.cc | 9 +- .../lite/kernels/hashtable_lookup_test.cc | 11 +- tensorflow/lite/kernels/if.cc | 4 + tensorflow/lite/kernels/if_test.cc | 9 +- .../kernels/internal/optimized/im2col_utils.h | 2 + .../internal/optimized/integer_ops/add.h | 5 + .../internal/optimized/integer_ops/conv.h | 1 + .../optimized/integer_ops/depthwise_conv.h | 7 + .../optimized/integer_ops/fully_connected.h | 4 +- .../internal/optimized/integer_ops/mul.h | 5 + .../internal/optimized/integer_ops/pooling.h | 13 +- .../internal/per_channel_dequantize_test.cc | 1 + .../kernels/internal/reference/batch_matmul.h | 5 +- .../lite/kernels/internal/reference/densify.h | 2 + .../internal/reference/non_max_suppression.h | 2 +- .../kernels/internal/reference/quantize.h | 4 + .../internal/reference/strided_slice.h | 2 + .../lite/kernels/internal/reference/sub.h | 8 +- .../lite/kernels/internal/reference/svdf.h | 3 + tensorflow/lite/kernels/kernel_util.cc | 7 +- tensorflow/lite/kernels/kernel_util.h | 4 +- tensorflow/lite/kernels/kernel_util_test.cc | 10 + tensorflow/lite/kernels/l2norm.cc | 5 +- tensorflow/lite/kernels/l2norm_test.cc | 11 +- .../lite/kernels/local_response_norm.cc | 3 +- .../lite/kernels/local_response_norm_test.cc | 9 +- tensorflow/lite/kernels/log_softmax_test.cc | 8 +- tensorflow/lite/kernels/logical.cc | 5 +- tensorflow/lite/kernels/logical_test.cc | 9 +- tensorflow/lite/kernels/lsh_projection.cc | 10 +- .../lite/kernels/lsh_projection_test.cc | 7 +- tensorflow/lite/kernels/lstm_eval.cc | 5 + tensorflow/lite/kernels/lstm_eval.h | 1 - tensorflow/lite/kernels/lstm_eval_test.cc | 12 +- tensorflow/lite/kernels/lstm_test.cc | 8 +- tensorflow/lite/kernels/matrix_diag.cc | 7 +- tensorflow/lite/kernels/matrix_diag_test.cc | 10 +- tensorflow/lite/kernels/matrix_set_diag.cc | 7 +- .../lite/kernels/matrix_set_diag_test.cc | 10 +- tensorflow/lite/kernels/maximum_minimum.cc | 10 +- .../lite/kernels/maximum_minimum_test.cc | 11 +- tensorflow/lite/kernels/mfcc.cc | 9 +- tensorflow/lite/kernels/mfcc_test.cc | 6 +- tensorflow/lite/kernels/mirror_pad.cc | 6 +- tensorflow/lite/kernels/mirror_pad_test.cc | 7 +- tensorflow/lite/kernels/mul.cc | 10 +- tensorflow/lite/kernels/mul_test.cc | 11 +- tensorflow/lite/kernels/neg.cc | 3 + tensorflow/lite/kernels/neg_test.cc | 11 +- .../lite/kernels/non_max_suppression.cc | 8 +- .../lite/kernels/non_max_suppression_test.cc | 9 +- tensorflow/lite/kernels/numeric_verify.cc | 9 +- .../lite/kernels/numeric_verify_test.cc | 6 +- tensorflow/lite/kernels/one_hot.cc | 4 +- tensorflow/lite/kernels/one_hot_test.cc | 7 +- tensorflow/lite/kernels/op_macros.h | 2 - .../lite/kernels/optional_tensor_test.cc | 10 +- tensorflow/lite/kernels/pack.cc | 4 + tensorflow/lite/kernels/pack_test.cc | 11 +- tensorflow/lite/kernels/pad.cc | 11 +- 
tensorflow/lite/kernels/pad_test.cc | 8 +- tensorflow/lite/kernels/pooling.cc | 13 +- tensorflow/lite/kernels/pooling_test.cc | 11 +- tensorflow/lite/kernels/pow.cc | 5 +- tensorflow/lite/kernels/pow_test.cc | 10 +- .../lite/kernels/quant_basic_lstm_test.cc | 7 +- tensorflow/lite/kernels/quantize.cc | 1 + tensorflow/lite/kernels/quantize_test.cc | 7 +- tensorflow/lite/kernels/range.cc | 8 +- tensorflow/lite/kernels/range_test.cc | 8 +- tensorflow/lite/kernels/rank.cc | 5 +- tensorflow/lite/kernels/rank_test.cc | 8 +- tensorflow/lite/kernels/read_variable.cc | 5 +- tensorflow/lite/kernels/reduce.cc | 9 +- tensorflow/lite/kernels/reduce_test.cc | 10 +- tensorflow/lite/kernels/register.cc | 2 + tensorflow/lite/kernels/register.h | 1 - tensorflow/lite/kernels/register_ref.cc | 4 + tensorflow/lite/kernels/register_ref.h | 2 +- tensorflow/lite/kernels/reshape.cc | 2 +- tensorflow/lite/kernels/reshape_test.cc | 7 +- tensorflow/lite/kernels/reshape_test_common.h | 8 + tensorflow/lite/kernels/resize_bilinear.cc | 7 +- .../lite/kernels/resize_bilinear_test.cc | 10 +- .../lite/kernels/resize_nearest_neighbor.cc | 9 +- .../kernels/resize_nearest_neighbor_test.cc | 10 +- tensorflow/lite/kernels/reverse.cc | 4 +- tensorflow/lite/kernels/reverse_sequence.cc | 3 + .../lite/kernels/reverse_sequence_test.cc | 9 +- tensorflow/lite/kernels/reverse_test.cc | 9 +- tensorflow/lite/kernels/rfft2d.cc | 12 +- tensorflow/lite/kernels/rfft2d_test.cc | 8 +- tensorflow/lite/kernels/round.cc | 3 + tensorflow/lite/kernels/round_test.cc | 7 +- tensorflow/lite/kernels/scatter_nd.cc | 7 +- tensorflow/lite/kernels/scatter_nd_test.cc | 12 +- tensorflow/lite/kernels/segment_sum.cc | 4 +- tensorflow/lite/kernels/segment_sum_test.cc | 8 +- tensorflow/lite/kernels/select.cc | 6 +- tensorflow/lite/kernels/select_test.cc | 10 +- tensorflow/lite/kernels/shape.cc | 5 +- tensorflow/lite/kernels/shape_test.cc | 7 +- tensorflow/lite/kernels/skip_gram.cc | 2 - tensorflow/lite/kernels/skip_gram_test.cc | 7 +- tensorflow/lite/kernels/slice.cc | 12 +- tensorflow/lite/kernels/slice_test.cc | 11 +- tensorflow/lite/kernels/softmax_test.cc | 11 +- tensorflow/lite/kernels/space_to_batch_nd.cc | 9 +- .../lite/kernels/space_to_batch_nd_test.cc | 10 +- tensorflow/lite/kernels/space_to_depth.cc | 5 +- .../lite/kernels/space_to_depth_test.cc | 10 +- tensorflow/lite/kernels/sparse_to_dense.cc | 12 +- .../lite/kernels/sparse_to_dense_test.cc | 11 +- tensorflow/lite/kernels/split.cc | 7 +- tensorflow/lite/kernels/split_test.cc | 12 +- tensorflow/lite/kernels/split_v.cc | 5 +- tensorflow/lite/kernels/split_v_test.cc | 11 +- tensorflow/lite/kernels/squared_difference.cc | 8 +- .../lite/kernels/squared_difference_test.cc | 10 +- tensorflow/lite/kernels/squeeze.cc | 3 - tensorflow/lite/kernels/squeeze_test.cc | 11 +- tensorflow/lite/kernels/strided_slice.cc | 8 +- tensorflow/lite/kernels/strided_slice_test.cc | 10 +- tensorflow/lite/kernels/sub.cc | 11 +- tensorflow/lite/kernels/sub_test.cc | 10 +- tensorflow/lite/kernels/subgraph_test_util.cc | 13 +- tensorflow/lite/kernels/subgraph_test_util.h | 5 + .../lite/kernels/subgraph_test_util_test.cc | 8 +- tensorflow/lite/kernels/svdf.cc | 1 + tensorflow/lite/kernels/svdf_test.cc | 10 +- tensorflow/lite/kernels/test_util.cc | 23 +- tensorflow/lite/kernels/test_util.h | 24 +- tensorflow/lite/kernels/test_util_test.cc | 6 + tensorflow/lite/kernels/tile.cc | 11 +- tensorflow/lite/kernels/tile_test.cc | 13 +- tensorflow/lite/kernels/topk_v2.cc | 11 +- tensorflow/lite/kernels/topk_v2_test.cc | 12 +- 
tensorflow/lite/kernels/transpose.cc | 9 +- tensorflow/lite/kernels/transpose_conv.cc | 13 +- .../lite/kernels/transpose_conv_test.cc | 16 +- tensorflow/lite/kernels/transpose_test.cc | 13 +- .../kernels/unidirectional_sequence_lstm.cc | 2 + .../unidirectional_sequence_lstm_test.cc | 6 +- .../unidirectional_sequence_rnn_test.cc | 7 +- tensorflow/lite/kernels/unique.cc | 6 + tensorflow/lite/kernels/unique_test.cc | 9 +- tensorflow/lite/kernels/unpack.cc | 4 + tensorflow/lite/kernels/unpack_test.cc | 12 +- tensorflow/lite/kernels/variable_ops_test.cc | 4 +- tensorflow/lite/kernels/where.cc | 2 + tensorflow/lite/kernels/where_test.cc | 8 +- tensorflow/lite/kernels/while.cc | 3 + tensorflow/lite/kernels/while_test.cc | 10 +- tensorflow/lite/kernels/zeros_like.cc | 4 + tensorflow/lite/kernels/zeros_like_test.cc | 9 +- .../lite/micro/kernels/cmsis-nn/pooling.cc | 1 + 244 files changed, 1457 insertions(+), 809 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index aad79ffbc89..b16a85c65d8 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -115,10 +115,11 @@ cc_test( size = "small", srcs = ["optional_tensor_test.cc"], deps = [ - ":builtin_ops", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -158,7 +159,6 @@ cc_test( ], deps = [ ":acceleration_test_util_internal", - "@com_google_absl//absl/types:optional", "@com_google_googletest//:gtest_main", ], ) @@ -175,8 +175,11 @@ cc_library( "//tensorflow/lite:framework", "//tensorflow/lite:minimal_logging", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite:string", "//tensorflow/lite:string_util", + "//tensorflow/lite:type_to_tflitetype", "//tensorflow/lite/c:common", + "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/nnapi:acceleration_test_util", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/kernels/internal:tensor_utils", @@ -187,6 +190,7 @@ cc_library( "//tensorflow/lite/tools/optimize/sparsity:format_converter", "//tensorflow/lite/tools/versioning", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -217,6 +221,7 @@ cc_library( "//tensorflow/lite:arena_planner", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:optimized", + "//third_party/eigen3", ], ) @@ -226,7 +231,9 @@ cc_test( srcs = ["eigen_support_test.cc"], deps = [ ":eigen_support", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:optimized", + "//third_party/eigen3", "@com_google_googletest//:gtest", ], ) @@ -291,6 +298,7 @@ cc_library( # gemmlowp_context_ and ruy_context_ members. 
"@ruy//ruy:context", "@gemmlowp", + "//tensorflow/lite/c:common", "//tensorflow/lite:external_cpu_backend_context", ], ) @@ -333,17 +341,18 @@ cc_library( "cpu_backend_gemm_eigen.cc", "cpu_backend_gemm_eigen.h", "cpu_backend_gemm_gemmlowp.h", - "cpu_backend_gemm_ruy.h", ], hdrs = [ "cpu_backend_gemm.h", "cpu_backend_gemm_params.h", + "cpu_backend_gemm_ruy.h", ], copts = tflite_copts(), deps = [ ":tflite_with_ruy_only", "//tensorflow/lite/kernels/internal:common", "//tensorflow/lite/kernels/internal:compatibility", + "//tensorflow/lite/kernels/internal:cpu_check", "//tensorflow/lite/kernels/internal:types", ":cpu_backend_context", ":cpu_backend_threadpool", @@ -369,6 +378,7 @@ cc_test( ":cpu_backend_context", ":cpu_backend_gemm", "@com_google_googletest//:gtest", + "@ruy//ruy:matrix", # ruy:reference_mul provides the reference implementation # that this test compares against. "@ruy//ruy:reference_mul", @@ -382,6 +392,7 @@ cc_library( ], build_for_embedded = True, copts = tflite_copts(), + deps = ["//tensorflow/lite/micro:debug_log"], ) cc_library( @@ -408,6 +419,7 @@ cc_test( srcs = ["kernel_util_test.cc"], deps = [ ":kernel_util", + "//tensorflow/lite/c:common", "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest", ], @@ -569,7 +581,12 @@ cc_library( ], copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, visibility = ["//visibility:private"], - deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"], + deps = BUILTIN_KERNEL_DEPS + [ + "@ruy//ruy/profiler:instrumentation", + "//tensorflow/lite/kernels/internal:cppmath", + "//tensorflow/lite:string", + "@farmhash_archive//:farmhash", + ], ) # Creates a target where Ruy is unconditionally enabled along with caching @@ -583,7 +600,12 @@ cc_library( ], copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, visibility = ["//visibility:private"], - deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"] + [":tflite_with_ruy_only_and_caching_enabled"], + deps = BUILTIN_KERNEL_DEPS + [ + "@ruy//ruy/profiler:instrumentation", + "//tensorflow/lite/kernels/internal:cppmath", + "//tensorflow/lite:string", + "@farmhash_archive//:farmhash", + ] + [":tflite_with_ruy_only_and_caching_enabled"], ) cc_library( @@ -611,7 +633,6 @@ cc_test( ], deps = [ ":test_main", - ":test_util", ":variable_op_kernels", # buildcleaner: keep "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:tensor", @@ -632,6 +653,7 @@ cc_library( "//tensorflow/lite/kernels/hashtable:hashtable_op_kernels", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/kernels/internal:types", "//third_party/fft2d:fft2d_headers", "@fft2d", "@ruy//ruy/profiler:instrumentation", @@ -681,6 +703,7 @@ cc_library( ":builtin_op_kernels", "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", ], alwayslink = 1, ) @@ -697,6 +720,7 @@ cc_library( ":builtin_op_kernels", "//tensorflow/lite:framework_lib", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", ], ) @@ -713,6 +737,7 @@ cc_library( ":builtin_op_kernels_ruy_and_caching", "//tensorflow/lite:framework_lib", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", ], ) @@ -728,6 +753,7 @@ cc_library( "//tensorflow/lite:framework", "//tensorflow/lite:util", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", ], ) @@ -736,10 +762,10 @@ cc_test( size = "small", srcs = ["audio_spectrogram_test.cc"], deps = [ - 
":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -750,10 +776,10 @@ cc_test( size = "small", srcs = ["mfcc_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -764,10 +790,10 @@ cc_test( size = "small", srcs = ["detection_postprocess_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -782,12 +808,15 @@ cc_test( "tflite_xnnpack", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/core/api", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -800,11 +829,11 @@ cc_test( "tflite_xnnpack", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -813,11 +842,11 @@ cc_test( size = "small", srcs = ["add_n_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -827,11 +856,11 @@ cc_test( srcs = ["arg_min_max_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -841,11 +870,11 @@ cc_test( srcs = ["div_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -855,11 +884,11 @@ cc_test( srcs = ["sub_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -869,13 +898,15 @@ cc_test( srcs = ["transpose_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/kernels/internal:reference", "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -884,11 +915,11 @@ cc_test( size = "small", srcs = ["space_to_batch_nd_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -898,11 +929,12 @@ cc_test( srcs = ["batch_to_space_nd_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -911,11 +943,11 @@ cc_test( size = "small", srcs = ["batch_matmul_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", 
"@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -925,11 +957,11 @@ cc_test( srcs = ["cast_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -939,11 +971,11 @@ cc_test( srcs = ["concatenation_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -953,10 +985,11 @@ cc_test( srcs = ["conv_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], @@ -968,15 +1001,11 @@ cc_test( srcs = ["densify_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:types", "//tensorflow/lite/schema:schema_fbs", - "//tensorflow/lite/tools/optimize/sparsity:format_converter", - "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], @@ -987,13 +1016,16 @@ cc_test( size = "small", srcs = ["depthwise_conv_hybrid_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:test_util", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1003,13 +1035,16 @@ cc_test( srcs = ["depthwise_conv_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:test_util", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1019,14 +1054,16 @@ cc_test( srcs = ["dequantize_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1036,11 +1073,11 @@ cc_test( srcs = ["numeric_verify_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", @@ -1053,11 +1090,11 @@ cc_test( srcs = ["basic_rnn_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1067,12 +1104,11 @@ cc_test( srcs = ["bidirectional_sequence_lstm_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1082,10 +1118,9 @@ cc_test( srcs = 
["floor_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1098,10 +1133,9 @@ cc_test( "tflite_not_portable_ios", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1114,10 +1148,9 @@ cc_test( "tflite_not_portable_ios", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1128,10 +1161,9 @@ cc_test( srcs = ["elementwise_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1142,11 +1174,11 @@ cc_test( srcs = ["unidirectional_sequence_lstm_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1155,11 +1187,11 @@ cc_test( size = "small", srcs = ["bidirectional_sequence_rnn_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1169,11 +1201,11 @@ cc_test( srcs = ["unidirectional_sequence_rnn_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1184,11 +1216,11 @@ cc_test( # TODO(b/143912164): Enable NNAPI test when fix nnapi. 
# tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1198,11 +1230,11 @@ cc_test( srcs = ["exp_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1211,11 +1243,11 @@ cc_test( size = "small", srcs = ["fake_quant_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1225,10 +1257,9 @@ cc_test( srcs = ["maximum_minimum_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1239,11 +1270,11 @@ cc_test( srcs = ["reduce_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1256,11 +1287,11 @@ cc_test( "tflite_xnnpack", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1270,11 +1301,12 @@ cc_test( srcs = ["pad_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1286,6 +1318,8 @@ cc_library( ], deps = [ ":test_util", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", ], ) @@ -1295,11 +1329,10 @@ cc_test( srcs = ["reshape_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":reshape_test_common", ":test_main", - ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", "@com_google_googletest//:gtest", ], ) @@ -1310,12 +1343,13 @@ cc_test( srcs = ["gather_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1324,12 +1358,13 @@ cc_test( size = "small", srcs = ["gather_nd_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1338,12 +1373,12 @@ cc_test( size = "small", srcs = ["scatter_nd_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1353,11 +1388,10 @@ cc_test( srcs = ["topk_v2_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1368,11 +1402,11 @@ cc_test( srcs = ["resize_bilinear_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", 
":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1382,11 +1416,11 @@ cc_test( srcs = ["resize_nearest_neighbor_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1396,11 +1430,11 @@ cc_test( srcs = ["svdf_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1410,11 +1444,11 @@ cc_test( srcs = ["embedding_lookup_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1425,12 +1459,13 @@ cc_test( srcs = ["embedding_lookup_sparse_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1444,9 +1479,13 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:tensor_utils", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1456,11 +1495,11 @@ cc_test( srcs = ["local_response_norm_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1473,11 +1512,11 @@ cc_test( "tflite_xnnpack", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1490,12 +1529,13 @@ cc_test( "tflite_xnnpack", ], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1505,12 +1545,13 @@ cc_test( srcs = ["log_softmax_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1520,11 +1561,11 @@ cc_test( srcs = ["lsh_projection_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1534,12 +1575,13 @@ cc_test( srcs = ["hashtable_lookup_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite:string_util", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/schema:schema_fbs", 
"@com_google_googletest//:gtest", ], ) @@ -1550,11 +1592,12 @@ cc_test( srcs = ["lstm_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1563,11 +1606,9 @@ cc_test( size = "small", srcs = ["lstm_eval_test.cc"], deps = [ - ":builtin_ops", + ":cpu_backend_context", ":lstm_eval", ":test_main", - ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", "@com_google_googletest//:gtest", ], @@ -1578,12 +1619,14 @@ cc_test( size = "small", srcs = ["skip_gram_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite:string_util", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1593,11 +1636,11 @@ cc_test( srcs = ["space_to_depth_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1607,11 +1650,11 @@ cc_test( srcs = ["depth_to_space_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1621,11 +1664,11 @@ cc_test( srcs = ["split_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1634,11 +1677,11 @@ cc_test( size = "small", srcs = ["split_v_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1648,11 +1691,11 @@ cc_test( srcs = ["squeeze_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1662,10 +1705,9 @@ cc_test( srcs = ["strided_slice_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1676,11 +1718,11 @@ cc_test( srcs = ["tile_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1693,11 +1735,12 @@ cc_test( ], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1707,11 +1750,11 @@ cc_test( srcs = ["neg_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1723,11 +1766,11 @@ cc_test( ], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + 
"//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1739,10 +1782,10 @@ cc_test( ], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1753,10 +1796,11 @@ cc_test( srcs = ["transpose_conv_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], @@ -1768,11 +1812,11 @@ cc_test( srcs = ["expand_dims_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1782,11 +1826,10 @@ cc_test( size = "small", srcs = ["sparse_to_dense_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1796,11 +1839,11 @@ cc_test( size = "small", srcs = ["shape_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1810,12 +1853,13 @@ cc_test( size = "small", srcs = ["rank_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1825,12 +1869,11 @@ cc_test( srcs = ["pow_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:test_util", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1840,11 +1883,10 @@ cc_test( size = "small", srcs = ["pack_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1854,10 +1896,10 @@ cc_test( size = "small", srcs = ["one_hot_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1868,12 +1910,12 @@ cc_test( srcs = ["logical_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1882,11 +1924,9 @@ cc_test( size = "small", srcs = ["unpack_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1896,11 +1936,9 @@ cc_test( size = "small", srcs = ["floor_div_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ 
-1910,12 +1948,11 @@ cc_test( size = "small", srcs = ["where_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1924,12 +1961,12 @@ cc_test( size = "small", srcs = ["zeros_like_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1938,11 +1975,9 @@ cc_test( size = "small", srcs = ["floor_mod_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1952,11 +1987,9 @@ cc_test( size = "small", srcs = ["range_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -1966,12 +1999,11 @@ cc_test( size = "small", srcs = ["squared_difference_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:builtin_op_data", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -1981,15 +2013,11 @@ cc_test( srcs = ["if_test.cc"], tags = ["tflite_not_portable_ios"], deps = [ - ":builtin_ops", ":kernel_util", ":subgraph_test_util", ":test_main", - ":test_util", - "//tensorflow/lite:builtin_op_data", "//tensorflow/lite:framework", "@com_google_googletest//:gtest", - "@flatbuffers", ], ) @@ -1999,15 +2027,10 @@ cc_test( srcs = ["while_test.cc"], tags = ["tflite_not_portable_ios"], deps = [ - ":builtin_ops", - ":kernel_util", ":subgraph_test_util", ":test_main", - ":test_util", - "//tensorflow/lite:builtin_op_data", "//tensorflow/lite:framework", "@com_google_googletest//:gtest", - "@flatbuffers", ], ) @@ -2017,10 +2040,10 @@ cc_test( srcs = ["fill_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2029,10 +2052,9 @@ cc_test( name = "unique_test", srcs = ["unique_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2042,10 +2064,9 @@ cc_test( size = "small", srcs = ["reverse_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2059,6 +2080,8 @@ cc_test( ":test_util", "//tensorflow/lite:framework", "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest", ], ) @@ -2068,11 +2091,11 @@ cc_test( size = "small", srcs = ["non_max_suppression_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -2092,10 +2115,9 @@ cc_test( name = "mirror_pad_test", srcs = ["mirror_pad_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - 
"//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2111,6 +2133,7 @@ cc_library( ":test_util", "//tensorflow/lite:builtin_op_data", "//tensorflow/lite:framework", + "//tensorflow/lite/c:common", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -2123,8 +2146,8 @@ cc_test( deps = [ ":kernel_util", ":subgraph_test_util", - ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest", ], ) @@ -2134,10 +2157,9 @@ cc_test( size = "small", srcs = ["reverse_sequence_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2147,10 +2169,10 @@ cc_test( size = "small", srcs = ["matrix_diag_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2161,12 +2183,12 @@ cc_test( srcs = ["quantize_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -2175,10 +2197,10 @@ cc_test( size = "small", srcs = ["matrix_set_diag_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) @@ -2189,12 +2211,11 @@ cc_test( srcs = ["quant_basic_lstm_test.cc"], tags = ["tflite_nnapi"], deps = [ - ":builtin_ops", - ":kernel_util", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", + "@flatbuffers", ], ) @@ -2202,10 +2223,9 @@ cc_test( name = "segment_sum_test", srcs = ["segment_sum_test.cc"], deps = [ - ":builtin_ops", ":test_main", ":test_util", - "//tensorflow/lite:framework", + "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/lite/kernels/acceleration_test_util.cc b/tensorflow/lite/kernels/acceleration_test_util.cc index 0dffd22fa26..741c34d9672 100644 --- a/tensorflow/lite/kernels/acceleration_test_util.cc +++ b/tensorflow/lite/kernels/acceleration_test_util.cc @@ -14,19 +14,11 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/acceleration_test_util.h" -#include -#include -#include -#include -#include -#include #include #include -#include +#include #include "absl/types/optional.h" -#include "tensorflow/lite/kernels/acceleration_test_util_internal.h" -#include "tensorflow/lite/minimal_logging.h" namespace tflite { diff --git a/tensorflow/lite/kernels/acceleration_test_util.h b/tensorflow/lite/kernels/acceleration_test_util.h index 75b5d79c8c4..a6a88d5f131 100644 --- a/tensorflow/lite/kernels/acceleration_test_util.h +++ b/tensorflow/lite/kernels/acceleration_test_util.h @@ -15,7 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_ACCELERATION_TEST_UTIL_H_ #define TENSORFLOW_LITE_KERNELS_ACCELERATION_TEST_UTIL_H_ -#include +#include namespace tflite { diff --git a/tensorflow/lite/kernels/acceleration_test_util_internal.cc b/tensorflow/lite/kernels/acceleration_test_util_internal.cc index f4a1f5cdc87..a6ad8234f59 100644 --- a/tensorflow/lite/kernels/acceleration_test_util_internal.cc +++ b/tensorflow/lite/kernels/acceleration_test_util_internal.cc @@ -14,6 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/acceleration_test_util_internal.h" +#include + +#include +#include +#include +#include +#include + namespace tflite { void ReadAccelerationConfig( diff --git a/tensorflow/lite/kernels/acceleration_test_util_internal.h b/tensorflow/lite/kernels/acceleration_test_util_internal.h index 8999af7e7ad..24fc2383f9e 100644 --- a/tensorflow/lite/kernels/acceleration_test_util_internal.h +++ b/tensorflow/lite/kernels/acceleration_test_util_internal.h @@ -18,14 +18,12 @@ limitations under the License. #include #include #include -#include +#include #include -#include #include #include "absl/types/optional.h" #include "re2/re2.h" -#include "tensorflow/lite/minimal_logging.h" namespace tflite { diff --git a/tensorflow/lite/kernels/acceleration_test_util_internal_test.cc b/tensorflow/lite/kernels/acceleration_test_util_internal_test.cc index 71e0c9e9912..82d21fd9332 100644 --- a/tensorflow/lite/kernels/acceleration_test_util_internal_test.cc +++ b/tensorflow/lite/kernels/acceleration_test_util_internal_test.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/acceleration_test_util_internal.h" +#include #include +#include #include #include diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 749a0d69ef9..2b2428f3f92 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -12,25 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include #include #include +#include #include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/binary_function.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" #include "tensorflow/lite/kernels/internal/reference/logistic.h" +#include "tensorflow/lite/kernels/internal/reference/prelu.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/lite/kernels/internal/reference/softmax.h" +#include "tensorflow/lite/kernels/internal/reference/tanh.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #if __aarch64__ && __clang__ #include diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc index 5a679147469..50b1c041e34 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -12,16 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include +#include + +#include +#include #include +#include +#include #include +#include +#include +#include #include #include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc index d9b8c87eeb7..279f6aa12ce 100644 --- a/tensorflow/lite/kernels/add.cc +++ b/tensorflow/lite/kernels/add.cc @@ -14,16 +14,26 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/internal/optimized/integer_ops/add.h" +#include +#include + +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/add.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/add_n.cc b/tensorflow/lite/kernels/add_n.cc index 5f6437fe331..7b4d52c5272 100644 --- a/tensorflow/lite/kernels/add_n.cc +++ b/tensorflow/lite/kernels/add_n.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" diff --git a/tensorflow/lite/kernels/add_n_test.cc b/tensorflow/lite/kernels/add_n_test.cc index ac6ccec2b66..4db646ace9c 100644 --- a/tensorflow/lite/kernels/add_n_test.cc +++ b/tensorflow/lite/kernels/add_n_test.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc index 267b80564c9..bb883dd9b05 100644 --- a/tensorflow/lite/kernels/add_test.cc +++ b/tensorflow/lite/kernels/add_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc index e99f59ba703..4a3902ac57c 100644 --- a/tensorflow/lite/kernels/arg_min_max.cc +++ b/tensorflow/lite/kernels/arg_min_max.cc @@ -12,13 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h" + +#include + +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc index d028bac3fb2..957d3473b8d 100644 --- a/tensorflow/lite/kernels/arg_min_max_test.cc +++ b/tensorflow/lite/kernels/arg_min_max_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/assign_variable.cc b/tensorflow/lite/kernels/assign_variable.cc index 41ddcdda6f7..4cb4e08e43a 100644 --- a/tensorflow/lite/kernels/assign_variable.cc +++ b/tensorflow/lite/kernels/assign_variable.cc @@ -13,17 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/subgraph.h" #include "tensorflow/lite/experimental/resource/resource_variable.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/audio_spectrogram.cc b/tensorflow/lite/kernels/audio_spectrogram.cc index 99457ea11b1..29c9eeef3d0 100644 --- a/tensorflow/lite/kernels/audio_spectrogram.cc +++ b/tensorflow/lite/kernels/audio_spectrogram.cc @@ -13,15 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + +#include + #include "flatbuffers/flexbuffers.h" // from @flatbuffers -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/spectrogram.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/audio_spectrogram_test.cc b/tensorflow/lite/kernels/audio_spectrogram_test.cc index 0f4182ea728..cdb77303748 100644 --- a/tensorflow/lite/kernels/audio_spectrogram_test.cc +++ b/tensorflow/lite/kernels/audio_spectrogram_test.cc @@ -13,16 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include #include #include #include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/basic_rnn_test.cc b/tensorflow/lite/kernels/basic_rnn_test.cc index f7cbaa5a814..2146d086c9a 100644 --- a/tensorflow/lite/kernels/basic_rnn_test.cc +++ b/tensorflow/lite/kernels/basic_rnn_test.cc @@ -14,17 +14,14 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite RNN op. -#include #include -#include #include #include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc index 9cbad101bab..d2115f96e1c 100644 --- a/tensorflow/lite/kernels/batch_matmul.cc +++ b/tensorflow/lite/kernels/batch_matmul.cc @@ -15,15 +15,21 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h" +#include + +#include #include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/batch_matmul.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/tensor_utils.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" diff --git a/tensorflow/lite/kernels/batch_matmul_test.cc b/tensorflow/lite/kernels/batch_matmul_test.cc index aec031015c0..5e52479f49b 100644 --- a/tensorflow/lite/kernels/batch_matmul_test.cc +++ b/tensorflow/lite/kernels/batch_matmul_test.cc @@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/batch_to_space_nd.cc b/tensorflow/lite/kernels/batch_to_space_nd.cc index d7d796ebec1..9d6492e0fcb 100644 --- a/tensorflow/lite/kernels/batch_to_space_nd.cc +++ b/tensorflow/lite/kernels/batch_to_space_nd.cc @@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc index cffa1036c84..e675faafd74 100644 --- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc +++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc @@ -13,11 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc index 8ccc7a68eb7..a984ff5124f 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc index c468c4c09fb..3a52de130e3 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc @@ -14,17 +14,12 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Bidirectional LSTM op. -#include -#include -#include +#include #include -#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc index 58a2ef9c1ea..abaf6df9fa8 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc index 4a7cc9a016d..870b99d7437 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc @@ -14,15 +14,18 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Bidirectional RNN op. 
-#include +#include +#include +#include +#include +#include #include #include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc index a24dadb5279..415f1270328 100644 --- a/tensorflow/lite/kernels/cast.cc +++ b/tensorflow/lite/kernels/cast.cc @@ -12,18 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - #include #include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" -#include "tensorflow/lite/string_util.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc index 8f1cb44f1c9..a615edbd085 100644 --- a/tensorflow/lite/kernels/cast_test.cc +++ b/tensorflow/lite/kernels/cast_test.cc @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/ceil.cc b/tensorflow/lite/kernels/ceil.cc index 3b1df4f6c2f..9914dbe09ce 100644 --- a/tensorflow/lite/kernels/ceil.cc +++ b/tensorflow/lite/kernels/ceil.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/ceil_test.cc b/tensorflow/lite/kernels/ceil_test.cc index 36486087fcf..bb23a5b6197 100644 --- a/tensorflow/lite/kernels/ceil_test.cc +++ b/tensorflow/lite/kernels/ceil_test.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
 ==============================================================================*/
+#include
+#include
+
+#include
 #include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
index 91dbc447c35..7d1c6b7804e 100644
--- a/tensorflow/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -12,10 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+
+#include
+
 #include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/string_util.h"
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index 986600ccd1a..f8cf6dee74c 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -12,11 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include
+
+#include
+#include
+#include
+
+#include
 #include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_type.h"
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc
index 8beb962b1b1..61748e5ce58 100644
--- a/tensorflow/lite/kernels/concatenation.cc
+++ b/tensorflow/lite/kernels/concatenation.cc
@@ -12,20 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include
-#include
-#include
-#include
-#include
-#include
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+
+#include
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index 8f4abe0bcda..4e362598aae 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include
+#include
+#include
+#include
+#include
+#include
+
+#include
 #include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index 1d610b2e068..fa6caff5baa 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -14,19 +14,17 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"
-#include
-#include
-#include
+#include
+
 #include
-#include
-#include
-#include
-#include
+#include
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/cpu_backend_context.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
 #ifndef TFLITE_WITH_RUY_ONLY
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
@@ -39,7 +37,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/kernels/padding.h"
 namespace tflite {
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index ef1d5366255..a1fd34eb1cb 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -12,15 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
 #include
 #include "absl/memory/memory.h"
 #include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_type.h"
 namespace tflite {
diff --git a/tensorflow/lite/kernels/cpu_backend_context.cc b/tensorflow/lite/kernels/cpu_backend_context.cc
index 7a16bed0ead..a99d08769ea 100644
--- a/tensorflow/lite/kernels/cpu_backend_context.cc
+++ b/tensorflow/lite/kernels/cpu_backend_context.cc
@@ -15,8 +15,12 @@ limitations under the License.
 #include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include
+
 #include "public/gemmlowp.h"
 #include "ruy/context.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/external_cpu_backend_context.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 namespace {
diff --git a/tensorflow/lite/kernels/cpu_backend_context.h b/tensorflow/lite/kernels/cpu_backend_context.h
index b4973feb56f..19ef88bf8e3 100644
--- a/tensorflow/lite/kernels/cpu_backend_context.h
+++ b/tensorflow/lite/kernels/cpu_backend_context.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "public/gemmlowp.h"
 #include "ruy/context.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/external_cpu_backend_context.h"
 namespace tflite {
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm.h b/tensorflow/lite/kernels/cpu_backend_gemm.h
index 8e324c8b515..f4d20d8970a 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm.h
+++ b/tensorflow/lite/kernels/cpu_backend_gemm.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include
+#include "ruy/profiler/instrumentation.h" // from @ruy
 #include "tensorflow/lite/kernels/cpu_backend_context.h"
 #include "tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h"
 #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
index 1c3c0ca39c4..224f8ecea41 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h
@@ -32,6 +32,9 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_
 #define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_
+#include
+
+#include
 #include
 #include
@@ -40,6 +43,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
 #include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 namespace tflite {
 namespace cpu_backend_gemm {
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h b/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h
index 1e1074523ab..77d37aac291 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_gemmlowp.h
@@ -16,13 +16,15 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_GEMMLOWP_H_ #define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_GEMMLOWP_H_ +#include + +#include "tensorflow/lite/kernels/internal/compatibility.h" #ifndef TFLITE_WITH_RUY_ONLY #include #include #include "public/gemmlowp.h" -#include "ruy/ruy.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_ruy.h" diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h b/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h index 8aaedb6a13a..07ae2ff08b7 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h +++ b/tensorflow/lite/kernels/cpu_backend_gemm_ruy.h @@ -17,10 +17,10 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_RUY_H_ #include "ruy/matrix.h" // from @ruy -#include "ruy/path.h" // from @ruy #include "ruy/ruy.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" namespace tflite { namespace cpu_backend_gemm { diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc index 20334947dde..d79d1357696 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc @@ -15,19 +15,26 @@ limitations under the License. #include "tensorflow/lite/kernels/cpu_backend_gemm.h" +#include +#include +#include + #include -#include +#include #include #include #include #include #include #include +#include #include +#include "ruy/matrix.h" // from @ruy #include "ruy/reference_mul.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +#include "tensorflow/lite/kernels/cpu_backend_gemm_ruy.h" namespace tflite { diff --git a/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc index 5089323070a..fafe4c40067 100644 --- a/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" +#include + #include #include "tensorflow/lite/kernels/cpu_backend_context.h" diff --git a/tensorflow/lite/kernels/custom_ops_register.h b/tensorflow/lite/kernels/custom_ops_register.h index ca9fac81889..3abc893243b 100644 --- a/tensorflow/lite/kernels/custom_ops_register.h +++ b/tensorflow/lite/kernels/custom_ops_register.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_CUSTOM_OPS_REGISTER_H_ #define TENSORFLOW_LITE_KERNELS_CUSTOM_OPS_REGISTER_H_ -#include "tensorflow/lite/context.h" +#include "tensorflow/lite/c/common.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/densify.cc b/tensorflow/lite/kernels/densify.cc index 0c2742d8696..cc3ac67464d 100644 --- a/tensorflow/lite/kernels/densify.cc +++ b/tensorflow/lite/kernels/densify.cc @@ -14,16 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/densify.h" -#include +#include #include -#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/densify_test.cc b/tensorflow/lite/kernels/densify_test.cc index 5cb90932069..d453606cf2e 100644 --- a/tensorflow/lite/kernels/densify_test.cc +++ b/tensorflow/lite/kernels/densify_test.cc @@ -14,18 +14,16 @@ limitations under the License. ==============================================================================*/ #include #include +#include +#include +#include #include #include "absl/memory/memory.h" -#include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/tools/optimize/sparsity/format_converter.h" namespace tflite { diff --git a/tensorflow/lite/kernels/depth_to_space.cc b/tensorflow/lite/kernels/depth_to_space.cc index d6fe8c7ab1c..8a81ea932bf 100644 --- a/tensorflow/lite/kernels/depth_to_space.cc +++ b/tensorflow/lite/kernels/depth_to_space.cc @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/depth_to_space_test.cc b/tensorflow/lite/kernels/depth_to_space_test.cc index 8d59a1ad82f..4429faf9909 100644 --- a/tensorflow/lite/kernels/depth_to_space_test.cc +++ b/tensorflow/lite/kernels/depth_to_space_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index 8500b5cd39b..1897d14a065 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -15,27 +15,28 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h" -#include -#include -#include -#include -#include -#include +#include +#include + +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h" #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/kernels/padding.h" namespace tflite { diff --git a/tensorflow/lite/kernels/depthwise_conv_hybrid_test.cc b/tensorflow/lite/kernels/depthwise_conv_hybrid_test.cc index 09cd7cec4d7..c5158eac3d0 100644 --- a/tensorflow/lite/kernels/depthwise_conv_hybrid_test.cc +++ b/tensorflow/lite/kernels/depthwise_conv_hybrid_test.cc @@ -12,18 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include + #include #include -#include +#include +#include +#include +#include +#include #include #include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/test_util.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc index 5d85eac4aa9..f410476d983 100644 --- a/tensorflow/lite/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/kernels/depthwise_conv_test.cc @@ -12,17 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include + #include #include +#include +#include +#include +#include +#include #include #include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/test_util.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { diff --git a/tensorflow/lite/kernels/dequantize.cc b/tensorflow/lite/kernels/dequantize.cc index 272662d9c48..a2a1bd495cf 100644 --- a/tensorflow/lite/kernels/dequantize.cc +++ b/tensorflow/lite/kernels/dequantize.cc @@ -14,15 +14,11 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/dequantize.h" -#include +#include -#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/dequantize.h b/tensorflow/lite/kernels/dequantize.h index 3d9e7ccb135..30739eb2c57 100644 --- a/tensorflow/lite/kernels/dequantize.h +++ b/tensorflow/lite/kernels/dequantize.h @@ -15,16 +15,17 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_DEQUANTIZE_H_ #define TENSORFLOW_LITE_KERNELS_DEQUANTIZE_H_ +#include + #include "third_party/eigen3/Eigen/Core" -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/reference/dequantize.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/kernels/internal/types.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc index f55a23e138d..da795474400 100644 --- a/tensorflow/lite/kernels/dequantize_test.cc +++ b/tensorflow/lite/kernels/dequantize_test.cc @@ -13,15 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include +#include +#include +#include +#include #include #include "absl/memory/memory.h" #include "third_party/eigen3/Eigen/Core" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc index 5d848bc9eab..c0b5b2ddf7c 100644 --- a/tensorflow/lite/kernels/detection_postprocess.cc +++ b/tensorflow/lite/kernels/detection_postprocess.cc @@ -12,19 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include +#include +#include +#include #include #include #include "flatbuffers/flexbuffers.h" // from @flatbuffers -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/detection_postprocess_test.cc b/tensorflow/lite/kernels/detection_postprocess_test.cc index 348ea45a515..cf0d3ba2f3d 100644 --- a/tensorflow/lite/kernels/detection_postprocess_test.cc +++ b/tensorflow/lite/kernels/detection_postprocess_test.cc @@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include +#include + +#include #include +#include #include #include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/div.cc b/tensorflow/lite/kernels/div.cc index 731fb3c2fe2..cdd02277ec9 100644 --- a/tensorflow/lite/kernels/div.cc +++ b/tensorflow/lite/kernels/div.cc @@ -12,15 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/div_test.cc b/tensorflow/lite/kernels/div_test.cc index e72565f84a0..b1a691d2452 100644 --- a/tensorflow/lite/kernels/div_test.cc +++ b/tensorflow/lite/kernels/div_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc index 3f71b0a20fc..c911222f26e 100644 --- a/tensorflow/lite/kernels/eigen_support.cc +++ b/tensorflow/lite/kernels/eigen_support.cc @@ -14,9 +14,12 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/eigen_support.h" +#include +#include #include #include "tensorflow/lite/arena_planner.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" #include "tensorflow/lite/kernels/op_macros.h" diff --git a/tensorflow/lite/kernels/eigen_support_test.cc b/tensorflow/lite/kernels/eigen_support_test.cc index a8c8dc0a5e4..08b58446c16 100644 --- a/tensorflow/lite/kernels/eigen_support_test.cc +++ b/tensorflow/lite/kernels/eigen_support_test.cc @@ -15,9 +15,10 @@ limitations under the License. #include "tensorflow/lite/kernels/eigen_support.h" -#include +#include #include +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" namespace tflite { diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc index 78ded8f932c..95b791be3f2 100644 --- a/tensorflow/lite/kernels/elementwise.cc +++ b/tensorflow/lite/kernels/elementwise.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc index ec00d3e071d..9495be0e590 100644 --- a/tensorflow/lite/kernels/elementwise_test.cc +++ b/tensorflow/lite/kernels/elementwise_test.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/embedding_lookup.cc b/tensorflow/lite/kernels/embedding_lookup.cc index 0334c0daf84..36e0737c7e2 100644 --- a/tensorflow/lite/kernels/embedding_lookup.cc +++ b/tensorflow/lite/kernels/embedding_lookup.cc @@ -29,19 +29,13 @@ limitations under the License. // When indices are out of bound, the ops will not succeed. // -#include -#include -#include -#include -#include -#include -#include +#include + +#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/lite/kernels/embedding_lookup_sparse.cc index 8ecf427d4c1..92574817e3b 100644 --- a/tensorflow/lite/kernels/embedding_lookup_sparse.cc +++ b/tensorflow/lite/kernels/embedding_lookup_sparse.cc @@ -62,6 +62,8 @@ limitations under the License. // // When indices are out of bound, the op will not succeed. 
+#include + #include #include @@ -70,7 +72,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc index 1714ff52e70..d9b2d523297 100644 --- a/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc +++ b/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc @@ -15,15 +15,18 @@ limitations under the License. // Unit test for TFLite sparse lookup op. #include +#include +#include +#include #include #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/embedding_lookup_test.cc b/tensorflow/lite/kernels/embedding_lookup_test.cc index 58e77afeeef..cbca3af0e8e 100644 --- a/tensorflow/lite/kernels/embedding_lookup_test.cc +++ b/tensorflow/lite/kernels/embedding_lookup_test.cc @@ -14,17 +14,20 @@ License. ==============================================================================*/ // Unit test for TFLite Lookup op. +#include + +#include #include -#include +#include #include #include #include #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor.h" -#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/exp.cc b/tensorflow/lite/kernels/exp.cc index b53d2f9f56a..764ae6ce7a3 100644 --- a/tensorflow/lite/kernels/exp.cc +++ b/tensorflow/lite/kernels/exp.cc @@ -12,16 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/exp_test.cc b/tensorflow/lite/kernels/exp_test.cc index b6f73169c4b..97cb591d353 100644 --- a/tensorflow/lite/kernels/exp_test.cc +++ b/tensorflow/lite/kernels/exp_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/expand_dims.cc b/tensorflow/lite/kernels/expand_dims.cc index 5c7bd167425..721ab3d510a 100644 --- a/tensorflow/lite/kernels/expand_dims.cc +++ b/tensorflow/lite/kernels/expand_dims.cc @@ -1,4 +1,5 @@ +#include /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,14 +16,11 @@ limitations under the License. ==============================================================================*/ #include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/expand_dims_test.cc b/tensorflow/lite/kernels/expand_dims_test.cc index 5bb1d76f00f..6d231d4cb63 100644 --- a/tensorflow/lite/kernels/expand_dims_test.cc +++ b/tensorflow/lite/kernels/expand_dims_test.cc @@ -1,4 +1,3 @@ - /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + +#include #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/fake_quant.cc b/tensorflow/lite/kernels/fake_quant.cc index 79ce8416a0a..e8ea090c7b3 100644 --- a/tensorflow/lite/kernels/fake_quant.cc +++ b/tensorflow/lite/kernels/fake_quant.cc @@ -12,16 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include - -#include - #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/fake_quant_test.cc b/tensorflow/lite/kernels/fake_quant_test.cc index 3a2dc258fe2..94c03ff1fbd 100644 --- a/tensorflow/lite/kernels/fake_quant_test.cc +++ b/tensorflow/lite/kernels/fake_quant_test.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/fill.cc b/tensorflow/lite/kernels/fill.cc index 19ff1de4939..68ec3e9eca3 100644 --- a/tensorflow/lite/kernels/fill.cc +++ b/tensorflow/lite/kernels/fill.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/c/builtin_op_data.h" +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/string_util.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc index 0717a31b9d7..4fc753ba36a 100644 --- a/tensorflow/lite/kernels/fill_test.cc +++ b/tensorflow/lite/kernels/fill_test.cc @@ -12,11 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/floor.cc b/tensorflow/lite/kernels/floor.cc index a01a3199ffc..2e341218700 100644 --- a/tensorflow/lite/kernels/floor.cc +++ b/tensorflow/lite/kernels/floor.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/floor_div.cc b/tensorflow/lite/kernels/floor_div.cc index 05b6d9cfeae..5677dc4d9b7 100644 --- a/tensorflow/lite/kernels/floor_div.cc +++ b/tensorflow/lite/kernels/floor_div.cc @@ -12,11 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + +#include + #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/reference/binary_function.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/floor_div_test.cc b/tensorflow/lite/kernels/floor_div_test.cc index b7dcb3babe2..d219f8913ce 100644 --- a/tensorflow/lite/kernels/floor_div_test.cc +++ b/tensorflow/lite/kernels/floor_div_test.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/floor_mod.cc b/tensorflow/lite/kernels/floor_mod.cc index a91ab7f07ab..a4bc9fa9841 100644 --- a/tensorflow/lite/kernels/floor_mod.cc +++ b/tensorflow/lite/kernels/floor_mod.cc @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include +#include +#include #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/reference/binary_function.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" // TODO(b/117523611): We should factor out a binary_op and put binary ops there. namespace tflite { diff --git a/tensorflow/lite/kernels/floor_mod_test.cc b/tensorflow/lite/kernels/floor_mod_test.cc index f96988a855a..33b3834c972 100644 --- a/tensorflow/lite/kernels/floor_mod_test.cc +++ b/tensorflow/lite/kernels/floor_mod_test.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc index e66158ba7ba..8602786f439 100644 --- a/tensorflow/lite/kernels/floor_test.cc +++ b/tensorflow/lite/kernels/floor_test.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index 37ccf8459bb..a1893878232 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" -#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc index 364d54b0c8e..7f02ed079bd 100644 --- a/tensorflow/lite/kernels/fully_connected_test.cc +++ b/tensorflow/lite/kernels/fully_connected_test.cc @@ -16,19 +16,28 @@ limitations under the License. #include "tensorflow/lite/kernels/fully_connected.h" +#include +#include + +#include #include -#include +#include +#include +#include #include +#include #include #include #include #include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc index b1485397291..1de49f7c486 100644 --- a/tensorflow/lite/kernels/gather.cc +++ b/tensorflow/lite/kernels/gather.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/string_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/gather_nd.cc b/tensorflow/lite/kernels/gather_nd.cc index 4ca0864b94f..fd31b8c4ddd 100644 --- a/tensorflow/lite/kernels/gather_nd.cc +++ b/tensorflow/lite/kernels/gather_nd.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/c/builtin_op_data.h" +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc index 7e2714dac5e..33dce89917d 100644 --- a/tensorflow/lite/kernels/gather_nd_test.cc +++ b/tensorflow/lite/kernels/gather_nd_test.cc @@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + #include #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc index 483b59fb533..01be7f01935 100644 --- a/tensorflow/lite/kernels/gather_test.cc +++ b/tensorflow/lite/kernels/gather_test.cc @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include +#include + +#include #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/hashtable/BUILD b/tensorflow/lite/kernels/hashtable/BUILD index 4ec3abe77ee..d141abf4f95 100644 --- a/tensorflow/lite/kernels/hashtable/BUILD +++ b/tensorflow/lite/kernels/hashtable/BUILD @@ -25,7 +25,6 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/experimental/resource", "//tensorflow/lite/kernels:kernel_util", - "//tensorflow/lite/kernels:op_macros", "//tensorflow/lite/kernels/internal:tensor", "//tensorflow/lite/schema:schema_fbs", "@flatbuffers", @@ -49,6 +48,7 @@ cc_test( "//tensorflow/lite/testing:util", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", "@flatbuffers", ], ) diff --git a/tensorflow/lite/kernels/hashtable_lookup.cc b/tensorflow/lite/kernels/hashtable_lookup.cc index a432dcb8e22..65e50fe41c2 100644 --- a/tensorflow/lite/kernels/hashtable_lookup.cc +++ b/tensorflow/lite/kernels/hashtable_lookup.cc @@ -31,18 +31,13 @@ limitations under the License. // Each item indicates whether the corresponding lookup has a returned value. // 0 for missing key, 1 for found key. -#include -#include -#include +#include + #include #include -#include -#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/string_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/hashtable_lookup_test.cc b/tensorflow/lite/kernels/hashtable_lookup_test.cc index 638d82ea167..8f90de3a71a 100644 --- a/tensorflow/lite/kernels/hashtable_lookup_test.cc +++ b/tensorflow/lite/kernels/hashtable_lookup_test.cc @@ -14,16 +14,21 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Lookup op. -#include +#include + +#include +#include +#include +#include #include #include #include #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" #include "tensorflow/lite/string_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc index 731b9d4c82f..d3f92a92b08 100644 --- a/tensorflow/lite/kernels/if.cc +++ b/tensorflow/lite/kernels/if.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include +#include +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" diff --git a/tensorflow/lite/kernels/if_test.cc b/tensorflow/lite/kernels/if_test.cc index c81300e5d1d..0bef77ef7a7 100644 --- a/tensorflow/lite/kernels/if_test.cc +++ b/tensorflow/lite/kernels/if_test.cc @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include -#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/subgraph_test_util.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h index 42aa4825771..ca4ce6bbea3 100644 --- a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_ +#include + #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index 44479d93a31..a63763b755a 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -15,9 +15,14 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_ +#include + +#include "fixedpoint/fixedpoint.h" #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h index 61f848c888e..c426ceb3a67 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/cpu_backend_gemm.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h index 0ff153da977..c84e7dc04d9 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h @@ -15,13 +15,20 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_ +#include + +#include +#include + #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h" #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h index 8de99c1a564..d234c5bb4a1 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h @@ -18,9 +18,11 @@ limitations under the License. #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" -#include "tensorflow/lite/kernels/cpu_backend_threadpool.h" +#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/kernels/internal/types.h" namespace tflite { namespace optimized_integer_ops { diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index 15c3d291ec3..45c27ab026a 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -15,9 +15,14 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_ +#include + +#include "fixedpoint/fixedpoint.h" #include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h index 060845f4a10..f2696500ab9 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h @@ -15,23 +15,16 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_POOLING_H_ -#include -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include "fixedpoint/fixedpoint.h" #include "ruy/profiler/instrumentation.h" // from @ruy +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" diff --git a/tensorflow/lite/kernels/internal/per_channel_dequantize_test.cc b/tensorflow/lite/kernels/internal/per_channel_dequantize_test.cc index 3ad125b86e4..89710b99b95 100644 --- a/tensorflow/lite/kernels/internal/per_channel_dequantize_test.cc +++ b/tensorflow/lite/kernels/internal/per_channel_dequantize_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include #include "tensorflow/lite/kernels/internal/reference/dequantize.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index c8d6d6a0e29..1394bd9da64 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -15,8 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_ -#include "tensorflow/lite/c/common.h" +#include +#include + #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/reference/densify.h b/tensorflow/lite/kernels/internal/reference/densify.h index d1fd488700a..71a9a26cc09 100644 --- a/tensorflow/lite/kernels/internal/reference/densify.h +++ b/tensorflow/lite/kernels/internal/reference/densify.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DENSIFY_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DENSIFY_H_ +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/reference/non_max_suppression.h b/tensorflow/lite/kernels/internal/reference/non_max_suppression.h index 5d3823788ef..64c27c174fa 100644 --- a/tensorflow/lite/kernels/internal/reference/non_max_suppression.h +++ b/tensorflow/lite/kernels/internal/reference/non_max_suppression.h @@ -17,7 +17,7 @@ limitations under the License. #include #include -#include +#include #include namespace tflite { diff --git a/tensorflow/lite/kernels/internal/reference/quantize.h b/tensorflow/lite/kernels/internal/reference/quantize.h index 58d19c0a14c..d36db06f2e0 100644 --- a/tensorflow/lite/kernels/internal/reference/quantize.h +++ b/tensorflow/lite/kernels/internal/reference/quantize.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_ +#include +#include + #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/reference/strided_slice.h b/tensorflow/lite/kernels/internal/reference/strided_slice.h index ba6d4c22554..8b6f0c13da1 100644 --- a/tensorflow/lite/kernels/internal/reference/strided_slice.h +++ b/tensorflow/lite/kernels/internal/reference/strided_slice.h @@ -16,8 +16,10 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_ #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/strided_slice_logic.h" #include "tensorflow/lite/kernels/internal/types.h" + namespace tflite { namespace reference_ops { diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h index 48d03de02ee..6191eaac558 100644 --- a/tensorflow/lite/kernels/internal/reference/sub.h +++ b/tensorflow/lite/kernels/internal/reference/sub.h @@ -15,9 +15,15 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
-#include "fixedpoint/fixedpoint.h"
+#include
+
+#include
+#include
+
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/internal/reference/svdf.h b/tensorflow/lite/kernels/internal/reference/svdf.h
index f03c0392ff4..ffa46b8f422 100644
--- a/tensorflow/lite/kernels/internal/reference/svdf.h
+++ b/tensorflow/lite/kernels/internal/reference/svdf.h
@@ -15,7 +15,10 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_
+#include
+
#include
+#include
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index ded536ab3a7..032726a7860 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -14,10 +14,15 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/kernel_util.h"
+#include
+#include
+
#include
-#include
+#include
#include
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index d57234afa77..6fc69fa1629 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -15,10 +15,10 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
-#include
+#include
+
#include
-#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 7a7467ee0d4..db0cc3cb39c 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -14,8 +14,18 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/kernel_util.h"
+#include
+#include
+#include
+#include
+
+#include
+#include
+
#include
#include
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/testing/util.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/l2norm.cc b/tensorflow/lite/kernels/l2norm.cc
index ab009f337de..a7fb35ed594 100644
--- a/tensorflow/lite/kernels/l2norm.cc
+++ b/tensorflow/lite/kernels/l2norm.cc
@@ -14,12 +14,15 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/l2norm_test.cc b/tensorflow/lite/kernels/l2norm_test.cc
index e4793dc5c74..968bcc556b3 100644
--- a/tensorflow/lite/kernels/l2norm_test.cc
+++ b/tensorflow/lite/kernels/l2norm_test.cc
@@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/local_response_norm.cc b/tensorflow/lite/kernels/local_response_norm.cc
index 85d0796cc1c..f4b996c45a1 100644
--- a/tensorflow/lite/kernels/local_response_norm.cc
+++ b/tensorflow/lite/kernels/local_response_norm.cc
@@ -17,8 +17,9 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/local_response_norm_test.cc b/tensorflow/lite/kernels/local_response_norm_test.cc
index 701da5ceb3d..353cce3db8a 100644
--- a/tensorflow/lite/kernels/local_response_norm_test.cc
+++ b/tensorflow/lite/kernels/local_response_norm_test.cc
@@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/log_softmax_test.cc b/tensorflow/lite/kernels/log_softmax_test.cc
index bc265915279..a65ee528a30 100644
--- a/tensorflow/lite/kernels/log_softmax_test.cc
+++ b/tensorflow/lite/kernels/log_softmax_test.cc
@@ -14,17 +14,17 @@ limitations under the License.
==============================================================================*/
// Unit test for TFLite LOG_SOFTMAX op.
-#include
+#include
#include
#include
#include
#include
-#include "tensorflow/lite/interpreter.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/logical.cc b/tensorflow/lite/kernels/logical.cc
index 397964cfd19..ec650dd4210 100644
--- a/tensorflow/lite/kernels/logical.cc
+++ b/tensorflow/lite/kernels/logical.cc
@@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/logical_test.cc b/tensorflow/lite/kernels/logical_test.cc
index 276d5d91cd1..cd85e320069 100644
--- a/tensorflow/lite/kernels/logical_test.cc
+++ b/tensorflow/lite/kernels/logical_test.cc
@@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/lsh_projection.cc b/tensorflow/lite/kernels/lsh_projection.cc
index 68d0719c4e2..b809748c59c 100644
--- a/tensorflow/lite/kernels/lsh_projection.cc
+++ b/tensorflow/lite/kernels/lsh_projection.cc
@@ -50,20 +50,16 @@ limitations under the License.
// Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
// A flattened tensor represents projected bit vectors.
-#include
-#include
-#include
-#include
+#include
+#include
+
#include
-#include
-#include
#include
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
#include
namespace tflite {
diff --git a/tensorflow/lite/kernels/lsh_projection_test.cc b/tensorflow/lite/kernels/lsh_projection_test.cc
index 1b75992de6d..008a5c45aaa 100644
--- a/tensorflow/lite/kernels/lsh_projection_test.cc
+++ b/tensorflow/lite/kernels/lsh_projection_test.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
#include
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 7fa3d85687c..b285ed1030f 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -14,8 +14,13 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/lstm_eval.h"
+#include
+#include
+
#include
#include
+#include
+#include
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/c/builtin_op_data.h"
diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h
index 877cfd70a89..91f47b18df6 100644
--- a/tensorflow/lite/kernels/lstm_eval.h
+++ b/tensorflow/lite/kernels/lstm_eval.h
@@ -17,7 +17,6 @@ limitations under the License.
#include
#include
-#include
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
diff --git a/tensorflow/lite/kernels/lstm_eval_test.cc b/tensorflow/lite/kernels/lstm_eval_test.cc
index 885ae250ae7..baf2e5e83df 100644
--- a/tensorflow/lite/kernels/lstm_eval_test.cc
+++ b/tensorflow/lite/kernels/lstm_eval_test.cc
@@ -14,17 +14,17 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/lstm_eval.h"
+#include
+#include
+
#include
-#include
+#include
#include
-#include
#include
+#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index 74ec8d324c6..ba5ee6508cc 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -17,15 +17,17 @@ limitations under the License.
// TODO(alanchiao): add unit test with invalid input dimensions for this and its
// variants.
-#include
+#include
+
+#include
#include
#include
#include
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/matrix_diag.cc b/tensorflow/lite/kernels/matrix_diag.cc
index a4137c1e0b7..c921650926f 100644
--- a/tensorflow/lite/kernels/matrix_diag.cc
+++ b/tensorflow/lite/kernels/matrix_diag.cc
@@ -12,17 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include
-#include
-
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/matrix_diag_test.cc b/tensorflow/lite/kernels/matrix_diag_test.cc
index 09a72e9b726..d0c2a45b3b3 100644
--- a/tensorflow/lite/kernels/matrix_diag_test.cc
+++ b/tensorflow/lite/kernels/matrix_diag_test.cc
@@ -12,10 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
+#include
+#include
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/matrix_set_diag.cc b/tensorflow/lite/kernels/matrix_set_diag.cc
index 4602ca0228c..e9c17f985d3 100644
--- a/tensorflow/lite/kernels/matrix_set_diag.cc
+++ b/tensorflow/lite/kernels/matrix_set_diag.cc
@@ -12,17 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include
-#include
-
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/matrix_set_diag_test.cc b/tensorflow/lite/kernels/matrix_set_diag_test.cc
index 46b314735b3..8fdb381f2b9 100644
--- a/tensorflow/lite/kernels/matrix_set_diag_test.cc
+++ b/tensorflow/lite/kernels/matrix_set_diag_test.cc
@@ -12,10 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
+#include
+#include
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc
index cad86acd8dd..ae1920e53db 100644
--- a/tensorflow/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/kernels/maximum_minimum.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
-#include
+#include
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index b22435d3e97..2c036e369bd 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include
+#include
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/mfcc.cc b/tensorflow/lite/kernels/mfcc.cc
index cba7cb132eb..5fe5b948a87 100644
--- a/tensorflow/lite/kernels/mfcc.cc
+++ b/tensorflow/lite/kernels/mfcc.cc
@@ -14,16 +14,21 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/mfcc.h"
+#include
+#include
+
+#include
+
#include "flatbuffers/flexbuffers.h" // from @flatbuffers
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/mfcc_dct.h"
#include "tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/mfcc_test.cc b/tensorflow/lite/kernels/mfcc_test.cc
index a6b769ccd37..abe9b7b9dad 100644
--- a/tensorflow/lite/kernels/mfcc_test.cc
+++ b/tensorflow/lite/kernels/mfcc_test.cc
@@ -13,16 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
-#include
#include
+#include
#include
#include "flatbuffers/flexbuffers.h" // from @flatbuffers
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
index 17756113069..8f4f02f7848 100644
--- a/tensorflow/lite/kernels/mirror_pad.cc
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -13,9 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+#include
+
+#include
#include
#include
+#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
@@ -26,7 +31,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc index 91e48fa68aa..fc8a7e68c49 100644 --- a/tensorflow/lite/kernels/mirror_pad_test.cc +++ b/tensorflow/lite/kernels/mirror_pad_test.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc index 4140a1ac5b2..0ab378e278d 100644 --- a/tensorflow/lite/kernels/mul.cc +++ b/tensorflow/lite/kernels/mul.cc @@ -14,16 +14,24 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h" +#include +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h" +#include "tensorflow/lite/kernels/internal/reference/mul.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc index db8dca9f4b3..9499fd40bea 100644 --- a/tensorflow/lite/kernels/mul_test.cc +++ b/tensorflow/lite/kernels/mul_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/neg.cc b/tensorflow/lite/kernels/neg.cc index 135f888a9e5..4a4ce8fcbd5 100644 --- a/tensorflow/lite/kernels/neg.cc +++ b/tensorflow/lite/kernels/neg.cc @@ -15,9 +15,12 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/neg.h" +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc index 0cdf5161628..87326846ddd 100644 --- a/tensorflow/lite/kernels/neg_test.cc +++ b/tensorflow/lite/kernels/neg_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/non_max_suppression.cc b/tensorflow/lite/kernels/non_max_suppression.cc index f57ee1bc5d2..d6e13cdbd33 100644 --- a/tensorflow/lite/kernels/non_max_suppression.cc +++ b/tensorflow/lite/kernels/non_max_suppression.cc @@ -14,16 +14,12 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/non_max_suppression.h" -#include +#include -#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/non_max_suppression_test.cc b/tensorflow/lite/kernels/non_max_suppression_test.cc index 454bb5a0959..9b7baa147e5 100644 --- a/tensorflow/lite/kernels/non_max_suppression_test.cc +++ b/tensorflow/lite/kernels/non_max_suppression_test.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/numeric_verify.cc b/tensorflow/lite/kernels/numeric_verify.cc index fa6324086e1..bbd2448ece0 100644 --- a/tensorflow/lite/kernels/numeric_verify.cc +++ b/tensorflow/lite/kernels/numeric_verify.cc @@ -12,25 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/
-#include
+#include
+#include
+#include
#include
-#include
#include
#include
#include
-#include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/dequantize.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/numeric_verify_test.cc b/tensorflow/lite/kernels/numeric_verify_test.cc
index 7dcf2436b32..9fb2e559c37 100644
--- a/tensorflow/lite/kernels/numeric_verify_test.cc
+++ b/tensorflow/lite/kernels/numeric_verify_test.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include
#include
+#include
#include
#include "absl/memory/memory.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/one_hot.cc b/tensorflow/lite/kernels/one_hot.cc
index 750d6dac2ef..76d53c6396f 100644
--- a/tensorflow/lite/kernels/one_hot.cc
+++ b/tensorflow/lite/kernels/one_hot.cc
@@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/one_hot_test.cc b/tensorflow/lite/kernels/one_hot_test.cc
index 96b549cb6eb..e94854612ab 100644
--- a/tensorflow/lite/kernels/one_hot_test.cc
+++ b/tensorflow/lite/kernels/one_hot_test.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include
+#include
+#include
#include
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h
index 8c1a6b1be16..5c190f1c595 100644
--- a/tensorflow/lite/kernels/op_macros.h
+++ b/tensorflow/lite/kernels/op_macros.h
@@ -20,7 +20,6 @@ limitations under the License.
#ifdef TF_LITE_MCU_DEBUG_LOG
#include "tensorflow/lite/micro/debug_log.h"
-#include "tensorflow/lite/micro/micro_error_reporter.h"
#define DEBUG_LOG(x) \
do { \
@@ -37,7 +36,6 @@ inline void InfiniteLoop() {
#else // TF_LITE_MCU_DEBUG_LOG
-#include
#include
#include
diff --git a/tensorflow/lite/kernels/optional_tensor_test.cc b/tensorflow/lite/kernels/optional_tensor_test.cc
index a09f8601589..26d619276aa 100644
--- a/tensorflow/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/lite/kernels/optional_tensor_test.cc
@@ -14,16 +14,14 @@ limitations under the License.
==============================================================================*/
// Unit test for TFLite LSTM op.
-#include
-#include
+#include
#include
-#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/testing/util.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index ebc3381dae8..fc7a87692c4 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index bc6758c7249..c15a0b7fe1f 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc
index cc735e4eede..2239511b60a 100644
--- a/tensorflow/lite/kernels/pad.cc
+++ b/tensorflow/lite/kernels/pad.cc
@@ -12,17 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
-#include
+#include
+
+#include
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 6e1e00cc3b8..983642298be 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include
+#include
+#include
+
+#include
#include
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index 0dcb667e901..1dc5cbb6199 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -14,21 +14,22 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h"
-#include
-#include
-#include
+#include
+#include
+
#include
-#include
-#include
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/kernels/padding.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index e609f04e21d..e614fedccfd 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include
+#include
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/pow.cc b/tensorflow/lite/kernels/pow.cc
index 8b72e7c0418..a76c77a3f9f 100644
--- a/tensorflow/lite/kernels/pow.cc
+++ b/tensorflow/lite/kernels/pow.cc
@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+#include
+
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/pow_test.cc b/tensorflow/lite/kernels/pow_test.cc
index e106cbb1b91..fa7b6d2ef9a 100644
--- a/tensorflow/lite/kernels/pow_test.cc
+++ b/tensorflow/lite/kernels/pow_test.cc
@@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+#include
+
+#include
+
+#include
#include
-#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/internal/test_util.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/quant_basic_lstm_test.cc b/tensorflow/lite/kernels/quant_basic_lstm_test.cc
index 0baae569d24..3e081c221c5 100644
--- a/tensorflow/lite/kernels/quant_basic_lstm_test.cc
+++ b/tensorflow/lite/kernels/quant_basic_lstm_test.cc
@@ -14,16 +14,13 @@ limitations under the License.
==============================================================================*/
#include
#include
-#include
#include
#include
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/quantize.cc b/tensorflow/lite/kernels/quantize.cc
index 21cc73278a0..1779500e6a2 100644
--- a/tensorflow/lite/kernels/quantize.cc
+++ b/tensorflow/lite/kernels/quantize.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
diff --git a/tensorflow/lite/kernels/quantize_test.cc b/tensorflow/lite/kernels/quantize_test.cc
index cfd9c1f434e..d7392b3e3ea 100644
--- a/tensorflow/lite/kernels/quantize_test.cc
+++ b/tensorflow/lite/kernels/quantize_test.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include
+#include
+#include
#include
-#include "tensorflow/lite/interpreter.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/range.cc b/tensorflow/lite/kernels/range.cc
index ae6db1b601f..55cc543d745 100644
--- a/tensorflow/lite/kernels/range.cc
+++ b/tensorflow/lite/kernels/range.cc
@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/lite/c/builtin_op_data.h"
+#include
+#include
+#include
+
+#include
+#include
+
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
diff --git a/tensorflow/lite/kernels/range_test.cc b/tensorflow/lite/kernels/range_test.cc
index 7de4fe3cb76..52f7231def9 100644
--- a/tensorflow/lite/kernels/range_test.cc
+++ b/tensorflow/lite/kernels/range_test.cc
@@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc
index 53fd92f1682..2202f6dd953 100644
--- a/tensorflow/lite/kernels/rank.cc
+++ b/tensorflow/lite/kernels/rank.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/lite/c/builtin_op_data.h"
+#include
+
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc
index 5373a0a66fe..760560c5a92 100644
--- a/tensorflow/lite/kernels/rank_test.cc
+++ b/tensorflow/lite/kernels/rank_test.cc
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include
+#include
+#include
#include
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/read_variable.cc b/tensorflow/lite/kernels/read_variable.cc
index f4762303b8b..ad6e8d43858 100644
--- a/tensorflow/lite/kernels/read_variable.cc
+++ b/tensorflow/lite/kernels/read_variable.cc
@@ -13,17 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
#include
-#include
-
-#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/subgraph.h"
#include "tensorflow/lite/experimental/resource/resource_variable.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index f0222a08fe3..af42b2a369c 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+
+#include
#include
#include
-#include
+#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/mean.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h"
@@ -30,7 +34,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
index ddbd5106063..2c83369ea37 100644
--- a/tensorflow/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 8ca58e6a309..90688a2aa1f 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -15,7 +15,9 @@ limitations under the License.
#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/builtin_op_kernels.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/register.h b/tensorflow/lite/kernels/register.h index 3e5bd298baf..a2a41ea9428 100644 --- a/tensorflow/lite/kernels/register.h +++ b/tensorflow/lite/kernels/register.h @@ -15,7 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_REGISTER_H_ #define TENSORFLOW_LITE_KERNELS_REGISTER_H_ -#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/mutable_op_resolver.h" diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc index 426f8a8e896..233520e2165 100644 --- a/tensorflow/lite/kernels/register_ref.cc +++ b/tensorflow/lite/kernels/register_ref.cc @@ -14,6 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/register_ref.h" + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/mutable_op_resolver.h" +#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/register_ref.h b/tensorflow/lite/kernels/register_ref.h index 5d9cb2c0b95..406fad89673 100644 --- a/tensorflow/lite/kernels/register_ref.h +++ b/tensorflow/lite/kernels/register_ref.h @@ -16,8 +16,8 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_ #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/mutable_op_resolver.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/reshape.cc b/tensorflow/lite/kernels/reshape.cc index 6afc3c8a670..ab6f0d8577d 100644 --- a/tensorflow/lite/kernels/reshape.cc +++ b/tensorflow/lite/kernels/reshape.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include @@ -20,7 +21,6 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc index 310d594698c..09f8ce6a3d5 100644 --- a/tensorflow/lite/kernels/reshape_test.cc +++ b/tensorflow/lite/kernels/reshape_test.cc @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/
+#include
+
+#include
#include
#include
#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/kernels/reshape_test_common.h"
-#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_type.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/reshape_test_common.h b/tensorflow/lite/kernels/reshape_test_common.h
index 9dbf028e7be..662c163b7c0 100644
--- a/tensorflow/lite/kernels/reshape_test_common.h
+++ b/tensorflow/lite/kernels/reshape_test_common.h
@@ -15,7 +15,15 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_RESHAPE_TEST_COMMON_H_
#define TENSORFLOW_LITE_KERNELS_RESHAPE_TEST_COMMON_H_
+#include
+
+#include
+#include
+#include
+
#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_type.h"
namespace tflite {
// There are three ways to specify the output shape of a Reshape
diff --git a/tensorflow/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
index dfd58255491..b0488a0b464 100644
--- a/tensorflow/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index d4d414ae29c..6dedc0d169d 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
index 1b58e5245ee..fff45ac13cc 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
@@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+
+#include
+
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace ops {
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
index 656bd6ee750..b22ad48afb9 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
@@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include
+
+#include
+#include
+
#include
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
+#include "flatbuffers/flatbuffers.h" // from @flatbuffers
#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
namespace tflite {
namespace {
diff --git a/tensorflow/lite/kernels/reverse.cc b/tensorflow/lite/kernels/reverse.cc
index 760236ad6a7..9ce845b4b7b 100644
--- a/tensorflow/lite/kernels/reverse.cc
+++ b/tensorflow/lite/kernels/reverse.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/lite/c/builtin_op_data.h"
+#include
+
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
namespace tflite {
diff --git a/tensorflow/lite/kernels/reverse_sequence.cc b/tensorflow/lite/kernels/reverse_sequence.cc
index 8e976cccd90..7390876d39b 100644
--- a/tensorflow/lite/kernels/reverse_sequence.cc
+++ b/tensorflow/lite/kernels/reverse_sequence.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/reverse_sequence_test.cc b/tensorflow/lite/kernels/reverse_sequence_test.cc index f1fefdd4856..3d9dcdc0b69 100644 --- a/tensorflow/lite/kernels/reverse_sequence_test.cc +++ b/tensorflow/lite/kernels/reverse_sequence_test.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include + +#include + +#include #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc index 02101ab172c..f1fcf67fd42 100644 --- a/tensorflow/lite/kernels/reverse_test.cc +++ b/tensorflow/lite/kernels/reverse_test.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include + +#include + +#include #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/rfft2d.cc b/tensorflow/lite/kernels/rfft2d.cc index fa201153daf..9aeee53f637 100644 --- a/tensorflow/lite/kernels/rfft2d.cc +++ b/tensorflow/lite/kernels/rfft2d.cc @@ -13,13 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include +#include + +#include +#include + #include "third_party/fft2d/fft2d.h" #include "ruy/profiler/instrumentation.h" // from @ruy -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/rfft2d_test.cc b/tensorflow/lite/kernels/rfft2d_test.cc index d4b6a0a9d83..e9b23bacf0c 100644 --- a/tensorflow/lite/kernels/rfft2d_test.cc +++ b/tensorflow/lite/kernels/rfft2d_test.cc @@ -13,13 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include + +#include +#include #include #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/custom_ops_register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/testing/util.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/round.cc b/tensorflow/lite/kernels/round.cc index fd16605abeb..341d2880705 100644 --- a/tensorflow/lite/kernels/round.cc +++ b/tensorflow/lite/kernels/round.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/round.h" + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc index baa614347d2..3402014f25f 100644 --- a/tensorflow/lite/kernels/round_test.cc +++ b/tensorflow/lite/kernels/round_test.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/scatter_nd.cc b/tensorflow/lite/kernels/scatter_nd.cc index 32f83357f8c..4e904f66692 100644 --- a/tensorflow/lite/kernels/scatter_nd.cc +++ b/tensorflow/lite/kernels/scatter_nd.cc @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/c/builtin_op_data.h" +#include + #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/context.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/scatter_nd_test.cc b/tensorflow/lite/kernels/scatter_nd_test.cc index e25ba9b93f3..9fdf176fe1f 100644 --- a/tensorflow/lite/kernels/scatter_nd_test.cc +++ b/tensorflow/lite/kernels/scatter_nd_test.cc @@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include + +#include +#include + #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/segment_sum.cc b/tensorflow/lite/kernels/segment_sum.cc index db8aa688ebe..8185359321e 100644 --- a/tensorflow/lite/kernels/segment_sum.cc +++ b/tensorflow/lite/kernels/segment_sum.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/c/builtin_op_data.h" +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/segment_sum_test.cc b/tensorflow/lite/kernels/segment_sum_test.cc index d083feb44aa..ec531ffd92d 100644 --- a/tensorflow/lite/kernels/segment_sum_test.cc +++ b/tensorflow/lite/kernels/segment_sum_test.cc @@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc index 89fac10c869..281425253c5 100644 --- a/tensorflow/lite/kernels/select.cc +++ b/tensorflow/lite/kernels/select.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" -#include "tensorflow/lite/string_util.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/select_test.cc b/tensorflow/lite/kernels/select_test.cc index 36935b0b6dc..56ed994d805 100644 --- a/tensorflow/lite/kernels/select_test.cc +++ b/tensorflow/lite/kernels/select_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/shape.cc b/tensorflow/lite/kernels/shape.cc index d979f083f70..afeadc38c20 100644 --- a/tensorflow/lite/kernels/shape.cc +++ b/tensorflow/lite/kernels/shape.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/shape_test.cc b/tensorflow/lite/kernels/shape_test.cc index 3eeb83f5000..292b40ab7cc 100644 --- a/tensorflow/lite/kernels/shape_test.cc +++ b/tensorflow/lite/kernels/shape_test.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include +#include +#include #include #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/skip_gram.cc b/tensorflow/lite/kernels/skip_gram.cc index 3eb415a55a8..8348a25bba7 100644 --- a/tensorflow/lite/kernels/skip_gram.cc +++ b/tensorflow/lite/kernels/skip_gram.cc @@ -31,13 +31,11 @@ limitations under the License. #include -#include #include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/string_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/skip_gram_test.cc b/tensorflow/lite/kernels/skip_gram_test.cc index 12d631660ee..9a5b541c077 100644 --- a/tensorflow/lite/kernels/skip_gram_test.cc +++ b/tensorflow/lite/kernels/skip_gram_test.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" #include "tensorflow/lite/string_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc index 1d2cc588abc..c99e1b573b9 100644 --- a/tensorflow/lite/kernels/slice.cc +++ b/tensorflow/lite/kernels/slice.cc @@ -13,17 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include -#include +#include +#include #include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc index 1a31ae44a5d..f950f3346ab 100644 --- a/tensorflow/lite/kernels/slice_test.cc +++ b/tensorflow/lite/kernels/slice_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/softmax_test.cc b/tensorflow/lite/kernels/softmax_test.cc index 79f3608fe9a..b8e3b3076b3 100644 --- a/tensorflow/lite/kernels/softmax_test.cc +++ b/tensorflow/lite/kernels/softmax_test.cc @@ -14,17 +14,18 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite SOFTMAX op. 
-#include +#include "tensorflow/lite/kernels/internal/reference/softmax.h" + +#include #include #include -#include #include -#include "tensorflow/lite/interpreter.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" -#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc index e8756ef5f2e..7fc58e7ee6b 100644 --- a/tensorflow/lite/kernels/space_to_batch_nd.cc +++ b/tensorflow/lite/kernels/space_to_batch_nd.cc @@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc index d34989f0fb7..0591265f73b 100644 --- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc +++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc index 527b7c83adb..e4c7efaaf99 100644 --- a/tensorflow/lite/kernels/space_to_depth.cc +++ b/tensorflow/lite/kernels/space_to_depth.cc @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc index ad2f95d82ba..523dc60a37f 100644 --- a/tensorflow/lite/kernels/space_to_depth_test.cc +++ b/tensorflow/lite/kernels/space_to_depth_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/sparse_to_dense.cc b/tensorflow/lite/kernels/sparse_to_dense.cc index 29c87734748..bdf0f4e703a 100644 --- a/tensorflow/lite/kernels/sparse_to_dense.cc +++ b/tensorflow/lite/kernels/sparse_to_dense.cc @@ -12,20 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include -#include -#include -#include +#include + +#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" -#include "tensorflow/lite/kernels/padding.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/sparse_to_dense_test.cc b/tensorflow/lite/kernels/sparse_to_dense_test.cc index ad040b2ce04..add4b53ca48 100644 --- a/tensorflow/lite/kernels/sparse_to_dense_test.cc +++ b/tensorflow/lite/kernels/sparse_to_dense_test.cc @@ -1,4 +1,3 @@ - /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc index da239e6ecab..3b7781f409e 100644 --- a/tensorflow/lite/kernels/split.cc +++ b/tensorflow/lite/kernels/split.cc @@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - -#include +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/split_test.cc b/tensorflow/lite/kernels/split_test.cc index 7952396880c..ae7c5cf76e8 100644 --- a/tensorflow/lite/kernels/split_test.cc +++ b/tensorflow/lite/kernels/split_test.cc @@ -12,11 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/split_v.cc b/tensorflow/lite/kernels/split_v.cc index b5529b98ecb..7d60086a91d 100644 --- a/tensorflow/lite/kernels/split_v.cc +++ b/tensorflow/lite/kernels/split_v.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include #include "tensorflow/lite/c/builtin_op_data.h" @@ -19,8 +21,9 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/split_v_test.cc b/tensorflow/lite/kernels/split_v_test.cc index 4e143cabe58..a10e277d653 100644 --- a/tensorflow/lite/kernels/split_v_test.cc +++ b/tensorflow/lite/kernels/split_v_test.cc @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc index fbea2403a53..e17ff8e3191 100644 --- a/tensorflow/lite/kernels/squared_difference.cc +++ b/tensorflow/lite/kernels/squared_difference.cc @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/c/builtin_op_data.h" +#include +#include + +#include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/binary_function.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/squared_difference_test.cc b/tensorflow/lite/kernels/squared_difference_test.cc index 249590f37e9..efac1969144 100644 --- a/tensorflow/lite/kernels/squared_difference_test.cc +++ b/tensorflow/lite/kernels/squared_difference_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/squeeze.cc b/tensorflow/lite/kernels/squeeze.cc index fa5656e1d59..c4dc51026a6 100644 --- a/tensorflow/lite/kernels/squeeze.cc +++ b/tensorflow/lite/kernels/squeeze.cc @@ -14,13 +14,10 @@ limitations under the License. ==============================================================================*/ #include -#include - #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/squeeze_test.cc b/tensorflow/lite/kernels/squeeze_test.cc index 575a02a70f8..4239ae43e1c 100644 --- a/tensorflow/lite/kernels/squeeze_test.cc +++ b/tensorflow/lite/kernels/squeeze_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc index e2ca812d193..50c2255e526 100644 --- a/tensorflow/lite/kernels/strided_slice.cc +++ b/tensorflow/lite/kernels/strided_slice.cc @@ -15,14 +15,20 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/strided_slice.h" +#include +#include + +#include #include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/strided_slice_logic.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc index e97eab5b7c4..5f625d3f201 100644 --- a/tensorflow/lite/kernels/strided_slice_test.cc +++ b/tensorflow/lite/kernels/strided_slice_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc index a2282a0545b..aa628fa5408 100644 --- a/tensorflow/lite/kernels/sub.cc +++ b/tensorflow/lite/kernels/sub.cc @@ -14,18 +14,27 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/sub.h" +#include +#include + +#include #include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/add.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc index adda1b810ce..21f2dc7cabd 100644 --- a/tensorflow/lite/kernels/sub_test.cc +++ b/tensorflow/lite/kernels/sub_test.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/subgraph_test_util.cc b/tensorflow/lite/kernels/subgraph_test_util.cc index 00f947a9e38..8f1964ad10f 100644 --- a/tensorflow/lite/kernels/subgraph_test_util.cc +++ b/tensorflow/lite/kernels/subgraph_test_util.cc @@ -15,13 +15,18 @@ limitations under the License. 
#include "tensorflow/lite/kernels/subgraph_test_util.h" -#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include +#include +#include + +#include + +#include +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/subgraph.h" #include "tensorflow/lite/kernels/builtin_op_kernels.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" namespace tflite { namespace subgraph_test_util { diff --git a/tensorflow/lite/kernels/subgraph_test_util.h b/tensorflow/lite/kernels/subgraph_test_util.h index 95b7206fc29..7306f82344d 100644 --- a/tensorflow/lite/kernels/subgraph_test_util.h +++ b/tensorflow/lite/kernels/subgraph_test_util.h @@ -20,6 +20,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_ #define TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_ +#include + +#include +#include + #include #include "tensorflow/lite/core/subgraph.h" #include "tensorflow/lite/interpreter.h" diff --git a/tensorflow/lite/kernels/subgraph_test_util_test.cc b/tensorflow/lite/kernels/subgraph_test_util_test.cc index 4bd0482da17..39e013294f7 100644 --- a/tensorflow/lite/kernels/subgraph_test_util_test.cc +++ b/tensorflow/lite/kernels/subgraph_test_util_test.cc @@ -14,10 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/subgraph_test_util.h" + +#include + +#include +#include + #include #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/testing/util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/svdf.cc b/tensorflow/lite/kernels/svdf.cc index 57eedb6b204..1b8bf904b8a 100644 --- a/tensorflow/lite/kernels/svdf.cc +++ b/tensorflow/lite/kernels/svdf.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" diff --git a/tensorflow/lite/kernels/svdf_test.cc b/tensorflow/lite/kernels/svdf_test.cc index 68963b784f4..b0ac2011948 100644 --- a/tensorflow/lite/kernels/svdf_test.cc +++ b/tensorflow/lite/kernels/svdf_test.cc @@ -14,15 +14,15 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite SVDF op. -#include +#include + +#include #include -#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc index 010aed31dc6..24f6e4f11ca 100644 --- a/tensorflow/lite/kernels/test_util.cc +++ b/tensorflow/lite/kernels/test_util.cc @@ -14,18 +14,37 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/test_util.h" -#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/core/platform/logging.h" -#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/op_resolver.h" +#include "tensorflow/lite/core/subgraph.h" #include "tensorflow/lite/delegates/nnapi/acceleration_test_util.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/acceleration_test_util.h" +#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/minimal_logging.h" +#include "tensorflow/lite/model.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/versioning/op_version.h" #include "tensorflow/lite/version.h" diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index f0f02d25add..bc93bdae58a 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -15,25 +15,39 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_ #define TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_ +#include +#include +#include +#include + +#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/core/platform/logging.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" #include "tensorflow/lite/string_util.h" -#include "tensorflow/lite/testing/util.h" +#include "tensorflow/lite/testing/util.h" // IWYU pragma: keep #include "tensorflow/lite/tools/optimize/quantization_utils.h" #include "tensorflow/lite/tools/optimize/sparsity/format_converter.h" +#include "tensorflow/lite/type_to_tflitetype.h" namespace tflite { diff --git a/tensorflow/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc index 7abb7011f9d..e6f865f6cd6 100644 --- a/tensorflow/lite/kernels/test_util_test.cc +++ b/tensorflow/lite/kernels/test_util_test.cc @@ -13,7 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/test_util.h" + +#include + +#include + #include +#include "tensorflow/lite/testing/util.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc index 64f6bd05485..884456fcbf2 100644 --- a/tensorflow/lite/kernels/tile.cc +++ b/tensorflow/lite/kernels/tile.cc @@ -12,16 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include -#include +#include +#include +#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/string_util.h" + namespace tflite { namespace ops { namespace builtin { diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc index 0df1f33c2bc..7a5203937a5 100644 --- a/tensorflow/lite/kernels/tile_test.cc +++ b/tensorflow/lite/kernels/tile_test.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include +#include + +#include #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc index 3e313481725..6a5bd392086 100644 --- a/tensorflow/lite/kernels/topk_v2.cc +++ b/tensorflow/lite/kernels/topk_v2.cc @@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include + +#include +#include +#include -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" + namespace tflite { namespace ops { namespace builtin { diff --git a/tensorflow/lite/kernels/topk_v2_test.cc b/tensorflow/lite/kernels/topk_v2_test.cc index 72ed82c1449..5d96fe06bf4 100644 --- a/tensorflow/lite/kernels/topk_v2_test.cc +++ b/tensorflow/lite/kernels/topk_v2_test.cc @@ -1,4 +1,3 @@ - /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc index 511b45e9f4a..27f5cf6f065 100644 --- a/tensorflow/lite/kernels/transpose.cc +++ b/tensorflow/lite/kernels/transpose.cc @@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include -#include - -#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc index 494433159d4..33e122ba037 100644 --- a/tensorflow/lite/kernels/transpose_conv.cc +++ b/tensorflow/lite/kernels/transpose_conv.cc @@ -13,27 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include -#include -#include -#include +#include +#include + +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/eigen_support.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" // NOLINTNEXTLINE - This header file should't go to the top. #include "tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" // NOLINTNEXTLINE - This header file should't go to the top. #include "tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/kernels/padding.h" namespace tflite { diff --git a/tensorflow/lite/kernels/transpose_conv_test.cc b/tensorflow/lite/kernels/transpose_conv_test.cc index 25c55c95412..b57bc047f62 100644 --- a/tensorflow/lite/kernels/transpose_conv_test.cc +++ b/tensorflow/lite/kernels/transpose_conv_test.cc @@ -12,14 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include #include "absl/memory/memory.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/string_type.h" namespace tflite { diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc index 30594da5d51..a88abec7161 100644 --- a/tensorflow/lite/kernels/transpose_test.cc +++ b/tensorflow/lite/kernels/transpose_test.cc @@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" -#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc index 0552885f720..b8b9396f436 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc index 4ea018c0cab..43cc75f894b 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc @@ -14,15 +14,13 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Sequential LSTM op. -#include #include #include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc index 8b6f102acdb..f1486267c17 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc @@ -14,15 +14,14 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Sequential RNN op. 
-#include +#include #include #include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/unique.cc b/tensorflow/lite/kernels/unique.cc index d0d277ecaa4..dd5c801b468 100644 --- a/tensorflow/lite/kernels/unique.cc +++ b/tensorflow/lite/kernels/unique.cc @@ -13,12 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include +#include +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/unique_test.cc b/tensorflow/lite/kernels/unique_test.cc index b18fcbed654..d01616025bf 100644 --- a/tensorflow/lite/kernels/unique_test.cc +++ b/tensorflow/lite/kernels/unique_test.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc index 285e01b3558..8d307acb268 100644 --- a/tensorflow/lite/kernels/unpack.cc +++ b/tensorflow/lite/kernels/unpack.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc index 0c6b8fa157c..14d9b9e66b7 100644 --- a/tensorflow/lite/kernels/unpack_test.cc +++ b/tensorflow/lite/kernels/unpack_test.cc @@ -12,11 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include +#include +#include +#include + +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/variable_ops_test.cc b/tensorflow/lite/kernels/variable_ops_test.cc index 2efac9d7d8f..077a03df21d 100644 --- a/tensorflow/lite/kernels/variable_ops_test.cc +++ b/tensorflow/lite/kernels/variable_ops_test.cc @@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include #include #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" namespace tflite { diff --git a/tensorflow/lite/kernels/where.cc b/tensorflow/lite/kernels/where.cc index 867d3069f2c..a20efa8baaa 100644 --- a/tensorflow/lite/kernels/where.cc +++ b/tensorflow/lite/kernels/where.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" diff --git a/tensorflow/lite/kernels/where_test.cc b/tensorflow/lite/kernels/where_test.cc index f52c9aad487..ba93bed6e74 100644 --- a/tensorflow/lite/kernels/where_test.cc +++ b/tensorflow/lite/kernels/where_test.cc @@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include +#include #include -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc index 81b6c0c6634..99d6d2cc1c8 100644 --- a/tensorflow/lite/kernels/while.cc +++ b/tensorflow/lite/kernels/while.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include +#include #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" diff --git a/tensorflow/lite/kernels/while_test.cc b/tensorflow/lite/kernels/while_test.cc index 324519e32a0..b0b63f8c643 100644 --- a/tensorflow/lite/kernels/while_test.cc +++ b/tensorflow/lite/kernels/while_test.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include -#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/subgraph_test_util.h" -#include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" namespace tflite { diff --git a/tensorflow/lite/kernels/zeros_like.cc b/tensorflow/lite/kernels/zeros_like.cc index ad6d03649f6..8586c945e7c 100644 --- a/tensorflow/lite/kernels/zeros_like.cc +++ b/tensorflow/lite/kernels/zeros_like.cc @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { diff --git a/tensorflow/lite/kernels/zeros_like_test.cc b/tensorflow/lite/kernels/zeros_like_test.cc index 09a233c2c30..0be6336ce3e 100644 --- a/tensorflow/lite/kernels/zeros_like_test.cc +++ b/tensorflow/lite/kernels/zeros_like_test.cc @@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include + +#include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/kernels/test_util.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc index 381e3632b3a..001b0feaef2 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/pooling.h" #include "cmsis/CMSIS/NN/Include/arm_nnfunctions.h" +#include "flatbuffers/base.h" // from @flatbuffers #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" From cd7da16dd6c17df428dc9ec105c0c8f11e5fd4f5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 18:39:25 -0700 Subject: [PATCH 0248/1390] Stops calling std::copy_n over zero-sided vec, which indeed dereferences null. Also added vector::reserve() call before the for loop. 
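The crash comes from forming the destination address rather than from the copy itself: on an empty output tensor, `&values_t(offset)` already touches a null buffer even though `num_decoded` would be zero, so the call has to be skipped outright. A minimal standalone sketch of the guarded pattern, using a plain `std::vector` instead of the kernel's Eigen tensor types (names illustrative):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Copies `decoded` into `values` starting at `offset`, but only when there is
// something to copy: taking the address of an element of an empty destination
// is already undefined behavior, so copy_n must not be reached in that case.
void CopyDecodedAt(const std::vector<int64_t>& decoded, size_t offset,
                   std::vector<int64_t>* values) {
  if (!decoded.empty()) {
    // Precondition: offset + decoded.size() <= values->size().
    std::copy_n(decoded.begin(), decoded.size(), &(*values)[offset]);
  }
}
```

The added `reserve(max_time)` calls are the usual pre-sizing idiom before an `emplace_back` loop with a known trip count; they avoid repeated reallocation without changing behavior.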
PiperOrigin-RevId: 316589226 Change-Id: Iae6edd6f3bf5c8b737cbad782d45c978e622df43 --- tensorflow/core/kernels/ctc_decoder_ops.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc index 517612eecb6..d62aef2d03b 100644 --- a/tensorflow/core/kernels/ctc_decoder_ops.cc +++ b/tensorflow/core/kernels/ctc_decoder_ops.cc @@ -154,7 +154,14 @@ class CTCDecodeHelper { auto& p_batch = sequences[b][p]; int64 num_decoded = p_batch.size(); max_decoded = std::max(max_decoded, num_decoded); - std::copy_n(p_batch.begin(), num_decoded, &values_t(offset)); + if (num_decoded > 0) { + DCHECK_NE(values_t.data(), nullptr) + << "values_t should not be nullptr: p_num=" << p_num + << " num_decoded=" << num_decoded; + DCHECK_LT(offset, values_t.size()) + << "offset should be smaller than values_t.size()"; + std::copy_n(p_batch.begin(), num_decoded, &values_t(offset)); + } for (int64 t = 0; t < num_decoded; ++t, ++offset) { indices_t(offset, 0) = b; indices_t(offset, 1) = t; @@ -203,6 +210,7 @@ class CTCGreedyDecoderOp : public OpKernel { auto inputs_t = inputs->tensor(); + input_list_t.reserve(max_time); for (std::size_t t = 0; t < max_time; ++t) { input_list_t.emplace_back(inputs_t.data() + t * batch_size * num_classes, batch_size, num_classes); @@ -305,6 +313,7 @@ class CTCBeamSearchDecoderOp : public OpKernel { std::vector::UnalignedConstMatrix> input_list_t; + input_list_t.reserve(max_time); for (std::size_t t = 0; t < max_time; ++t) { input_list_t.emplace_back(inputs_t.data() + t * batch_size * num_classes, batch_size, num_classes); From 0a0161836d350f62e26fbb30bd234a9017ebfd59 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 18:41:44 -0700 Subject: [PATCH 0249/1390] Require Keras Preprocessing Layers be restored from config. 
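Concretely, `PreprocessingLayer` now carries `_must_restore_from_config = True`, so a SavedModel containing such a layer can no longer be revived from the serialized graph alone; the layer class has to be available at load time. A hedged user-side sketch (the layer, its state, and the paths are hypothetical, and it assumes the experimental `PreprocessingLayer` export plus the usual `custom_objects` lookup):

```python
import tensorflow as tf

class MyCentering(tf.keras.layers.experimental.preprocessing.PreprocessingLayer):
  """Toy preprocessing layer used only to illustrate the restore path."""

  def __init__(self, mean=0.0, **kwargs):
    super().__init__(**kwargs)
    self.mean = mean

  def adapt(self, data, reset_state=True):
    pass  # A real layer would compute statistics from `data` here.

  def call(self, inputs):
    return inputs - self.mean

  def get_config(self):
    return dict(super().get_config(), mean=self.mean)

model = tf.keras.Sequential([tf.keras.Input(shape=(4,)), MyCentering(mean=1.5)])
model.save("/tmp/centering_model", save_format="tf")

# After this change, loading without the class raises
# "Unable to restore a layer of ..."; supplying it restores from config.
reloaded = tf.keras.models.load_model(
    "/tmp/centering_model", custom_objects={"MyCentering": MyCentering})
```

The test added below exercises the failing half of this round trip.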
PiperOrigin-RevId: 316589499 Change-Id: Id8f5ffcc84d60735c5d174581077007845717b78 --- .../keras/engine/base_preprocessing_layer.py | 1 + .../engine/base_preprocessing_layer_test.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/tensorflow/python/keras/engine/base_preprocessing_layer.py b/tensorflow/python/keras/engine/base_preprocessing_layer.py index e6b75033d60..08df07e33e3 100644 --- a/tensorflow/python/keras/engine/base_preprocessing_layer.py +++ b/tensorflow/python/keras/engine/base_preprocessing_layer.py @@ -41,6 +41,7 @@ from tensorflow.python.util.tf_export import keras_export class PreprocessingLayer(Layer): """Base class for PreprocessingLayers.""" __metaclass__ = abc.ABCMeta + _must_restore_from_config = True @abc.abstractmethod def adapt(self, data, reset_state=True): diff --git a/tensorflow/python/keras/engine/base_preprocessing_layer_test.py b/tensorflow/python/keras/engine/base_preprocessing_layer_test.py index f35871a2f00..70d088cf3d3 100644 --- a/tensorflow/python/keras/engine/base_preprocessing_layer_test.py +++ b/tensorflow/python/keras/engine/base_preprocessing_layer_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import json +import os from absl.testing import parameterized import numpy as np @@ -35,6 +36,7 @@ from tensorflow.python.keras.engine import base_preprocessing_layer from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.ops import init_ops from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -349,6 +351,21 @@ class PreprocessingLayerTest(keras_parameterized.TestCase): layer_2.adapt(np.array([1, 2]), reset_state=False) self.assertAllEqual([[19], [20], [21]], model_2.predict([1., 2., 3.])) + def test_loading_without_providing_class_fails(self): + input_data = keras.Input(shape=(1,)) + layer = get_layer() + output = layer(input_data) + model = keras.Model(input_data, output) + + if not context.executing_eagerly(): + self.evaluate(variables.variables_initializer(model.variables)) + + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + with self.assertRaisesRegex(RuntimeError, "Unable to restore a layer of"): + _ = keras.models.load_model(output_path) + @keras_parameterized.run_all_keras_modes class ConvertToListTest(keras_parameterized.TestCase): From 20bfd347119d5ff8dc3d6d58526e8bc683231660 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 18:48:16 -0700 Subject: [PATCH 0250/1390] Update ops-related pbtxt files. 
PiperOrigin-RevId: 316590217 Change-Id: Id3e8cb3e4dc0e9c930d1dc457312c55f64a02470 --- .../ops_history_v2/DebugIdentityV2.pbtxt | 66 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 7 ++ 2 files changed, 73 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/DebugIdentityV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DebugIdentityV2.pbtxt index 9502555a20e..ea92aaa4943 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/DebugIdentityV2.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/DebugIdentityV2.pbtxt @@ -109,3 +109,69 @@ op { } is_stateful: true } +op { + name: "DebugIdentityV2" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "tfdbg_context_id" + type: "string" + default_value { + s: "" + } + } + attr { + name: "op_name" + type: "string" + default_value { + s: "" + } + } + attr { + name: "output_slot" + type: "int" + default_value { + i: -1 + } + } + attr { + name: "tensor_debug_mode" + type: "int" + default_value { + i: -1 + } + } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + } + attr { + name: "circular_buffer_size" + type: "int" + default_value { + i: 1000 + } + } + attr { + name: "tfdbg_run_id" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index c67b347af04..1d92a0671b8 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -10860,6 +10860,13 @@ op { i: 1000 } } + attr { + name: "tfdbg_run_id" + type: "string" + default_value { + s: "" + } + } is_stateful: true } op { From 72e98dfe9675e4b2a78203743904d97586d5775a Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 15 Jun 2020 19:06:11 -0700 Subject: [PATCH 0251/1390] Simple update for passing dynamic indexing with integers test Also copy the _slice_helper logic into tf numpy so we can iterate more quickly there. PiperOrigin-RevId: 316592568 Change-Id: I1adb870385872fd407e49381cadcb117755f95a0 --- tensorflow/python/ops/numpy_ops/np_arrays.py | 134 ++++++++++++++++++- 1 file changed, 133 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py index 0e320d415b6..a7696ad31c2 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays.py @@ -17,14 +17,135 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numbers import numpy as np import six +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.numpy_ops import np_dtypes +from tensorflow.python.util import nest + + +_SLICE_TYPE_ERROR = ( + 'Only integers, slices (`:`), ellipsis (`...`), ' + 'tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid ' + 'indices') + +_SUPPORTED_SLICE_DTYPES = (dtypes.int32, dtypes.int32_ref, dtypes.int64, + dtypes.int64_ref) + + +def _check_index(idx): + """Check if a given value is a valid index into a tensor.""" + if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)): + return + + # Optimistic check. 
Assumptions: + # * any object with a dtype is supported + # * any object with a dtype has a sizeable shape attribute. + dtype = getattr(idx, 'dtype', None) + if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or + idx.shape and len(idx.shape) == 1): + # TODO(slebedev): IndexError seems more appropriate here, but it + # will break `_slice_helper` contract. + raise TypeError(_SLICE_TYPE_ERROR + ', got {!r}'.format(idx)) + + +def _is_undefined_dimension(d): + return isinstance(d, tensor_shape.Dimension) and d.value is None + + +def _slice_helper(tensor, slice_spec, var=None): + """Copied from array_ops._slice_helper, will be merged back later.""" + if isinstance(slice_spec, bool) or \ + (isinstance(slice_spec, ops.Tensor) and slice_spec.dtype == dtypes.bool) or \ + (isinstance(slice_spec, np.ndarray) and slice_spec.dtype == bool): + return array_ops.boolean_mask(tensor=tensor, mask=slice_spec) + + if not isinstance(slice_spec, (list, tuple)): + slice_spec = [slice_spec] + + begin, end, strides = [], [], [] + index = 0 + + new_axis_mask, shrink_axis_mask = 0, 0 + begin_mask, end_mask = 0, 0 + ellipsis_mask = 0 + for s in slice_spec: + if isinstance(s, slice): + if s.start is not None and not _is_undefined_dimension(s.start): + _check_index(s.start) + begin.append(s.start) + else: + begin.append(0) + begin_mask |= (1 << index) + if s.stop is not None and not _is_undefined_dimension(s.stop): + _check_index(s.stop) + end.append(s.stop) + else: + end.append(0) + end_mask |= (1 << index) + if s.step is not None and not _is_undefined_dimension(s.step): + _check_index(s.step) + strides.append(s.step) + else: + strides.append(1) + elif s is Ellipsis: + begin.append(0) + end.append(0) + strides.append(1) + ellipsis_mask |= (1 << index) + elif s is array_ops.newaxis: + begin.append(0) + end.append(0) + strides.append(1) + new_axis_mask |= (1 << index) + else: + _check_index(s) + begin.append(s) + end.append(s + 1) + strides.append(1) + shrink_axis_mask |= (1 << index) + index += 1 + + # stack possibly involves no tensors, so we must use op_scope correct graph. + with ops.name_scope( + None, + 'strided_slice', [tensor] + begin + end + strides, + skip_on_eager=False) as name: + if begin: + packed_begin, packed_end, packed_strides = (array_ops.stack(begin), + array_ops.stack(end), + array_ops.stack(strides)) + if (packed_begin.dtype == dtypes.int64 or + packed_end.dtype == dtypes.int64 or + packed_strides.dtype == dtypes.int64): + if packed_begin.dtype != dtypes.int64: + packed_begin = math_ops.cast(packed_begin, dtypes.int64) + if packed_end.dtype != dtypes.int64: + packed_end = math_ops.cast(packed_end, dtypes.int64) + if packed_strides.dtype != dtypes.int64: + packed_strides = math_ops.cast(packed_strides, dtypes.int64) + else: + var_empty = constant_op.constant([], dtype=dtypes.int32) + packed_begin = packed_end = packed_strides = var_empty + return array_ops.strided_slice( + tensor, + packed_begin, + packed_end, + packed_strides, + begin_mask=begin_mask, + end_mask=end_mask, + shrink_axis_mask=shrink_axis_mask, + new_axis_mask=new_axis_mask, + ellipsis_mask=ellipsis_mask, + var=var, + name=name) def convert_to_tensor(value, dtype=None, dtype_hint=None): @@ -184,7 +305,18 @@ class ndarray(object): # pylint: disable=invalid-name def __getitem__(self, slice_spec): # TODO(srbs): Need to support better indexing. 
- result_t = self.data.__getitem__(slice_spec) + def _gettensor(x): + if isinstance(x, ndarray): + x = x.data + if isinstance(x, ops.Tensor) and x.dtype not in ( + dtypes.int32, dtypes.int64): + # Currently _slice_helper will only work with int32/int64 tensors, but + # type inference by numpy can create {u,}int{8,16}, so just cast. + x = math_ops.cast(x, dtypes.int32) + return x + slice_spec = nest.map_structure(_gettensor, slice_spec) + + result_t = _slice_helper(self.data, slice_spec) return tensor_to_ndarray(result_t) def __iter__(self): From 43de71078e7a1e1b8267937448da107d214cd81c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 19:27:05 -0700 Subject: [PATCH 0252/1390] More compatibility fixes for typing.Generic: * types.new_class is required in some distributions * avoid calling isinstance on some function objects in python 3.6 Required for #40132. PiperOrigin-RevId: 316594891 Change-Id: I7c67bc04d6cfe4706a85be5b9c2271b4a4f0b97b --- tensorflow/python/framework/test_util.py | 7 ------- tensorflow/python/util/tf_should_use.py | 18 +++++------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 950e17d0d8c..2967bb3de84 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -33,7 +33,6 @@ import tempfile import threading import time import unittest -import weakref from absl.testing import parameterized import numpy as np @@ -733,12 +732,6 @@ def assert_no_new_tensors(f): """Finds existing Tensors, runs the test, checks for new Tensors.""" def _is_tensorflow_object(obj): - if isinstance(obj, weakref.ReferenceType): - obj = obj() - if obj is None: - return False - if not hasattr(obj, "__class__"): - return False try: return isinstance(obj, (ops.Tensor, variables.Variable, diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 41c3220f5ca..1671b078fa3 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -21,12 +21,15 @@ import copy import sys import textwrap import traceback -import types + +import six # pylint: disable=unused-import + from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator +# pylint: enable=g-bad-import-order,g-import-not-at-top class _TFShouldUseHelper(object): @@ -151,18 +154,7 @@ def _get_wrapper(x, tf_should_use_helper): tx = copy.deepcopy(type_x) # Prefer using __orig_bases__, which preserve generic type arguments. bases = getattr(tx, '__orig_bases__', tx.__bases__) - - # Use types.new_class when available, which is preferred over plain type in - # some distributions. - if sys.version_info >= (3, 5): - def set_body(ns): - ns.update(tx.__dict__) - return ns - - copy_tx = types.new_class(tx.__name__, bases, exec_body=set_body) - else: - copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) - + copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) copy_tx.__init__ = _new__init__ copy_tx.__getattribute__ = _new__getattribute__ copy_tx.mark_used = _new_mark_used From 71d1fe5f14f1fec658fde5bcee83fa486ca05355 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Mon, 15 Jun 2020 19:31:43 -0700 Subject: [PATCH 0253/1390] Make model sparsification work for Conv, DepthwiseConv and TransposeConv. 
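The core of the change, visible in the `dense_to_sparse.cc` hunks below, is that the encoding parameters are now derived from the weight tensor's rank instead of being hard-coded for rank-2 FullyConnected filters. A stripped-down sketch of that derivation, using plain ints in place of the TfLite dimension-type enums (illustrative only; assumes at least one dimension):

```cpp
#include <vector>

// Every dimension is traversed in order and stored dense except the innermost
// one, which is CSR-compressed; any dimension with a block size > 1 adds a
// trailing dense block dimension and an entry in the block map.
struct EncodingParams {
  std::vector<int> traversal_order;
  std::vector<int> format;  // 0 = dense, 1 = sparse (CSR)
  std::vector<int> block_map;
};

EncodingParams DeriveParams(const std::vector<int>& block_size) {
  const int dims = block_size.size();
  EncodingParams p;
  for (int i = 0; i < dims; ++i) p.traversal_order.push_back(i);
  p.format.assign(dims, 0);
  p.format.back() = 1;  // innermost dimension is sparse
  int block_rank = 0;
  for (int i = 0; i < dims; ++i) {
    if (block_size[i] != 1) {
      p.traversal_order.push_back(dims + block_rank++);
      p.format.push_back(0);  // block dimensions stay dense
      p.block_map.push_back(i);
    }
  }
  return p;
}
```

For a rank-4 convolution filter with no block structure (`block_size = {1, 1, 1, 1}`, the default the pass now selects), this gives `traversal_order = {0, 1, 2, 3}` and a dense/dense/dense/CSR format, the direct analogue of what the old rank-2 path produced for matrices.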
PiperOrigin-RevId: 316595363 Change-Id: Iad873c74599eda46b6699b0a3cd479209b8a4ca7 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 20 ++++++- .../mlir/lite/transforms/dense_to_sparse.cc | 55 ++++++++++--------- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index ab48a0f8f92..509c13ae161 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -436,7 +436,7 @@ class TFL_Op traits = []> : class TFL_ConvOp : TFL_Op, TFL_ChannelDimIndexInterface, AffineOpCoefficient, - TFL_GpuTargetOp]> { + TFL_GpuTargetOp, TFL_SparseOp]> { let summary = opSummary # " operator"; let description = [{ @@ -571,7 +571,8 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ TFL_OperandHasRank<2, 4>, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 2>>, - TFL_GpuTargetOp]> { + TFL_GpuTargetOp, + TFL_SparseOp]> { let summary = "Transpose convolution operator"; let description = [{ @@ -593,6 +594,13 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ let hasOptions = 1; let verifier = [{ return Verify(*this); }]; + + let extraClassDeclaration = [{ + // SparseOpInterface: + std::vector GetSparseOperands() { return {1}; } + std::vector> GetFloatBlockSize() { return {}; } + std::vector> GetQuantizedBlockSize() { return {}; } + }]; } def TFL_AveragePool2DOp: @@ -826,6 +834,10 @@ def TFL_Conv2DOp : TFL_ConvOp<"conv_2d", "Convolution", 0> { let extraClassDeclaration = [{ // ChannelDimIndexInterface: int GetChannelDimIndex() { return 0; } + // SparseOpInterface: + std::vector GetSparseOperands() { return {1}; } + std::vector> GetFloatBlockSize() { return {}; } + std::vector> GetQuantizedBlockSize() { return {}; } }]; } @@ -866,6 +878,10 @@ def TFL_DepthwiseConv2DOp : let extraClassDeclaration = [{ // ChannelDimIndexInterface: int GetChannelDimIndex() { return 3; } + // SparseOpInterface: + std::vector GetSparseOperands() { return {1}; } + std::vector> GetFloatBlockSize() { return {}; } + std::vector> GetQuantizedBlockSize() { return {}; } }]; } diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 9b526f40277..f5ef2585be5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -40,14 +40,22 @@ void PopulateEncodingParams(const std::vector& block_size, std::vector* traversal_order, std::vector* format, std::vector* b_map, std::vector* b_size) { - *traversal_order = {0, 1}; - *format = {kTfLiteDimDense, kTfLiteDimSparseCSR}; + const int dims_count = block_size.size(); + traversal_order->resize(dims_count); + format->resize(dims_count); + for (int i = 0; i < dims_count; i++) { + (*traversal_order)[i] = i; + } + for (int i = 0; i < dims_count - 1; i++) { + (*format)[i] = kTfLiteDimDense; + } + (*format)[dims_count - 1] = kTfLiteDimSparseCSR; *b_map = {}; *b_size = {}; int block_rank = 0; - for (int i = 0; i < 2; i++) { + for (int i = 0; i < dims_count; i++) { if (block_size[i] != 1) { - traversal_order->push_back(block_rank + 2); + traversal_order->push_back(block_rank + dims_count); format->push_back(kTfLiteDimDense); block_rank++; b_map->push_back(i); @@ -58,27 +66,18 @@ void PopulateEncodingParams(const std::vector& block_size, float CalculateRandomSparsity(const ElementsAttr& attr, const ShapedType& type) { - int num_elements = 1; - for (int i = 0; i < 2; 
i++) { - num_elements *= type.getDimSize(i); - } + int num_elements = type.getNumElements(); int num_zeros = 0; if (type.getElementType().isF32()) { - std::vector data; - data.reserve(type.getNumElements()); - for (const auto val : attr.getValues()) data.push_back(val); - for (int i = 0; i < data.size(); i++) { - if (data[i] == 0) { + for (const auto val : attr.getValues()) { + if (val == 0.f) { num_zeros++; } } } else if (type.getElementType().isa()) { - std::vector data; - data.reserve(type.getNumElements()); - for (const auto val : attr.getValues()) data.push_back(val); - for (int i = 0; i < data.size(); i++) { - if (data[i] == 0) { + for (const auto val : attr.getValues()) { + if (val == 0) { num_zeros++; } } @@ -150,9 +149,10 @@ InspectResult InspectWeight( type = cst.getType().cast(); } - // TODO(b/147449640): Add ability to encode weights more than 2-D, e.g. Conv - // weights. - if (type.getRank() != 2) { + // Currently we only support compressing weights of ops: + // Conv, DepthwiseConv, TransposeConv, whose filter has rank 4, and + // FullyConnected, whose filter has rank 2. + if (type.getRank() != 2 && type.getRank() != 4) { result.can_compress = false; return result; } @@ -195,9 +195,11 @@ std::vector BuildSparsityParameterAttribute( attr = cst.value(); type = cst.getType().cast(); } - std::vector shape(2); - shape[0] = type.getDimSize(0); - shape[1] = type.getDimSize(1); + const int dims_count = type.getRank(); + std::vector shape(dims_count); + for (int i = 0; i < dims_count; i++) { + shape[i] = type.getDimSize(i); + } std::vector traversal_order = {}; std::vector format = {}; @@ -271,10 +273,13 @@ void DenseToSparse::runOnFunction() { continue; } + ShapedType type; if (isa(inst)) { supported_block_size = sparse_op.GetFloatBlockSize(); + type = dyn_cast(inst).getType().cast(); } else if (isa(inst)) { supported_block_size = sparse_op.GetQuantizedBlockSize(); + type = dyn_cast(inst).getType().cast(); } else { continue; } @@ -286,7 +291,7 @@ void DenseToSparse::runOnFunction() { // The weight is not block sparse. Encode with random sparsity. if (result.selected_block_size.empty()) { - result.selected_block_size = {1, 1}; + result.selected_block_size = std::vector(type.getRank(), 1); } builder.setInsertionPoint(op); From 552604a2efb050dc5f79f75d2e9c286ba7725c5c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 19:46:08 -0700 Subject: [PATCH 0254/1390] Go: Update generated wrapper functions for TensorFlow ops. 
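The only substantive fix in the regenerated `StridedSlice` documentation below is restoring the computed values in the mask example; each mask sets one bit per position of the slice specification, so the numbers are plain bit arithmetic (standalone Go, for illustration only):

```go
package main

import "fmt"

func main() {
	fmt.Println(1<<4 | 1<<5) // begin_mask: 48
	fmt.Println(1 << 5)      // end_mask: 32
	fmt.Println(1 << 3)      // ellipsis_mask: 8
	fmt.Println(1 << 2)      // new_axis_mask: 4
	fmt.Println(1 << 0)      // shrink_axis_mask: 1
}
```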
PiperOrigin-RevId: 316596872 Change-Id: I3eed9544a8c70c2c7d1ffe5307ba9d37890a8d8c --- tensorflow/go/op/wrappers.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 10acebc7965..485baa16f39 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -32348,11 +32348,11 @@ func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr { // begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0) // end = [2, 4, x, x, -3, x] // strides = [1, 1, x, x, -1, 1] -// begin_mask = 1<<4 | 1 << 5 = 48 +// begin_mask = 1<<4 | 1<<5 = 48 // end_mask = 1<<5 = 32 // ellipsis_mask = 1<<3 = 8 -// new_axis_mask = 1<<2 4 -// shrink_axis_mask = 1<<0 +// new_axis_mask = 1<<2 = 4 +// shrink_axis_mask = 1<<0 = 1 // ``` // // In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of From c118bc4b5e1bb3a2760a026621d7a19053b0626c Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Mon, 15 Jun 2020 19:50:34 -0700 Subject: [PATCH 0255/1390] Fix decomposition of ResourceApplyCenteredRMSProp op pattern. AssignSubVariableOp only needs the resource handle and the tensor to be subtracted from it. PiperOrigin-RevId: 316597384 Change-Id: I26a99cb6fe210cc66429ebca13765af3109748ae --- .../mlir/tensorflow/tests/decompose_resource_ops.mlir | 1 + .../mlir/tensorflow/transforms/decompose_resource_ops.td | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index 25dfda25358..ff4dbf41221 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -411,6 +411,7 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[MOM_NEW]]) + // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[VAR_NEW]]) "tf.ResourceApplyCenteredRMSProp"(%0, %1, %2, %3, %arg4, %arg5, %arg6, %arg7, %arg8) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () return diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index 0dd7d778e31..40339cebd31 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -387,9 +387,6 @@ def DecomposeResourceApplyCenteredRMSProp : ), (TF_AssignVariableOp $mom_resource, $mom_new), // var <- var - mom - (TF_AssignSubVariableOp $var_resource, - (TF_SubOp (CreateTFReadVariableOp $src_op, $grad, $var_resource), - $mom_new) - ) + (TF_AssignSubVariableOp $var_resource, $mom_new) ] >; From 5de12f4d96c48898f58495f81156651c64eed275 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 15 Jun 2020 20:32:58 -0700 Subject: [PATCH 0256/1390] Update tf.keras version number. 
PiperOrigin-RevId: 316602175 Change-Id: I94fa651c3005f578abafcafe86382a2dc75451e6 --- tensorflow/python/keras/__init__.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py index 6f79c219867..47f207329d7 100644 --- a/tensorflow/python/keras/__init__.py +++ b/tensorflow/python/keras/__init__.py @@ -32,13 +32,6 @@ from tensorflow.python.keras.engine.training import Model from tensorflow.python.util.tf_export import keras_export -if tf2.enabled(): - __version__ = '2.3.0-tf' -else: - __version__ = '2.2.4-tf' +__version__ = '2.4.0' keras_export('keras.__version__').export_constant(__name__, '__version__') - -del absolute_import -del division -del print_function From 13d59c2d1fccecc6343965cef89464229d00db21 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 16 Jun 2020 11:50:49 +0800 Subject: [PATCH 0257/1390] Update stale.yml --- .github/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/stale.yml b/.github/stale.yml index 5f8dd12f477..7eef5309ecd 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -23,7 +23,7 @@ daysUntilStale: 7 # Number of days of inactivity before a stale Issue or Pull Request is closed daysUntilClose: 7 -# Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled) +# Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled) onlyLabels: - stat:awaiting response # Comment to post when marking as stale. Set to `false` to disable From e0b43845d711e9dc520b9b6716ff89c3b4cd631f Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Tue, 16 Jun 2020 03:51:57 +0000 Subject: [PATCH 0258/1390] Sign compare warning fixes batch 2 --- .../core/grappler/costs/graph_memory.cc | 5 +- .../core/grappler/costs/graph_properties.cc | 59 +++++++++++-------- .../grappler/costs/op_level_cost_estimator.cc | 8 +-- .../core/grappler/costs/virtual_scheduler.cc | 14 +++-- .../optimizers/common_subgraph_elimination.cc | 2 +- .../grappler/optimizers/debug_stripper.cc | 2 +- .../grappler/optimizers/function_optimizer.cc | 10 ++-- .../core/grappler/optimizers/model_pruner.cc | 3 +- .../optimizers/pin_to_host_optimizer.cc | 3 +- .../grappler/optimizers/shape_optimizer.cc | 3 +- tensorflow/core/grappler/utils.cc | 2 +- tensorflow/core/grappler/utils/graph_view.cc | 37 +++++++----- .../core/grappler/utils/graph_view_internal.h | 34 +++++++---- .../core/grappler/utils/topological_sort.cc | 4 +- .../sql/sqlite_query_connection.cc | 2 +- tensorflow/python/grappler/model_analyzer.cc | 4 +- 16 files changed, 113 insertions(+), 79 deletions(-) diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc index 020e8cf1d1f..768a025b0e6 100644 --- a/tensorflow/core/grappler/costs/graph_memory.cc +++ b/tensorflow/core/grappler/costs/graph_memory.cc @@ -255,7 +255,8 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) { std::unordered_set live_at_peak; size_t current = 0; std::unordered_set currently_live; - for (int i = 0; i < events.size(); ++i) { + int events_size = events.size(); + for (int i = 0; i < events_size; ++i) { const auto& event = events[i]; if (event.allocated) { @@ -271,7 +272,7 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) { current -= event.tensor->memory_used; currently_live.erase(event.tensor); } - if (i + 1 == events.size() || + if (i + 1 == events_size || event.timestamp != events[i + 1].timestamp) { if 
(current > peak) { peak = current; diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index ee691e7a081..0c14607e9e2 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -363,7 +363,7 @@ void VerboseLogUnknownDimensionSources( std::vector ReplaceUnknownDimFromConstWithUnknownDim( InferenceContext* ic, const std::vector& shapes) { std::vector converted_shapes(shapes.size()); - for (int i = 0; i < shapes.size(); i++) { + for (int i = 0, shapes_size = shapes.size(); i < shapes_size; i++) { const auto& shape = shapes[i]; if (!ic->RankKnown(shape)) { converted_shapes[i] = shape; @@ -502,7 +502,7 @@ class TopoQueue { const std::vector& topo_order) const { absl::flat_hash_map map; map.reserve(topo_order.size()); - for (int i = 0; i < topo_order.size(); ++i) { + for (int i = 0, topo_order_size = topo_order.size(); i < topo_order_size; ++i) { map.emplace(topo_order[i], i); } return map; @@ -680,14 +680,16 @@ class SymbolicShapeRefiner { ", shape: ", ic->DebugString(ic->input(i)), ", tensor: "); Tensor t1; - if (input_tensor_protos.size() > i && + int input_tensor_protos_size = input_tensor_protos.size(); + if (input_tensor_protos_size > i && input_tensor_protos.at(i) != nullptr && t1.FromProto(*input_tensor_protos.at(i))) { absl::StrAppend(&output, t1.DebugString(), ", tensor_as_shape: "); } else { absl::StrAppend(&output, " null, tensor_as_shape: "); } - if (input_tensors_as_shapes_to_propagate.size() > i) { + int input_tensors_as_shapes_to_propagate_size = input_tensors_as_shapes_to_propagate.size(); + if (input_tensors_as_shapes_to_propagate_size > i) { absl::StrAppend( &output, StringifyShapeHandle(input_tensors_as_shapes_to_propagate.at(i)), @@ -702,14 +704,16 @@ class SymbolicShapeRefiner { ", shape: ", ic->DebugString(ic->output(i)), ", tensor: "); Tensor t2; - if (output_tensor_protos.size() > i && + int output_tensor_protos_size = output_tensor_protos.size(); + if (output_tensor_protos_size > i && output_tensor_protos.at(i) != nullptr && t2.FromProto(*output_tensor_protos.at(i))) { absl::StrAppend(&output, t2.DebugString(), ", tensor_as_shape: "); } else { absl::StrAppend(&output, " null, tensor_as_shape: "); } - if (output_tensors_as_shapes.size() > i) { + int output_tensors_as_shapes_size = output_tensors_as_shapes.size(); + if (output_tensors_as_shapes_size > i) { absl::StrAppend(&output, StringifyShapeHandle(output_tensors_as_shapes.at(i)), "\n"); @@ -779,7 +783,7 @@ class SymbolicShapeRefiner { MutableGraphView gv(&grappler_function_item.graph); // Forward shapes from function input nodes to argument nodes. 
- for (int i = 0; i < grappler_function_item.inputs().size(); ++i) { + for (int i = 0, iter_limit = grappler_function_item.inputs().size(); i < iter_limit; ++i) { auto& fun_input = grappler_function_item.input(i); NodeDef* fun_node = gv.GetNode(fun_input.node_name); const TensorId input_tensor = ParseTensorName(function_node->input(i)); @@ -858,13 +862,13 @@ class SymbolicShapeRefiner { if (IsConstant(*input_node)) { TF_CHECK_OK( ReplaceInputWithConst(*input_node, i, &grappler_function_item)); - } else if (ctx->input_tensor_protos.size() > i && + } else if (static_cast(ctx->input_tensor_protos.size()) > i && ctx->input_tensor_protos[i] != nullptr) { NodeDef const_input_node = MakeConstNodeDefFromTensorProto( ic, *ctx->input_tensor_protos[i], ctx->input_types[i]); TF_CHECK_OK(ReplaceInputWithConst(const_input_node, i, &grappler_function_item)); - } else if (ic->input_tensors_as_shapes().size() > i && + } else if (static_cast(ic->input_tensors_as_shapes().size()) > i && IsShapeFullyDefinedIntegerVectorOrScalar( ic, ic->input(i), ic->input_tensors_as_shapes()[i], ctx->input_types[i])) { @@ -912,7 +916,8 @@ class SymbolicShapeRefiner { } auto output_properties = gp.GetOutputProperties(retnode->name()); - if (out_tensor.index() >= output_properties.size()) { + int output_properties_size = output_properties.size(); + if (out_tensor.index() >= output_properties_size) { return errors::InvalidArgument( out_tensor.ToString(), " has invalid position ", out_tensor.index(), " (output_properties.size() = ", output_properties.size(), ")."); @@ -975,12 +980,12 @@ class SymbolicShapeRefiner { // NodeContext: // output_tensor_protos to input_tensor_protos and input_tensors, and // output_tensors_as_shapes to input_tensors_as_shapes. - if (src_ctx->output_tensors_as_shapes.size() > src_output) { + if (static_cast(src_ctx->output_tensors_as_shapes.size()) > src_output) { ctx->input_tensors_as_shapes_to_propagate[dst_input] = src_ctx->output_tensors_as_shapes[src_output]; } - if (src_ctx->output_tensor_protos.size() > src_output) { + if (static_cast(src_ctx->output_tensor_protos.size()) > src_output) { const auto* tensor_proto = src_ctx->output_tensor_protos[src_output]; if (tensor_proto != nullptr) { ctx->input_tensor_protos[dst_input] = tensor_proto; @@ -1233,7 +1238,7 @@ class SymbolicShapeRefiner { if (st1.size() != st2.size()) { return false; } - for (int i = 0; i < st1.size(); ++i) { + for (int i = 0, st1_size = st1.size(); i < st1_size; ++i) { const ShapeAndType& s1 = st1[i]; const ShapeAndType& s2 = st2[i]; if (s1.dtype != s2.dtype) { @@ -1268,13 +1273,13 @@ class SymbolicShapeRefiner { return Status::OK(); } - if (grappler_function_item.inputs().size() > function_node->input_size()) { + if (static_cast(grappler_function_item.inputs().size()) > function_node->input_size()) { return errors::FailedPrecondition( "Function input size should be smaller than node input size."); } - for (int i = grappler_function_item.inputs().size(); - i < function_node->input_size(); ++i) { + for (int i = grappler_function_item.inputs().size(), iter_limit = function_node->input_size(); + i < iter_limit; ++i) { const string& input = function_node->input(i); if (!IsControlInput(input)) { return errors::FailedPrecondition( @@ -1357,18 +1362,20 @@ class SymbolicShapeRefiner { // Returns true if all the output tensors have known values. 
bool AllOutputValuesKnown(NodeContext* c) { InferenceContext* ic = c->inference_context.get(); - if (c->output_tensors_as_shapes.size() < ic->num_outputs() && - c->output_tensor_protos.size() < ic->num_outputs()) { + int c_output_tensors_as_shapes_size = c->output_tensors_as_shapes.size(); + int c_output_tensor_protos_size = c->output_tensor_protos.size(); + if (c_output_tensors_as_shapes_size < ic->num_outputs() && + c_output_tensor_protos_size < ic->num_outputs()) { return false; } else { // Checks if we can get output value via either output_tensor_proto or // output_tensors_as_shapes. for (int i = 0; i < ic->num_outputs(); i++) { - if (c->output_tensor_protos.size() > i && + if (c_output_tensor_protos_size > i && c->output_tensor_protos[i] != nullptr) { continue; } - if (c->output_tensors_as_shapes.size() > i && + if (c_output_tensors_as_shapes_size > i && ic->FullyDefined(c->output_tensors_as_shapes[i])) { bool no_unknown_dim_from_const = true; for (int32 j = 0; j < ic->Rank(c->output_tensors_as_shapes[i]); ++j) { @@ -1539,7 +1546,7 @@ class SymbolicShapeRefiner { &resource_mgr_, &outputs)); c->output_tensors_as_shapes.resize(outputs.size()); c->output_tensor_protos.resize(outputs.size(), nullptr); - for (int k = 0; k < outputs.size(); k++) { + for (int k = 0, outputs_size = outputs.size(); k < outputs_size; k++) { const auto& t = outputs[k]; // Override output shape. ShapeHandle output_shape; @@ -2297,7 +2304,7 @@ Status GraphProperties::UpdateEnqueue( // TODO(bsteiner): handle EnqueueMany as well. std::vector shapes_and_types; - for (int i = 1; i < ctx->input_types.size(); ++i) { + for (int i = 1, iter_limit = ctx->input_types.size(); i < iter_limit; ++i) { GraphView::InputPort inp(enqueue_node, i); GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp); InferenceContext* in = shape_refiner->GetContext(fanin.node); @@ -2490,10 +2497,10 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds, const TensorProto& raw_val = fanin.node->attr().at("value").tensor(); *input_properties[i].mutable_value() = raw_val; - } else if (ctx->input_tensor_protos.size() > i && + } else if (static_cast(ctx->input_tensor_protos.size()) > i && ctx->input_tensor_protos[i] != nullptr) { *input_properties[i].mutable_value() = *ctx->input_tensor_protos[i]; - } else if (ic->input_tensors_as_shapes().size() > i && + } else if (static_cast(ic->input_tensors_as_shapes().size()) > i && IsShapeFullyDefinedIntegerVectorOrScalar( ic, ic->input(i), ic->input_tensors_as_shapes()[i], ctx->input_types[i])) { @@ -2525,11 +2532,11 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds, // TODO(rmlarsen): Eliminate this copy. 
const TensorProto& raw_val = node.attr().at("value").tensor(); *output_properties[i].mutable_value() = raw_val; - } else if (ctx->output_tensor_protos.size() > i && + } else if (static_cast(ctx->output_tensor_protos.size()) > i && ctx->output_tensor_protos[i] != nullptr) { *output_properties[i].mutable_value() = *ctx->output_tensor_protos[i]; - } else if (converted_output_tensors_as_shapes.size() > i && + } else if (static_cast(converted_output_tensors_as_shapes.size()) > i && IsShapeFullyDefinedIntegerVectorOrScalar( ic, ic->output(i), converted_output_tensors_as_shapes[i], diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index b8b62cbd6e5..a62359025be 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -1470,8 +1470,8 @@ Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const { (a_input.shape().dim_size() < matrix_rank) || (b_input.shape().dim_size() < matrix_rank); - if (a_input_str.size() != a_input_shape.dim_size() || - b_input_str.size() != b_input_shape.dim_size()) { + if (a_input_str.size() != static_cast(a_input_shape.dim_size()) || + b_input_str.size() != static_cast(b_input_shape.dim_size())) { VLOG(1) << "Missing accurate estimator for op: " << op_info.op() << ", equation subscripts don't match tensor rank."; return PredictCostOfAnUnknownOp(op_context); @@ -1513,7 +1513,7 @@ Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const { n_dim.set_size(1); k_dim.set_size(1); - for (int i_idx = 0; i_idx < a_input_str.size(); ++i_idx) { + for (int i_idx = 0, a_input_str_size = a_input_str.size(); i_idx < a_input_str_size; ++i_idx) { if (b_input_str.find(a_input_str[i_idx]) == std::string::npos) { if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) { VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); @@ -1533,7 +1533,7 @@ Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const { *(a_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); *(b_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); } - for (int i_idx = 0; i_idx < b_input_str.size(); ++i_idx) { + for (int i_idx = 0, b_input_str_size = b_input_str.size(); i_idx < b_input_str_size; ++i_idx) { if (a_input_str.find(b_input_str[i_idx]) == std::string::npos) { if (rhs_str.find(b_input_str[i_idx]) == std::string::npos) { VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index 5339b00627e..b20fad8b41c 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -522,8 +522,8 @@ Status SchedulerState::Init(const GrapplerItem* item, if (IsPersistent(*curr_node)) { auto& device_state = device_[curr_node_device]; - for (int port_num = 0; - port_num < curr_node_state.output_properties.size(); ++port_num) { + for (int port_num = 0, port_num_iter_limit = curr_node_state.output_properties.size(); + port_num < port_num_iter_limit; ++port_num) { device_state.persistent_nodes.insert( std::make_pair(curr_node, port_num)); } @@ -795,7 +795,8 @@ void SchedulerState::GetOutputNodes(const NodeDef* node, // Execute a node as soon as all its inputs are ready. Merge nodes are // special since they run as soon as one of their inputs becomes // available. 
- if (output_state.num_inputs_ready == output_state.inputs.size() || + int output_state_inputs_size = output_state.inputs.size(); + if (output_state.num_inputs_ready == output_state_inputs_size || IsMerge(*output_node)) { // This output node is now ready. output_state.time_ready = curr_time; @@ -900,8 +901,9 @@ std::vector SchedulerState::MarkNodeExecuted( auto port = input_port.second; auto& input_state = node_map_[input]; input_state.num_outputs_executed[port]++; - if (input_state.num_outputs_executed[port] == - input_state.outputs[port].size() && + int input_state_outputs_size_ = input_state.outputs[port].size(); + if (input_state.num_outputs_executed[port] == input_state_outputs_size_ + && !IsPersistent(*input)) { // All the outputs are executed; no reference to this output port of // input node. @@ -1119,7 +1121,7 @@ void SchedulerState::GenerateRunMetadata(RunMetadata* metadata) { const NodeState& nodestate = node_map_.at(node_def); NodeExecStats* node_stats = device_stepstats->add_node_stats(); uint64 total_output_size = 0; - for (int slot = 0; slot < nodestate.output_properties.size(); slot++) { + for (int slot = 0, slot_iter_limit = nodestate.output_properties.size(); slot < slot_iter_limit; slot++) { const auto& properties = nodestate.output_properties[slot]; NodeOutput* no = node_stats->add_output(); no->set_slot(slot); diff --git a/tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc b/tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc index af323e913a7..4f385797f20 100644 --- a/tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc +++ b/tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc @@ -73,7 +73,7 @@ class UniqueNodes { if (it == memoized_signatures_.end()) return; std::vector& candidates = rep_[it->second]; - for (int i = 0; i < candidates.size(); ++i) { + for (int i = 0, candidates_size = candidates.size(); i < candidates_size; ++i) { if (candidates[i] == node) { std::swap(candidates[i], candidates[candidates.size() - 1]); candidates.resize(candidates.size() - 1); diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc index d4b3bf395c3..de62e8fe6b9 100644 --- a/tensorflow/core/grappler/optimizers/debug_stripper.cc +++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc @@ -63,7 +63,7 @@ Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item, node.mutable_attr()->swap(new_attr); // As Identity op only takes one input, mark redundant inputs as control // input. 
- for (size_t i = 1; i < node.input_size(); ++i) { + for (int i = 1, node_input_size = node.input_size(); i < node_input_size; ++i) { if (!IsControlInput(node.input(i))) { *node.mutable_input(i) = AsControlDependency(NodeName(node.input(i))); } diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index ed3af955c13..5c703b18a6d 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -438,8 +438,8 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func, int num_outputs = func.signature().output_arg_size(); const absl::flat_hash_set active_outputs = GetActiveOutputs(func_node, ctx, /*size_hind*/ num_outputs); - - return active_outputs.size() != num_outputs; + int active_outputs_size = active_outputs.size(); + return active_outputs_size != num_outputs; } // Return pruned FunctionDefLibrary with functions that are reachable from @@ -563,7 +563,8 @@ void RemoveUnusedOutputsTypes(const FunctionSpecialization& specialization, if (tout == nullptr || !tout->has_list()) return; // Nothing to do if all outputs are active. - if (specialization.active_outputs.size() == tout->list().type_size()) return; + int specialization_active_outputs_size = specialization.active_outputs.size(); + if (specialization_active_outputs_size == tout->list().type_size()) return; // Clear input types for the specialized node. auto* attr = specialized_func_node->mutable_attr(); @@ -1142,7 +1143,8 @@ void AddFrameForwardingControlEdge(const std::vector& info, Node* caller, Graph* g) { // All nodes added to the graph by v2 control flow lowering and function // inlining are guaranteed to have control edges to nested function calls. - if (caller->id() >= info.size()) return; + int info_size = info.size(); + if (caller->id() >= info_size ) return; // Check if a lowered node is executing inside a while loop. 
const Node* frame = info[caller->id()].frame; diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc index 20db4360f73..634ef35ab21 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner.cc @@ -401,9 +401,10 @@ Status SplitIdentityNInputs(GraphDef* graph, } const int num_non_control_inputs = NumNonControlInputs(*node); + int terminal_second_size = terminal.second.size(); if (node->attr().count("T") == 0 || node->attr().at("T").list().type_size() != num_non_control_inputs || - terminal.second.size() >= num_non_control_inputs) { + terminal_second_size >= num_non_control_inputs) { continue; } diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc index ec16de1294b..35d0c5b0e40 100644 --- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc @@ -107,7 +107,8 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph, /*include_tensor_values=*/false)); } const auto& output_properties = properties->GetOutputProperties(node.name()); - if (port_id >= output_properties.size()) { + int output_properties_size = output_properties.size(); + if (port_id >= output_properties_size) { LOG(WARNING) << "port_id=" << port_id << " but output_properties.size()=" << output_properties.size() << "\n" diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc index 69de1cde4ca..656c1a1db1c 100644 --- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc @@ -99,7 +99,8 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } const auto& prop = properties.GetOutputProperties(reduce_indices.node->name()); - if (prop.size() <= reduce_indices.port_id) { + int prop_size = prop.size(); + if (prop_size <= reduce_indices.port_id) { continue; } const TensorShapeProto& reduction_indices_shape = diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc index cd6b4855583..240a52d1c6b 100644 --- a/tensorflow/core/grappler/utils.cc +++ b/tensorflow/core/grappler/utils.cc @@ -357,7 +357,7 @@ void PermuteNodesInPlace(GraphDef* graph, std::vector* permutation, } permutation->swap(inv_perm); } - for (std::size_t n = 0; n + 1 < permutation->size(); ++n) { + for (int n = 0, permutation_size = permutation->size(); n + 1 < permutation_size; ++n) { while (n != (*permutation)[n]) { std::size_t r = (*permutation)[n]; graph->mutable_node()->SwapElements(n, r); diff --git a/tensorflow/core/grappler/utils/graph_view.cc b/tensorflow/core/grappler/utils/graph_view.cc index 5a9a1cd2abb..c19e600e869 100644 --- a/tensorflow/core/grappler/utils/graph_view.cc +++ b/tensorflow/core/grappler/utils/graph_view.cc @@ -63,7 +63,7 @@ bool NodeView::HasFanout(const FaninView& fanout) const { return false; } else if (fanout.index() == Graph::kControlSlot) { return view->fanins_set_.contains({this->node(), Graph::kControlSlot}); - } else if (fanout.index() >= view->regular_fanins_.size()) { + } else if (fanout.index() >= static_cast(view->regular_fanins_.size())) { return false; } return view->regular_fanins_[fanout.index()].node_index_ == node_index_; @@ -152,7 +152,8 @@ Status GraphView::CheckAndAddFaninsInternal(NodeView* node_view) { Graph::kControlSlot); has_observed_control = true; } else { - 
if (fanin_node_view.regular_fanouts_by_port_.size() < + int fanin_node_view_regular_fanouts_by_port_size = fanin_node_view.regular_fanouts_by_port_.size(); + if (fanin_node_view_regular_fanouts_by_port_size < fanin_id.index() + 1) { fanin_node_view.regular_fanouts_by_port_.resize(fanin_id.index() + 1); } @@ -197,7 +198,7 @@ bool MutableNodeView::HasFanout(const MutableFaninView& fanout) const { return false; } else if (fanout.index() == Graph::kControlSlot) { return view->fanins_count_.contains({this->node(), Graph::kControlSlot}); - } else if (fanout.index() >= view->regular_fanins_.size()) { + } else if (fanout.index() >= static_cast(view->regular_fanins_.size())) { return false; } return view->regular_fanins_[fanout.index()].node_index_ == node_index_; @@ -279,7 +280,8 @@ void Mutation::AddMutation( void Mutation::RemoveNode(MutableNodeView* node) { auto& update_index = node->update_index_; if (update_index != internal::kMissingIndex) { - if (update_index < updated_nodes_.size() - 1) { + int updated_nodes_size = updated_nodes_.size(); + if (update_index < updated_nodes_size - 1) { graph_view_->nodes_[updated_nodes_.back().node_index].update_index_ = update_index; std::swap(updated_nodes_[update_index], updated_nodes_.back()); @@ -574,7 +576,8 @@ void MutableGraphView::AddFaninsInternal( --last_pos; } } else { - if (fanin_node_view.regular_fanouts_by_port_.size() < + int fanin_node_view_regular_fanouts_by_port_size = fanin_node_view.regular_fanouts_by_port_.size(); + if (fanin_node_view_regular_fanouts_by_port_size < fanin_id.index() + 1) { fanin_node_view.regular_fanouts_by_port_.resize(fanin_id.index() + 1); } @@ -852,8 +855,8 @@ template void MutableGraphView::ReplaceNodeFanouts(MutableNodeView* node, T* fanouts) { node->num_regular_fanouts_ = fanouts->num_regular_fanouts_; node->regular_fanouts_by_port_ = std::move(fanouts->regular_fanouts_by_port_); - for (int i = 0; i < node->regular_fanouts_by_port_.size(); ++i) { - for (int j = 0; j < node->regular_fanouts_by_port_[i].size(); ++j) { + for (int i = 0, i_max = node->regular_fanouts_by_port_.size(); i < i_max; ++i) { + for (int j = 0, j_max = node->regular_fanouts_by_port_[i].size(); j < j_max; ++j) { auto& fanout = node->regular_fanouts_by_port_[i][j]; auto* fanout_node_view = fanout.node_view(); auto& fanout_fanin = fanout_node_view->regular_fanins_[fanout.index()]; @@ -868,7 +871,7 @@ void MutableGraphView::ReplaceNodeFanouts(MutableNodeView* node, T* fanouts) { } } node->controlled_fanouts_ = std::move(fanouts->controlled_fanouts_); - for (int i = 0; i < node->controlled_fanouts_.size(); ++i) { + for (int i = 0, i_max = node->controlled_fanouts_.size(); i < i_max; ++i) { auto& fanout = node->controlled_fanouts_[i]; auto* fanout_node_view = fanout.node_view(); auto& fanout_fanin = @@ -1017,7 +1020,8 @@ inline void MutableGraphView::RemoveRegularFaninFanoutInternal( {&graph_->node(fanin.node_index_), fanin.index()}); auto* fanin_node_view = fanin.node_view(); auto& fanouts = fanin_node_view->regular_fanouts_by_port_[fanin.index()]; - if (fanin.fanout_index_ < fanouts.size() - 1) { + int fanouts_size = fanouts.size(); + if (fanin.fanout_index_ < fanouts_size - 1) { // Swap fanout with last fanout in vector, and update it's associated fanin // index. 
MutableFaninView& last_fanout = fanouts.back(); @@ -1043,7 +1047,8 @@ inline void MutableGraphView::RemoveRegularFaninFanoutInternal( break; } } - if (last_fanout_index < fanin_node_view->regular_fanouts_by_port_.size()) { + int fanin_node_view_regular_fanouts_by_port_size = fanin_node_view->regular_fanouts_by_port_.size(); + if (last_fanout_index < fanin_node_view_regular_fanouts_by_port_size) { fanin_node_view->regular_fanouts_by_port_.resize(last_fanout_index); } } @@ -1052,7 +1057,8 @@ inline void MutableGraphView::AddRegularFaninInternal( MutableNodeView* node_view, const SafeTensorId& fanin_id) { MutableNodeView* fanin_node_view = GetNode(fanin_id.node()); // Resize fanouts to include new output port index. - if (fanin_node_view->regular_fanouts_by_port_.size() < fanin_id.index() + 1) { + int fanin_node_view_regular_fanouts_by_port_size = fanin_node_view->regular_fanouts_by_port_.size(); + if (fanin_node_view_regular_fanouts_by_port_size < fanin_id.index() + 1) { fanin_node_view->regular_fanouts_by_port_.resize(fanin_id.index() + 1); } @@ -1078,7 +1084,8 @@ inline void MutableGraphView::UpdateRegularFaninInternal( MutableNodeView* fanin_node_view = GetNode(fanin_id.node()); // Resize fanouts to include new output port index. - if (fanin_node_view->regular_fanouts_by_port_.size() < fanin_id.index() + 1) { + int fanin_node_view_regular_fanouts_by_port_size = fanin_node_view->regular_fanouts_by_port_.size(); + if (fanin_node_view_regular_fanouts_by_port_size < fanin_id.index() + 1) { fanin_node_view->regular_fanouts_by_port_.resize(fanin_id.index() + 1); } @@ -1110,8 +1117,9 @@ inline void MutableGraphView::RemoveControllingFaninFanoutInternal( // controlled fanout in controlling fanin with controlled fanout to be // removed. auto* control_to_remove_view = control_to_remove.node_view(); + int control_to_remove_view_controlled_fanouts_size = control_to_remove_view->controlled_fanouts_.size(); if (control_to_remove.fanout_index_ < - control_to_remove_view->controlled_fanouts_.size() - 1) { + control_to_remove_view_controlled_fanouts_size - 1) { auto& control_to_remove_view_last_control = control_to_remove_view->controlled_fanouts_.back(); control_to_remove_view_last_control.node_view() @@ -1137,7 +1145,8 @@ inline void MutableGraphView::RemoveControllingFaninInternal( RemoveControllingFaninFanoutInternal(node_view, control_index); // Swap last controlling fanin in node with controlling fanin to be removed. - if (control_index < node_view->controlling_fanins_.size() - 1) { + int node_view_controlling_fanins_size = node_view->controlling_fanins_.size(); + if (control_index < node_view_controlling_fanins_size - 1) { auto& last_control = node_view->controlling_fanins_.back(); auto* last_control_view = last_control.node_view(); last_control_view->controlled_fanouts_[last_control.fanout_index_] diff --git a/tensorflow/core/grappler/utils/graph_view_internal.h b/tensorflow/core/grappler/utils/graph_view_internal.h index d07f9f71640..9b142444d8a 100644 --- a/tensorflow/core/grappler/utils/graph_view_internal.h +++ b/tensorflow/core/grappler/utils/graph_view_internal.h @@ -172,7 +172,8 @@ class NodeViewInternal { // Returns a regular fanin based on input index. If no such fanin exist, a // missing fanin is returned, with no NodeView set and an index of -2. 
const FanoutViewT& GetRegularFanin(int i) const { - if (i < 0 || i >= regular_fanins_.size()) { + int regular_fanins_size = regular_fanins_.size(); + if (i < 0 || i >= regular_fanins_size) { return GetMissingFanin(); } return regular_fanins_[i]; @@ -191,7 +192,8 @@ class NodeViewInternal { // Returns a regular fanout(s) based on output index. If no such output index // exists, no fanouts will be returned. const std::vector& GetRegularFanout(int i) const { - if (i < 0 || i >= regular_fanouts_by_port_.size()) { + int regular_fanouts_by_port_size = regular_fanouts_by_port_.size(); + if (i < 0 || i >= regular_fanouts_by_port_size) { return GetMissingFanout(); } return regular_fanouts_by_port_[i]; @@ -289,14 +291,16 @@ class GraphViewInternal { // Finds node by index in the graph. If no such node exists in the graph, a // `nullptr` is returned. const NodeViewT* GetNode(int node_index) const { - if (node_index < 0 || node_index >= nodes_.size()) { + int nodes_size = nodes_.size(); + if (node_index < 0 || node_index >= nodes_size) { return nullptr; } return &nodes_[node_index]; } NodeViewT* GetNode(int node_index) { - if (node_index < 0 || node_index >= nodes_.size()) { + int nodes_size = nodes_.size(); + if (node_index < 0 || node_index >= nodes_size) { return nullptr; } return &nodes_[node_index]; @@ -444,13 +448,14 @@ inline bool UpdateDevice(NodeViewDiff* diff, template inline bool AddOrUpdateAtIndex(std::vector* v, int i, const U& value, const T& default_value) { - if (i > v->size()) { + int v_size = v->size(); + if (i > v_size) { // Resize to include `value`, filling the newly introduced gap with // `default_value` for later checks of validity (gaps in vector). v->reserve(i + 1); v->resize(i, default_value); v->push_back({value}); - } else if (i == v->size()) { + } else if (i == v_size) { // Vector is large enough, simply append `value` to the end. v->push_back({value}); } else { @@ -494,7 +499,8 @@ inline bool AddOrUpdateRegularFanin(NodeViewDiff* diff, int index, // index from beginning of regular fanins. const int relative_removal_index = num_regular_fanins - index - 1; // Check if at relative index fanin was already marked for removal. - if (relative_removal_index < diff->regular_inputs_to_remove.size() && + int diff_regular_inputs_to_remove_size = diff->regular_inputs_to_remove.size(); + if (relative_removal_index < diff_regular_inputs_to_remove_size && diff->regular_inputs_to_remove[relative_removal_index]) { // Unmark fanin for removal. diff->regular_inputs_to_remove[relative_removal_index] = false; @@ -543,7 +549,8 @@ inline bool RemoveRegularFanin(NodeViewDiff* diff, int index) { } else { // Relative index from end of regular fanins. const int relative_add_index = index - num_regular_fanins; - if (relative_add_index >= diff->regular_inputs_to_add.size() || + int diff_regular_inputs_to_add_size = diff->regular_inputs_to_add.size(); + if (relative_add_index >= diff_regular_inputs_to_add_size || IsEmptyTensorId(diff->regular_inputs_to_add[relative_add_index])) { // At relative index, appended regular fanin was already marked for // removal. 
@@ -671,7 +678,8 @@ inline bool IsWellFormed( const absl::flat_hash_map& updated_node_names) { ResizeByTrimmingEndForValue(&diff->regular_inputs_to_remove, false); ResizeByTrimmingEndForValue(&diff->regular_inputs_to_add, EmptyTensorId()); - if (diff->regular_inputs_to_add.size() != diff->num_regular_inputs_to_add) { + int diff_regular_inputs_to_add_size = diff->regular_inputs_to_add.size(); + if (diff_regular_inputs_to_add_size != diff->num_regular_inputs_to_add) { // Missing regular fanins in between appended fanins. return false; } else if (diff->num_regular_inputs_to_add > 0 && @@ -679,7 +687,7 @@ inline bool IsWellFormed( // Appending new fanins while removing existing fanins, resulting in missing // regular fanins in between. return false; - } else if (diff->regular_inputs_to_remove.size() != + } else if ( static_cast(diff->regular_inputs_to_remove.size()) != diff->num_regular_inputs_to_remove) { // Regular fanins exist in between removed fanins. return false; @@ -830,7 +838,8 @@ inline void AddOrUpdateRegularFanin(NewNode* new_node, int index, // remove existing fanins and updated/added fanins via AddOrUpdateRegularFanins. template inline void RemoveRegularFanin(NewNode* new_node, int index) { - if (index < 0 || index >= new_node->regular_fanins.size() || + int new_node_regular_fanins_size = new_node->regular_fanins.size(); + if (index < 0 || index >= new_node_regular_fanins_size || IsEmptyTensorId(new_node->regular_fanins[index])) { return; } @@ -874,7 +883,8 @@ inline bool IsWellFormed( NewNode* new_node, const absl::flat_hash_map& updated_node_names) { ResizeByTrimmingEndForValue(&new_node->regular_fanins, EmptyTensorId()); - if (new_node->regular_fanins.size() != new_node->num_regular_fanins) { + int new_node_regular_fanins_size = new_node->regular_fanins.size(); + if (new_node_regular_fanins_size != new_node->num_regular_fanins) { return false; } diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index e24a457593a..5ed292d1983 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -81,7 +81,7 @@ Status ComputeTopologicalOrder( int ready_node = (*ready_nodes)[front]; for (int fanout : graph_view.GetFanout(ready_node)) { ++num_ready_inputs[fanout]; - if (num_ready_inputs[fanout] == graph_view.GetFanin(fanout).size()) { + if (num_ready_inputs[fanout] == static_cast(graph_view.GetFanin(fanout).size())) { ready_nodes->push_back(fanout); ++back; } @@ -95,7 +95,7 @@ Status ComputeTopologicalOrder( "at node = " << graph.node(back).DebugString(); for (int i = 0; i < graph_view.num_nodes(); ++i) { - if (num_ready_inputs[i] != graph_view.GetFanin(i).size()) { + if (num_ready_inputs[i] != static_cast(graph_view.GetFanin(i).size())) { VLOG(1) << "Node not ready: " << graph.node(i).DebugString(); } } diff --git a/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc index e86cbc7684c..ada94be15bf 100644 --- a/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc +++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc @@ -68,7 +68,7 @@ Status SqliteQueryConnection::GetNext(IteratorContext* ctx, Status SqliteQueryConnection::PrepareQuery() { TF_RETURN_IF_ERROR(db_->Prepare(query_, &stmt_)); - int column_count = stmt_.ColumnCount(); + size_t column_count = stmt_.ColumnCount(); if (column_count != output_types_.size()) { stmt_ = 
SqliteStatement(); return errors::InvalidArgument(tensorflow::strings::Printf( diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc index 5a76cdd8fb2..250010c0fed 100644 --- a/tensorflow/python/grappler/model_analyzer.cc +++ b/tensorflow/python/grappler/model_analyzer.cc @@ -48,7 +48,7 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node, if (properties.HasOutputProperties(node->name())) { const std::vector& props = properties.GetOutputProperties(node->name()); - for (int i = 0; i < props.size(); ++i) { + for (int i = 0, props_size = props.size(); i < props_size; ++i) { const OpInfo::TensorProperties& prop = props[i]; os << "\t" << "output " << i << " (" << DataTypeString(prop.dtype()) @@ -88,7 +88,7 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node, } else if (properties.HasInputProperties(node->name())) { const std::vector& props = properties.GetInputProperties(node->name()); - for (int i = 0; i < props.size(); ++i) { + for (int i = 0, props_size = props.size(); i < props_size; ++i) { const OpInfo::TensorProperties& prop = props[i]; if (prop.has_value()) { os << "\t" From 6166444602c0ddab5e5e7ff129113341a99bd98c Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 15 Jun 2020 20:56:58 -0700 Subject: [PATCH 0259/1390] [tf.data] Add prefetch benchmark. This CL adds a benchmark to test the performance of the prefetch dataset transformation. PiperOrigin-RevId: 316605096 Change-Id: Iaf5d8f8c3afba6e51a53805afe5bc978916ff01e --- tensorflow/python/data/benchmarks/BUILD | 9 +++++ .../data/benchmarks/prefetch_benchmark.py | 40 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tensorflow/python/data/benchmarks/prefetch_benchmark.py diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD index 256f10dcefd..3f0faf5364a 100644 --- a/tensorflow/python/data/benchmarks/BUILD +++ b/tensorflow/python/data/benchmarks/BUILD @@ -85,6 +85,15 @@ tf_py_test( ], ) +tf_py_test( + name = "prefetch_benchmark", + srcs = ["prefetch_benchmark.py"], + deps = [ + ":benchmark_base", + "//tensorflow/python/data/ops:dataset_ops", + ], +) + tf_py_test( name = "range_benchmark", srcs = ["range_benchmark.py"], diff --git a/tensorflow/python/data/benchmarks/prefetch_benchmark.py b/tensorflow/python/data/benchmarks/prefetch_benchmark.py new file mode 100644 index 00000000000..31177508397 --- /dev/null +++ b/tensorflow/python/data/benchmarks/prefetch_benchmark.py @@ -0,0 +1,40 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Benchmarks for `tf.data.Dataset.prefetch()`.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.benchmarks import benchmark_base +from tensorflow.python.data.ops import dataset_ops + + +class PrefetchBenchmark(benchmark_base.DatasetBenchmarkBase): + """Benchmarks for `tf.data.Dataset.prefetch()`.""" + + def benchmark_prefetch(self): + num_elements = 1000000 + for prefetch_buffer in [1, 5, 10, 20, 100]: + dataset = dataset_ops.Dataset.range(num_elements) + dataset = dataset.prefetch(prefetch_buffer) + + self.run_and_report_benchmark( + dataset, + num_elements=num_elements, + name="prefetch_{}".format(prefetch_buffer)) + + +if __name__ == "__main__": + benchmark_base.test.main() From 52e1dba6b14da82ddd30344526e13557cf33cc32 Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Tue, 16 Jun 2020 04:02:50 +0000 Subject: [PATCH 0260/1390] getting rid of pesky lingering commits --- .../mlir/lite/quantization/import_quant_stats_pass.cc | 4 ++-- .../mlir/lite/quantization/quantization_config.cc | 8 ++++---- .../mlir/lite/quantization/quantization_driver.cc | 4 ++-- .../mlir/lite/quantization/quantization_utils.cc | 10 +++++----- .../compiler/mlir/tensorflow/utils/dump_mlir_util.cc | 2 +- tensorflow/compiler/mlir/xla/ir/chlo_ops.cc | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 6 +++--- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index e00a088c38c..d924a3e82ac 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -76,7 +76,7 @@ class ImportQuantStatsPass // If the index is out of range, this method returns false. Otherwise it // returns true if the value is a float tensor. 
bool IsQuantizableResult(Operation *op, int index) { - if (index < 0 || index >= static_cast(op->getNumResults())) return false; + if (index < 0 || index >= op->getNumResults()) return false; Value res = op->getResult(index); return res.getType().isa() && res.getType().cast().getElementType().isa(); @@ -158,7 +158,7 @@ void ImportQuantStatsPass::ImportAsStatsOps(OpBuilder b, Operation *op, InsertStatsOpAtResult(b, op->getResult(index), layer_stats, axis_stats, axis); } else { - for (int i = 0, e = op->getNumResults(); i < e; ++i) { + for (int i = 0; i < op->getNumResults(); ++i) { if (IsQuantizableResult(op, i)) { InsertStatsOpAtResult(b, op->getResult(i), layer_stats, axis_stats, axis); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index cdff93502f2..6b897bd5608 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,9 +48,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (const std::string&node_min : node_mins_str.size()) { + for (int i = 0; i < node_mins_str.size(); i++) { double value; - if (!absl::SimpleAtod(node_min, &value)) { + if (!absl::SimpleAtod(node_mins_str[i], &value)) { return true; } node_mins.push_back(value); @@ -60,9 +60,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (const std::string&node_max : node_maxs_str.size()) { + for (int i = 0; i < node_maxs_str.size(); i++) { double value; - if (!absl::SimpleAtod(node_max, &value)) { + if (!absl::SimpleAtod(node_maxs_str[i], &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; return true; } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index a9f4eb78431..2964a3e79f8 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -294,7 +294,7 @@ class QuantizationDriver { return; if (current_op == op) llvm::errs() << "===>>>"; llvm::errs() << op->getName() << " : ("; - for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + for (auto i = 0; i < op->getNumOperands(); ++i) { if (auto params = GetOperandQuantState(op, i).params) params.print(llvm::errs()); else @@ -303,7 +303,7 @@ class QuantizationDriver { llvm::errs() << ","; } llvm::errs() << ") -> ("; - for (int i = 0, e = op->getNumResults(); i < e; ++i) { + for (auto i = 0; i < op->getNumResults(); ++i) { if (auto params = GetResultQuantState(op, i).params) params.print(llvm::errs()); else diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 57b24eb8772..3d50f280d0f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -54,7 +54,7 @@ static Type GetQuantizedType(Builder builder, Type input_type, } else if (min.size() == max.size()) { auto shape = input_type.dyn_cast(); if (!shape || shape.getRank() <= quant_dim || - static_cast(min.size()) != shape.getDimSize(quant_dim)) { + min.size() != 
shape.getDimSize(quant_dim)) { return {}; } // TODO(b/141508873): the quantization dim is set to the last dimension. @@ -75,7 +75,7 @@ TypeAttr RescaleQuantizedType(Type input, Attribute factor) { if (auto qtype = ele_type.dyn_cast()) { ArrayRef scales = qtype.getScales(); // Broadcasting hasn't been implemented yet. - if (static_cast(scales.size()) != factor_values.getNumElements()) return {}; + if (scales.size() != factor_values.getNumElements()) return {}; SmallVector new_scales; new_scales.reserve(scales.size()); auto scales_iter = scales.begin(); @@ -269,7 +269,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight(ElementsAttr attr, int quant_dim, bool narrow_range) { Builder builder(attr.getContext()); auto shape = attr.getType().cast().getShape(); - if (static_cast(shape.size()) <= quant_dim) return {}; + if (shape.size() <= quant_dim) return {}; // `symmetric` can only be used when it is `signed` and `narrow_range`. if (symmetric && (!is_signed || !narrow_range)) return {}; @@ -334,7 +334,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( const std::vector& op_types) { if (op_types.empty()) return {}; - size_t axis_size = 1; + int axis_size = 1; int32_t quant_dim = -1; Type expressed_type; // Requires all the op types are valid UniformQuantizedTypes or @@ -368,7 +368,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( scales[index_scale.index()] *= index_scale.value(); } } else if (auto type = op_type.dyn_cast()) { - for (int index = 0, e = axis_size; index != e; ++index) { + for (int index = 0; index != axis_size; ++index) { scales[index] *= type.getScale(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index febf2bc096d..797687ea658 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -41,7 +41,7 @@ std::string MakeUniqueFilename(string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. - for (int i = 0, e = name.size(); i < e; ++i) { + for (int i = 0; i < name.size(); ++i) { char ch = name[i]; if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' 
|| ch == '\\') { diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index 3408f3ed0cc..26db4549a2a 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -49,7 +49,7 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, if (shape_x.size() == shape_y.size()) { llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0, e = shape_x.size(); i < e; i++) { + for (int i = 0; i < shape_x.size(); i++) { auto x_val = shape_x[i]; auto y_val = shape_y[i]; if (x_val == -1 || y_val == -1) { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 7f313b56925..d20f1713eba 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -143,7 +143,7 @@ DenseIntElementsAttr BuildConvPaddingAttrs( int rank = padding_low.size(); SmallVector padding; - for (unsigned i = 0, e = rank; i < e; ++i) { + for (unsigned i = 0; i < rank; ++i) { padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]); padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]); } @@ -853,7 +853,7 @@ static Attribute foldConcatenateHelper(ConcatenateOp* op, auto shape = type.getShape(); size_t top_size = 1; - for (int i = 0, e = axis; i < e; i++) { + for (int i = 0; i < axis; i++) { top_size = top_size * shape[i]; } @@ -1118,7 +1118,7 @@ static LogicalResult Verify(MapOp op) { // increasing. auto values = op.dimensions().getValues(); auto dimensions = std::vector{values.begin(), values.end()}; - for (int i = 0, e = dimensions.size(); i < e; ++i) { + for (int i = 0; i < dimensions.size(); ++i) { if (dimensions[i] != i) return op.emitOpError() << "requires monotonically increasing dimension " "numbers, but got: " From 7ee9571a8e127b39a4a8a01016e40105b7613bbf Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 15 Jun 2020 21:21:15 -0700 Subject: [PATCH 0261/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/1a7f115dce22 PiperOrigin-RevId: 316608084 Change-Id: I6df721c782e850371267e6a2fba7eda96d3b1610 --- third_party/mlir/BUILD | 22 ++++++++++++++++++++++ third_party/mlir/test.BUILD | 1 + 2 files changed, 23 insertions(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 5f3f0a4b99b..06e0ed8d4b4 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -756,6 +756,25 @@ cc_library( ], ) +cc_library( + name = "ShapeToSCF", + srcs = glob([ + "lib/Conversion/ShapeToSCF/*.cpp", + "lib/Conversion/ShapeToSCF/*.h", + ]) + ["lib/Conversion/PassDetail.h"], + hdrs = ["include/mlir/Conversion/ShapeToSCF/ShapeToSCF.h"], + includes = ["include"], + deps = [ + ":ConversionPassIncGen", + ":IR", + ":Pass", + ":SCFDialect", + ":Shape", + ":StandardOps", + ":Transforms", + ], +) + gentbl( name = "ShapeTransformsPassIncGen", strip_include_prefix = "include", @@ -2613,6 +2632,7 @@ cc_library( ":Parser", ":Pass", ":SCFTransforms", + ":ShapeToSCF", ":ShapeToStandard", ":ShapeTransforms", ":StandardOpsTransforms", @@ -2713,6 +2733,7 @@ cc_library( ":SPIRVPassIncGen", ":SPIRVToLLVM", ":Shape", + ":ShapeToSCF", ":ShapeToStandard", ":ShapeTransforms", ":ShapeTransformsPassIncGen", @@ -3282,6 +3303,7 @@ cc_library( ":LinalgTransforms", ":Pass", ":StandardOps", + ":StandardOpsTransforms", ":Support", ":Transforms", ":VectorToLLVM", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 23287ce28d6..14c2ba7778e 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -166,6 +166,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", "@llvm-project//mlir:TargetROCDLIR", From e74010b4e803f5f47f6c013a2932172e4beae72a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 22:24:35 -0700 Subject: [PATCH 0262/1390] Clarify the documentation for PermuteDimensions PiperOrigin-RevId: 316616603 Change-Id: Iccfbd986276688bdc25b6757cdee6806f0b587d6 --- tensorflow/compiler/xla/shape_util.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 52cbb8f95ac..ab46e49b181 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -968,17 +968,18 @@ Status ForEachMutableSubshapeHelper( // `shape`'s list of dimensions is isomorphic to the identity I. // // Let `shape`'s layout be L. A layout is a permutation which maps a - // minor-to-major physical layout to the order of a shape's logical dims. - // Therefore inverse of a layout maps from logical to physical dims, and so - // the physical layout of I is simply L'.I = L', where L' is the inverse of L. + // minor-to-major physical dimension ordering to a shape's logical dimension + // ordering. Therefore the inverse of a layout maps from logical to physical + // dims, and so the physical ordering of I is simply L'.I = L', where L' is + // the inverse of L. // // Let the argument `permutation` be P. This is a permutation over `shape`'s // dimensions, so our return value will be a shape with dims P.I = P. 
Our - // goal is to construct a layout permutation L* that we can apply to P such - // that the physical dimension ordering of the returned shape is the same - // as that of the original shape, namely L'. + // goal is to construct a layout permutation L* for this shape. The physical + // dimension ordering of this returned shape must be the same as that of the + // original shape, namely L'. // - // Our returned shape has dims P and layout L*, so its in-memory layout is + // Our returned shape has dims P and layout L*, so its in-memory ordering is // L*'.P. Setting this equal to L' and solving for L*, we get: // // L*'.P = L' => From 42a734170dae2942fcf553ccf5480fd48840795a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 15 Jun 2020 23:00:10 -0700 Subject: [PATCH 0263/1390] tf.numpy: Change a bunch of ops to handle unknown shapes. Fix logic in sort_ops to handle 1D values. PiperOrigin-RevId: 316620637 Change-Id: Iedc2ba8aad7673bbe210661bb741bf0660f047aa --- .../python/ops/numpy_ops/np_array_ops.py | 36 ++++++-- tensorflow/python/ops/numpy_ops/np_arrays.py | 51 ++++++++--- .../python/ops/numpy_ops/np_math_ops.py | 90 +++++++++++++------ tensorflow/python/ops/numpy_ops/np_utils.py | 32 ++++--- tensorflow/python/ops/sort_ops.py | 16 +--- 5 files changed, 153 insertions(+), 72 deletions(-) diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py index fbf67a46e31..e97bb61613b 100644 --- a/tensorflow/python/ops/numpy_ops/np_array_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py @@ -223,9 +223,10 @@ def full(shape, fill_value, dtype=None): # pylint: disable=redefined-outer-name Raises: ValueError: if `fill_value` can not be broadcast to shape `shape`. """ + if not isinstance(shape, np_arrays.ndarray): + shape = asarray(np_arrays.convert_to_tensor(shape, dtype_hint=np.int32)) + shape = atleast_1d(shape).data fill_value = asarray(fill_value, dtype=dtype) - if np_utils.isscalar(shape): - shape = array_ops.reshape(shape, [1]) return np_arrays.tensor_to_ndarray( array_ops.broadcast_to(fill_value.data, shape)) @@ -808,16 +809,21 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None): # pylint: d @np_utils.np_doc(np.std) def std(a, axis=None, keepdims=None): # pylint: disable=missing-function-docstring return _reduce( - math_ops.reduce_std, a, axis=axis, dtype=None, keepdims=keepdims, + math_ops.reduce_std, + a, + axis=axis, + dtype=None, + keepdims=keepdims, promote_int=_TO_FLOAT) @np_utils.np_doc(np.ravel) def ravel(a): # pylint: disable=missing-docstring a = asarray(a) - if a.ndim == 1: - return a - return np_utils.tensor_to_ndarray(array_ops.reshape(a.data, [-1])) + out = np_utils.cond( + math_ops.equal(a.ndim, 1), lambda: a.data, + lambda: array_ops.reshape(a.data, [-1])) + return np_utils.tensor_to_ndarray(out) setattr(np_arrays.ndarray, 'ravel', ravel) @@ -846,7 +852,8 @@ def repeat(a, repeats, axis=None): # pylint: disable=missing-docstring a = asarray(a).data original_shape = a._shape_as_list() # pylint: disable=protected-access # Best effort recovery of the shape. 
- if original_shape is not None and None not in original_shape: + known_shape = original_shape is not None and None not in original_shape + if known_shape: if not original_shape: original_shape = (repeats,) else: @@ -865,7 +872,8 @@ def repeat(a, repeats, axis=None): # pylint: disable=missing-docstring repeats = asarray(repeats).data result = array_ops.repeat(a, repeats, axis) - result.set_shape(original_shape) + if known_shape: + result.set_shape(original_shape) return np_utils.tensor_to_ndarray(result) @@ -1287,7 +1295,13 @@ def broadcast_to(array, shape): # pylint: disable=redefined-outer-name @np_utils.np_doc(np.stack) -def stack(arrays, axis=0): +def stack(arrays, axis=0): # pylint: disable=missing-function-docstring + if isinstance(arrays, (np_arrays.ndarray, ops.Tensor)): + arrays = asarray(arrays) + if axis == 0: + return arrays + else: + return swapaxes(arrays, 0, axis) arrays = _promote_dtype(*arrays) # pylint: disable=protected-access unwrapped_arrays = [ a.data if isinstance(a, np_arrays.ndarray) else a for a in arrays @@ -1450,6 +1464,8 @@ def tri(N, M=None, k=0, dtype=None): # pylint: disable=invalid-name,missing-doc @np_utils.np_doc(np.tril) def tril(m, k=0): # pylint: disable=missing-docstring m = asarray(m).data + if m.shape.ndims is None: + raise ValueError('Argument to tril should have known rank') m_shape = m.shape.as_list() if len(m_shape) < 2: @@ -1470,6 +1486,8 @@ def tril(m, k=0): # pylint: disable=missing-docstring @np_utils.np_doc(np.triu) def triu(m, k=0): # pylint: disable=missing-docstring m = asarray(m).data + if m.shape.ndims is None: + raise ValueError('Argument to triu should have known rank') m_shape = m.shape.as_list() if len(m_shape) < 2: diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py index a7696ad31c2..e2f73100909 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays.py @@ -13,6 +13,9 @@ # limitations under the License. # ============================================================================== """ndarray class.""" + +# pylint: disable=g-direct-tensorflow-import + from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -151,13 +154,16 @@ def _slice_helper(tensor, slice_spec, var=None): def convert_to_tensor(value, dtype=None, dtype_hint=None): """Wrapper over `tf.convert_to_tensor`. - Args: - value: value to convert - dtype: (optional) the type we would like it to be converted to. - dtype_hint: (optional) soft preference for the type we would like it to - be converted to. `tf.convert_to_tensor` will attempt to convert value - to this type first, but will not fail if conversion is not possible - falling back to inferring the type instead. + Args: + value: value to convert + dtype: (optional) the type we would like it to be converted to. + dtype_hint: (optional) soft preference for the type we would like it to be + converted to. `tf.convert_to_tensor` will attempt to convert value to this + type first, but will not fail if conversion is not possible falling back + to inferring the type instead. + + Returns: + Value converted to tf.Tensor. """ # A safer version of `tf.convert_to_tensor` to work around b/149876037. # TODO(wangpeng): Remove this function once the bug is fixed. 
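The hunk above reworks the docstring of the `convert_to_tensor` wrapper: `dtype` is a hard requirement, while `dtype_hint` is only a soft preference that falls back to the inferred type when conversion is not possible. A minimal sketch of that contract using stock `tf.convert_to_tensor` (assumes a TF 2.x runtime; the example is illustrative and not part of the patch):

```python
import tensorflow as tf

# `dtype` is a hard requirement: conversion fails if it cannot be satisfied.
x = tf.convert_to_tensor([1, 2, 3], dtype=tf.float32)

# `dtype_hint` is tried first but does not fail; incompatible values simply
# fall back to their inferred type.
y = tf.convert_to_tensor([1, 2, 3], dtype_hint=tf.float32)       # hint honoured
z = tf.convert_to_tensor("not a number", dtype_hint=tf.float32)  # falls back to tf.string

print(x.dtype, y.dtype, z.dtype)  # float32, float32, string (expected)
```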
@@ -250,8 +256,12 @@ class ndarray(object): # pylint: disable=invalid-name @property def shape(self): - """Returns a tuple of array dimensions.""" - return self.data._shape_tuple() # pylint: disable=protected-access + """Returns a tuple or tf.Tensor of array dimensions.""" + shape = self.data.shape + if shape.is_fully_defined(): + return tuple(shape.as_list()) + else: + return array_ops.shape(self.data) @property def dtype(self): @@ -259,19 +269,30 @@ class ndarray(object): # pylint: disable=invalid-name @property def ndim(self): - return self.data.shape.ndims + ndims = self.data.shape.ndims + if ndims is None: + return array_ops.rank(self.data) + else: + return ndims @property def size(self): """Returns the number of elements in the array.""" - return np.prod(self.shape) + shape = self.shape + if isinstance(shape, ops.Tensor): + return array_ops.size(self.data) + else: + return np.prod(self.shape) @property def T(self): # pylint: disable=invalid-name return self.transpose() def __len__(self): - if self.shape: + shape = self.shape + if isinstance(shape, ops.Tensor): + raise TypeError('len() of symbolic tensor undefined') + elif shape: return self.shape[0] else: raise TypeError('len() of unsized object.') @@ -320,6 +341,8 @@ class ndarray(object): # pylint: disable=invalid-name return tensor_to_ndarray(result_t) def __iter__(self): + if not isinstance(self.data, ops.EagerTensor): + raise TypeError('Iteration over symbolic tensor is not allowed') for i in range(self.shape[0]): result_t = self.data[i] yield tensor_to_ndarray(result_t) @@ -356,6 +379,8 @@ class ndarray(object): # pylint: disable=invalid-name ValueError: If the array does not have size 1. """ # TODO(wangpeng): Handle graph mode + if not isinstance(self.data, ops.EagerTensor): + raise TypeError('Indexing using symbolic tensor is not allowed') return np.asscalar(self.data.numpy()) def tolist(self): @@ -384,5 +409,3 @@ def ndarray_to_tensor(arr, dtype=None, name=None, as_ref=False): ops.register_tensor_conversion_function(ndarray, ndarray_to_tensor) - - diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py index 02d37b3a3a4..b32f78bee5a 100644 --- a/tensorflow/python/ops/numpy_ops/np_math_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py @@ -50,9 +50,9 @@ def dot(a, b): # pylint: disable=missing-docstring math_ops.equal(array_ops.rank(b), 0)), lambda: a * b, lambda: np_utils.cond( # pylint: disable=g-long-lambda - math_ops.equal(array_ops.rank(b), 1), lambda: math_ops.tensordot( - a, b, axes=[[-1], [-1]]), lambda: math_ops.tensordot( - a, b, axes=[[-1], [-2]]))) + math_ops.equal(array_ops.rank(b), 1), + lambda: math_ops.tensordot(a, b, axes=[[-1], [-1]]), + lambda: math_ops.tensordot(a, b, axes=[[-1], [-2]]))) return _bin_op(f, a, b) @@ -204,8 +204,8 @@ def matmul(x1, x2): # pylint: disable=missing-docstring return np_utils.cond( math_ops.equal(array_ops.rank(x2), 1), lambda: math_ops.tensordot(x1, x2, axes=1), - lambda: np_utils.cond( - math_ops.equal(array_ops.rank(x1), 1), # pylint: disable=g-long-lambda + lambda: np_utils.cond( # pylint: disable=g-long-lambda + math_ops.equal(array_ops.rank(x1), 1), lambda: math_ops.tensordot( # pylint: disable=g-long-lambda x1, x2, axes=[[0], [-2]]), lambda: math_ops.matmul(x1, x2))) @@ -352,14 +352,30 @@ def hypot(x1, x2): def kron(a, b): # pylint: disable=missing-function-docstring # pylint: disable=protected-access,g-complex-comprehension a, b = np_array_ops._promote_dtype(a, b) - ndim = max(a.ndim, b.ndim) - if a.ndim < 
ndim: - a = np_array_ops.reshape(a, np_array_ops._pad_left_to(ndim, a.shape)) - if b.ndim < ndim: - b = np_array_ops.reshape(b, np_array_ops._pad_left_to(ndim, b.shape)) - a_reshaped = np_array_ops.reshape(a, [i for d in a.shape for i in (d, 1)]) - b_reshaped = np_array_ops.reshape(b, [i for d in b.shape for i in (1, d)]) - out_shape = tuple(np.multiply(a.shape, b.shape)) + t_a = np_utils.cond( + a.ndim < b.ndim, + lambda: np_array_ops.reshape( # pylint: disable=g-long-lambda + a.data, np_array_ops._pad_left_to(b.ndim, a.shape)), + lambda: a.data) + t_b = np_utils.cond( + b.ndim < a.ndim, + lambda: np_array_ops.reshape( # pylint: disable=g-long-lambda + b.data, np_array_ops._pad_left_to(a.ndim, b.shape)), + lambda: b.data) + + def _make_shape(shape, prepend): + ones = array_ops.ones_like(shape) + if prepend: + shapes = [ones, shape] + else: + shapes = [shape, ones] + return array_ops.reshape(array_ops.stack(shapes, axis=1), [-1]) + + a_shape = array_ops.shape(t_a) + b_shape = array_ops.shape(t_b) + a_reshaped = np_array_ops.reshape(t_a, _make_shape(a_shape, False)) + b_reshaped = np_array_ops.reshape(t_b, _make_shape(b_shape, True)) + out_shape = a_shape * b_shape return np_array_ops.reshape(a_reshaped * b_reshaped, out_shape) @@ -454,7 +470,8 @@ def _tf_gcd(x1, x2): # pylint: disable=missing-function-docstring if (not np.issubdtype(x1.dtype.as_numpy_dtype, np.integer) or not np.issubdtype(x2.dtype.as_numpy_dtype, np.integer)): raise ValueError('Arguments to gcd must be integers.') - shape = array_ops.broadcast_static_shape(x1.shape, x2.shape) + shape = array_ops.broadcast_dynamic_shape( + array_ops.shape(x1), array_ops.shape(x2)) x1 = array_ops.broadcast_to(x1, shape) x2 = array_ops.broadcast_to(x2, shape) value, _ = control_flow_ops.while_loop(_gcd_cond_fn, _gcd_body_fn, @@ -607,7 +624,7 @@ def signbit(x): def f(x): if x.dtype == dtypes.bool: - return array_ops.fill(x.shape, False) + return array_ops.fill(array_ops.shape(x), False) return x < 0 return _scalar(f, x) @@ -866,7 +883,11 @@ def square(x): def diff(a, n=1, axis=-1): # pylint: disable=missing-function-docstring def f(a): + # TODO(agarwal): transpose and reshape to N, H, 1 and do a 1D convolution + # TODO(agarwal): avoid depending on static rank. nd = a.shape.rank + if nd is None: + raise ValueError('diff currently requires known rank for input `a`') if (axis + nd if axis < 0 else axis) >= nd: raise ValueError('axis %s is out of bounds for array of dimension %s' % (axis, nd)) @@ -887,8 +908,10 @@ def diff(a, n=1, axis=-1): # pylint: disable=missing-function-docstring def _flip_args(f): + def _f(a, b): return f(b, a) + return _f @@ -910,6 +933,7 @@ setattr(np_arrays.ndarray, '__rtruediv__', _flip_args(true_divide)) def _comparison(tf_fun, x1, x2, cast_bool_to_int=False): + """Helper function for comparision.""" dtype = np_utils.result_type(x1, x2) # Cast x1 and x2 to the result_type if needed. 
x1 = np_array_ops.array(x1, dtype=dtype) @@ -953,12 +977,18 @@ def less_equal(x1, x2): @np_utils.np_doc(np.array_equal) -def array_equal(a1, a2): +def array_equal(a1, a2): # pylint: disable=missing-function-docstring - def f(a1, a2): - if a1.shape != a2.shape: - return constant_op.constant(False) - return math_ops.reduce_all(math_ops.equal(a1, a2)) + def f(x1, x2): + return np_utils.cond( + math_ops.equal(array_ops.rank(x1), array_ops.rank(x2)), + lambda: np_utils.cond( # pylint: disable=g-long-lambda + np_utils.reduce_all( + math_ops.equal(array_ops.shape(x1), array_ops.shape(x2)) + ), + lambda: math_ops.reduce_all(math_ops.equal(x1, x2)), + lambda: constant_op.constant(False)), + lambda: constant_op.constant(False)) return _comparison(f, a1, a2) @@ -1001,7 +1031,13 @@ setattr(np_arrays.ndarray, '__ne__', not_equal) @np_utils.np_doc(np.linspace) def linspace( # pylint: disable=missing-docstring - start, stop, num=50, endpoint=True, retstep=False, dtype=float, axis=0): + start, + stop, + num=50, + endpoint=True, + retstep=False, + dtype=float, + axis=0): if dtype: dtype = np_utils.result_type(dtype) start = np_array_ops.array(start, dtype=dtype).data @@ -1054,10 +1090,14 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0): # pylint start_sign = 1 - np_array_ops.sign(np_array_ops.real(start)) stop_sign = 1 - np_array_ops.sign(np_array_ops.real(stop)) signflip = 1 - start_sign * stop_sign // 2 - res = signflip * logspace(log10(signflip * start), - log10(signflip * stop), num, - endpoint=endpoint, base=10.0, - dtype=computation_dtype, axis=0) + res = signflip * logspace( + log10(signflip * start), + log10(signflip * stop), + num, + endpoint=endpoint, + base=10.0, + dtype=computation_dtype, + axis=0) if axis != 0: res = np_array_ops.moveaxis(res, 0, axis) return np_utils.tensor_to_ndarray(math_ops.cast(res, dtype)) diff --git a/tensorflow/python/ops/numpy_ops/np_utils.py b/tensorflow/python/ops/numpy_ops/np_utils.py index 47b45b171fb..186e56816fe 100644 --- a/tensorflow/python/ops/numpy_ops/np_utils.py +++ b/tensorflow/python/ops/numpy_ops/np_utils.py @@ -47,7 +47,7 @@ def _canonicalize_axes(axes, rank): canonicalizer = ( lambda axis: cond(axis < 0, lambda: axis + rank, lambda: axis)) else: - canonicalizer = lambda axis: axis+rank if axis < 0 else axis + canonicalizer = lambda axis: axis + rank if axis < 0 else axis return [canonicalizer(axis) for axis in axes] @@ -100,9 +100,16 @@ def finfo(dtype): def isscalar(val): """Returns whether `val` is a scalar value or scalar Tensor.""" - if isinstance(val, (np.ndarray, np_arrays.ndarray, ops.Tensor)): - return len(val.shape) == 0 # pylint: disable=g-explicit-length-test - return np.isscalar(val) + if isinstance(val, np_arrays.ndarray): + val = val.data + if isinstance(val, ops.Tensor): + ndims = val.shape.ndims + if ndims is not None: + return ndims == 0 + else: + return math_ops.equal(array_ops.rank(val), 0) + else: + return np.isscalar(val) # Can't use np_doc because np.result_type is a builtin function. @@ -119,8 +126,8 @@ def result_type(*arrays_and_dtypes): def maybe_get_dtype(x): # Don't put np.ndarray in this list, because np.result_type looks at the # value (not just dtype) of np.ndarray to decide the result type. 
- if isinstance(x, (np_arrays.ndarray, ops.Tensor, - indexed_slices.IndexedSlices)): + if isinstance( + x, (np_arrays.ndarray, ops.Tensor, indexed_slices.IndexedSlices)): return _to_numpy_type(x.dtype) elif isinstance(x, dtypes.DType): return _to_numpy_type(x) @@ -277,8 +284,11 @@ def np_doc(np_fun, np_fun_name=None): # for name in np_sig.parameters: # if name not in sig.parameters: # unsupported_params.append(name) - f.__doc__ = _np_doc_helper(f, np_fun, np_fun_name=np_fun_name, - unsupported_params=unsupported_params) + f.__doc__ = _np_doc_helper( + f, + np_fun, + np_fun_name=np_fun_name, + unsupported_params=unsupported_params) return f return decorator @@ -287,9 +297,9 @@ def np_doc(np_fun, np_fun_name=None): def _np_doc_helper(f, np_f, np_fun_name=None, unsupported_params=None): """Helper to get docs.""" if not unsupported_params and not _has_docstring(f) and _has_docstring(np_f): - # TODO(wangpeng): It looks like code snippets in numpy doc don't work - # correctly with doctest. Fix that and remove the reformatting of the np_f - # comment, here and below. + # TODO(wangpeng): It looks like code snippets in numpy doc don't work + # correctly with doctest. Fix that and remove the reformatting of the np_f + # comment, here and below. return np_f.__doc__.replace('>>>', '>') assert np_f or np_fun_name if not np_fun_name: diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py index 4e66a80bc01..d711516cb86 100644 --- a/tensorflow/python/ops/sort_ops.py +++ b/tensorflow/python/ops/sort_ops.py @@ -134,7 +134,7 @@ def _sort_or_argsort(values, axis, direction, return_argsort): # Axis must be an integer, not a Tensor. axis = framework_ops.convert_to_tensor(axis, name='axis') axis_static = tensor_util.constant_value(axis) - if axis.shape.ndims != 0 or axis_static is None: + if axis.shape.ndims not in (None, 0) or axis_static is None: raise ValueError('axis must be a constant scalar') axis_static = int(axis_static) # Avoids NumPy casting error @@ -184,18 +184,8 @@ def _descending_sort(values, axis, return_argsort=False): name='transposition') else: # Generate the transposition array from the tensors. - transposition = array_ops.concat( - [ - # Axes up to axis are unchanged. - math_ops.range(axis), - # Swap axis and rank - 1. - [rank - 1], - # Axes in [axis + 1, rank - 1) are unchanged. - math_ops.range(axis + 1, rank - 1), - # Swap axis and rank - 1. - [axis] - ], - axis=0) + transposition = array_ops.tensor_scatter_update( + math_ops.range(rank), [[axis], [rank-1]], [rank-1, axis]) top_k_input = array_ops.transpose(values, transposition) values, indices = nn_ops.top_k(top_k_input, k) From 3fa71a90282e984050d25f4ea61e4bbe8b75bd12 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 15 Jun 2020 23:20:01 -0700 Subject: [PATCH 0264/1390] Bump open source llvm revision to 1a7f115dce22b2c09fdd4f7f79d24da5de6eaef8 PiperOrigin-RevId: 316622447 Change-Id: I4f0e46be7b6a3e2624034cab02e06a1f8e86a1a0 --- tensorflow/workspace.bzl | 4 ++-- third_party/llvm/llvm.autogenerated.BUILD | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 60fef8c0cb9..31daf2249a0 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "ec02635d104c5f42840c63ed41d0cea774d649fe" - LLVM_SHA256 = "2e1155b29bd84b7382bcd7c00a88899f4156e9b09bc443debfdd7d3a5931e929" + LLVM_COMMIT = "1a7f115dce22b2c09fdd4f7f79d24da5de6eaef8" + LLVM_SHA256 = "3ace55744a86211c9c837915b88c18e1ed3e5cd839aaeade6aa88b02bc86e47e" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 88f007dff1d..d05fa841420 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -3602,6 +3602,7 @@ cc_library( ]), copts = llvm_copts, deps = [ + ":Core", ":MC", ":MCDisassembler", ":Object", From b03a4dbbd6a679d02437708e210f115525ed20f1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 00:03:48 -0700 Subject: [PATCH 0265/1390] tf.numpy: Add module comments. PiperOrigin-RevId: 316626561 Change-Id: I2a750832ccb5461f01c5bf0a948bc15190f4fd39 --- tensorflow/python/ops/numpy_ops/__init__.py | 108 ++++++++++++++++++-- 1 file changed, 102 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/numpy_ops/__init__.py b/tensorflow/python/ops/numpy_ops/__init__.py index d20b171205e..10ace06df9a 100644 --- a/tensorflow/python/ops/numpy_ops/__init__.py +++ b/tensorflow/python/ops/numpy_ops/__init__.py @@ -12,30 +12,126 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tensorflow numpy API.""" +"""numpy_ops. + +This module provides a subset of numpy API, built on top of TensorFlow +operations. APIs are based on numpy 1.16 version. + +The set of supported APIs may be expanded over time. Also future releases may +change the baseline version of numpy API being supported. A list of some +systematic differences with numpy are listed later in the "Differences with +Numpy" section. + +Types +----- + +The module provide an `ndarray` class which wraps an immutable `tf.Tensor`. +Additional functions are provided which accept array-like objects. Here +array-like objects includes `ndarrays` as defined by this module, as well as +`tf.Tensor`, in addition to types accepted by `numpy`. + +A subset of `numpy` dtypes are supported, along with `tf.bfloat16`. +Additionally, support is provided for selecting the default float type +(`np.float32` vs `np.float64`) given that some applications may prefer lower +precision. + +Device Support +------------- + +Given that `ndarray` and functions wrap TensorFlow constructs, the code will +have GPU and TPU support on par with TensorFlow. Also the code can be wrapped +with `tf.function` and XLA compiled. Device placement can be controlled by using +`with tf.device` scopes. + +Graph and Eager Modes +-------------------- + +Eager mode execution should typically match numpy semantics of executing +op-by-op. However the same code can be executed in graph mode, by putting it +inside a `tf.function`. This can change behavior of certain operations since +symbolic execution may not have information that is computed during runtime. + +Some differences are: + * Shapes can be incomplete or unknown. 
This means that `ndarray.shape`, + `ndarray.size` and `ndarray.ndim` can return `ndarray` objects instead of + returning integer (or tuple of integer) values. + * Python control flow based on `ndarray` values may not work and may have to + be rewritten to use `tf.cond` or `tf.while_loop`. Note that autograph + conversion as part of `tf.function` should still work. + * `__len__`, `__iter__` and `__index__` properties of `ndarray` may similarly + not work in graph mode. + +Mutation and Variables +--------------------- + +`ndarrays` currently wrap immutable `tf.Tensor`. Also currently mutation +operations like slice assigns are not supported. This may change in the future. + +There is currently no explict construct on par with tf.Variable. However one can +directly construct a `tf.Variable` and use that with the numpy APIs in this +module. See section on Interop. + +Interop +------ + +The numpy API calls can be interleaved with TensorFlow calls without incurring +Tensor data copies. This is true even if the `ndarray` or `tf.Tensor` is placed +on a non-CPU device. + +Additionally, one could put these calls in a `with tf.GradientTape()` context to +compute gradients through the numpy API calls. Similarly, code vectorization can +be done using `tf.vectorized_map()`. + +In general, the expected behavior should be on par with that of code involving +`tf.Tensor` and running stateless TensorFlow functions on them. + +Array Interface +-------------- + +The `ndarray` class implements the `__array__ interface. This should allow these +objects to be passed into contexts that expect a `numpy` or array-like object +(e.g. matplotlib). + + +Differences with Numpy +--------------------- + +Here is a non-exhaustive list of differences: + * Not all dtypes are currently supported. e.g. `np.float96`, `np.float128`. + `np.object`, `np.str`, `np.recarray` types are not supported. + * `ndarray` storage is in C order only. Fortran order, views, stride_tricks + are not supported. + * Only a subset of functions and modules are supported. This set would be + expanded over time. For supported functions, some arguments or argument + values may not be supported. This differences are listed in the function + comments. + * Buffer mutation is currently not supported. `ndarrays` wrap immutable + tensors. This means that output buffer arguments (e..g `out` in ufuncs) are + not supported + * full `ufunc` support is not provided. + * Numpy C API is not supported. Numpy's Cython and Swig integration are not + supported. 
+""" # pylint: disable=g-direct-tensorflow-import from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.ops.array_ops import newaxis from tensorflow.python.ops.numpy_ops import np_random as random - # pylint: disable=wildcard-import - from tensorflow.python.ops.numpy_ops.np_array_ops import * from tensorflow.python.ops.numpy_ops.np_arrays import ndarray from tensorflow.python.ops.numpy_ops.np_dtypes import * from tensorflow.python.ops.numpy_ops.np_math_ops import * +# pylint: enable=wildcard-import from tensorflow.python.ops.numpy_ops.np_utils import finfo from tensorflow.python.ops.numpy_ops.np_utils import promote_types from tensorflow.python.ops.numpy_ops.np_utils import result_type -# pylint: enable=wildcard-import # pylint: disable=redefined-builtin,undefined-variable max = amax min = amin round = around # pylint: enable=redefined-builtin,undefined-variable - -from tensorflow.python.ops.array_ops import newaxis From 5107743c47cff6980ebd68d61931bc8b5c3c6a87 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Tue, 16 Jun 2020 00:34:30 -0700 Subject: [PATCH 0266/1390] Add VariablePolicy field to the DistributedVariable class as part of an internal refactor that will allow us to attach different policies to a DistributedVariable. PiperOrigin-RevId: 316629999 Change-Id: I20160480b0678657198112adaa61ad7a47823cbd --- tensorflow/python/distribute/values.py | 603 +++++++++++++++----- tensorflow/python/distribute/values_util.py | 152 +++++ 2 files changed, 615 insertions(+), 140 deletions(-) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 5c038c01999..c6e0eb34a7b 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -41,6 +41,36 @@ from tensorflow.python.types import core from tensorflow.python.util.tf_export import tf_export +def _on_write_update_replica(var, update_fn, value, **kwargs): + """Updates variables with ON_WRITE synchronization in replica context.""" + if var.aggregation == vs.VariableAggregation.NONE: + return update_fn(var._get_on_device_or_primary(), value, **kwargs) # pylint: disable=protected-access + + def merge_fn(strategy, value, **kwargs): + """Aggregate values and update all variables in cross replica context.""" + # Don't allow MEAN with non float dtype, since it may cause unexpected + # precision loss. Python3 and NumPy automatically upcast integers to + # float in division, but we should always preserve the type. + # + # Note that to be backward compatible we allow the case when the value + # is *always* the same on each replica. I.E. value is not a + # PerReplica. Refer to regroup() to see how values are grouped. + if var.aggregation == vs.VariableAggregation.MEAN and ( + not var.dtype.is_floating) and isinstance(value, PerReplica): + raise ValueError( + "Cannot update non-float variables with " + "tf.VariableAggregation.MEAN aggregation in replica context. " + "Either change the variable dtype to float or update it in " + "cross-replica context.") + + assert strategy == var.distribute_strategy + v = values_util.apply_aggregation(strategy, value, var.aggregation, var) + return var._update_cross_replica(update_fn, v, **kwargs) # pylint: disable=protected-access + + return ds_context.get_replica_context().merge_call( + merge_fn, args=(value,), kwargs=kwargs) + + @tf_export("distribute.DistributedValues", v1=[]) class DistributedValues(object): """Base class for representing distributed values. 
@@ -409,10 +439,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, core.Tensor): """Holds a map from replica to variables.""" - # TODO(josh11b): Support changing the set of variables if e.g. if new - # devices are joining or a device is to leave. - - def __init__(self, strategy, values, aggregation): + def __init__(self, strategy, values, aggregation, var_policy=None): self._distribute_strategy = strategy self._aggregation = aggregation super(DistributedVariable, self).__init__(values) @@ -439,6 +466,9 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, # when restoring from a checkpoint, we may set the _initializer_op # property on the entire `DistributedVariable`. self._initializer_op = None + # Set a VariablePolicy which decides how we replicate/aggregate the given + # variable. + self._var_policy = var_policy def is_initialized(self, name=None): """Identifies if all the component variables are initialized. @@ -580,6 +610,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return array_ops.identity(self._get()) def value(self): + if self._var_policy: + return self._var_policy.value(self) return self._get_on_device_or_primary().value() def numpy(self): @@ -590,87 +622,104 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, "numpy() is only available when eager execution is enabled.") def assign_sub(self, value, use_locking=False, name=None, read_value=True): - assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw) - return self._update( - update_fn=assign_sub_fn, - value=value, - use_locking=use_locking, - name=name, - read_value=read_value) + if self._var_policy: + return self._var_policy.assign_sub(self, value, use_locking=use_locking, + name=name, read_value=read_value) + return values_util.on_write_assign_sub(self, value, use_locking=use_locking, + name=name, read_value=read_value) def assign_add(self, value, use_locking=False, name=None, read_value=True): - assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw) - return self._update( - update_fn=assign_add_fn, - value=value, - use_locking=use_locking, - name=name, - read_value=read_value) + if self._var_policy: + return self._var_policy.assign_add(self, value, use_locking=use_locking, + name=name, read_value=read_value) + return values_util.on_write_assign_add(self, value, use_locking=use_locking, + name=name, read_value=read_value) def assign(self, value, use_locking=False, name=None, read_value=True): - assign_fn = lambda var, *a, **kw: var.assign(*a, **kw) - return self._update( - update_fn=assign_fn, - value=value, - use_locking=use_locking, - name=name, - read_value=read_value) + if self._var_policy: + return self._var_policy.assign(self, value, use_locking=use_locking, + name=name, read_value=read_value) + return values_util.on_write_assign(self, value, use_locking=use_locking, + name=name, read_value=read_value) def scatter_sub(self, sparse_delta, use_locking=False, name=None): - scatter_sub_fn = lambda var, *a, **kw: var.scatter_sub(*a, **kw) - return self._update( - update_fn=scatter_sub_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_sub(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_sub(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_add(self, sparse_delta, use_locking=False, name=None): - scatter_add_fn = lambda var, *a, **kw: var.scatter_add(*a, **kw) - return self._update( - 
update_fn=scatter_add_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_add(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_add(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_mul(self, sparse_delta, use_locking=False, name=None): - scatter_mul_fn = lambda var, *a, **kw: var.scatter_mul(*a, **kw) - return self._update( - update_fn=scatter_mul_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_mul(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_mul(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_div(self, sparse_delta, use_locking=False, name=None): - scatter_div_fn = lambda var, *a, **kw: var.scatter_div(*a, **kw) - return self._update( - update_fn=scatter_div_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_div(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_div(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_min(self, sparse_delta, use_locking=False, name=None): - scatter_min_fn = lambda var, *a, **kw: var.scatter_min(*a, **kw) - return self._update( - update_fn=scatter_min_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_min(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_min(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_max(self, sparse_delta, use_locking=False, name=None): - scatter_max_fn = lambda var, *a, **kw: var.scatter_max(*a, **kw) - return self._update( - update_fn=scatter_max_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_max(self, sparse_delta, use_locking=use_locking, + name=name) + return values_util.scatter_max(self, sparse_delta, use_locking=use_locking, + name=name) def scatter_update(self, sparse_delta, use_locking=False, name=None): - scatter_update_fn = lambda var, *a, **kw: var.scatter_update(*a, **kw) - return self._update( - update_fn=scatter_update_fn, - value=sparse_delta, - use_locking=use_locking, - name=name) + if self._var_policy: + self._var_policy.scatter_update(self, sparse_delta, + use_locking=use_locking, name=name) + return values_util.scatter_update(self, sparse_delta, + use_locking=use_locking, + name=name) + + def _gather_saveables_for_checkpoint(self): + """Overrides Trackable method. + + This allows both name-based and object-based save and restore of + DistributedVariables. + + Returns: + A dictionary mapping attribute names to `SaveableObject` factories. 
+ """ + + def _saveable_factory(name=self._common_name): + return _DistributedVariableSaveable(self, self._primary, name) + + return {trackable.VARIABLE_VALUE_KEY: _saveable_factory} + + def _as_graph_element(self): + if self._var_policy: + return self._var_policy._as_graph_element(self) # pylint: disable=protected-access + + raise NotImplementedError("No policy set for calling _as_graph_element.") + + def _get_cross_replica(self): + if self._var_policy: + return self._var_policy._get_cross_replica(self) # pylint: disable=protected-access + + raise NotImplementedError( + "This method should be overridden by sub-classes which support cross-" + "replica accesses.") def _update_cross_replica(self, update_fn, value, **kwargs): """Applies updates across replicas. @@ -699,6 +748,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, Returns: Updated variable or `tf.Operation`. """ + if self._var_policy: + return self._var_policy._update_replica(self, update_fn, value, **kwargs) # pylint: disable=protected-access raise NotImplementedError("should be implemented by subclass.") def _update(self, update_fn, value, **kwargs): @@ -735,6 +786,31 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, """Pass resource_variable_ops.is_resource_variable check.""" pass + def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): + """Converts a variable to a tensor.""" + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + return ops.convert_to_tensor( + self._get(), dtype=dtype, name=name, as_ref=as_ref) + + +class _DistributedVariableSaveable(saveable_object.SaveableObject): + """Class for defining how to restore a DistributedVariable.""" + + def __init__(self, distributed_variable, primary_variable, name): + self._distributed_variable = distributed_variable + if not self._distributed_variable._var_policy: + raise ValueError("VariablePolicy has not been set for the distributed " + "variable.") + tensor, spec = distributed_variable._var_policy.get_saveable( + distributed_variable, primary_variable, name) + super(_DistributedVariableSaveable, self).__init__(tensor, spec, name) + + def restore(self, restored_tensors, restored_shapes): + """Restore the same value into all variables.""" + tensor, = restored_tensors + return self._distributed_variable._var_policy.get_restore_ops( # pylint: disable=protected-access + self._distributed_variable, tensor) + class _MirroredSaveable(saveable_object_util.ResourceVariableSaveable): """Class for defining how to restore a MirroredVariable.""" @@ -756,61 +832,27 @@ class MirroredVariable(DistributedVariable, Mirrored): """Holds a map from replica to variables whose values are kept in sync.""" def _update_replica(self, update_fn, value, **kwargs): - if self.aggregation == vs.VariableAggregation.NONE: - return update_fn(self._get_on_device_or_primary(), value, **kwargs) - - def merge_fn(strategy, value, **kwargs): - """Aggregate values and update all variables in cross replica context.""" - # Don't allow MEAN with non float dtype, since it may cause unexpected - # precision loss. Python3 and NumPy automatically upcast integers to - # float in division, but we should always preserve the type. - # - # Note that to be backward compatible we allow the case when the value - # is *always* the same on each replica. I.E. value is not a - # PerReplica. Refer to regroup() to see how values are grouped. 
- if self._aggregation == vs.VariableAggregation.MEAN and ( - not self.dtype.is_floating) and isinstance(value, PerReplica): - raise ValueError( - "Cannot update non-float variables with " - "tf.VariableAggregation.MEAN aggregation in replica context. " - "Either change the variable dtype to float or update it in " - "cross-replica context.") - - assert strategy == self.distribute_strategy - v = values_util.apply_aggregation(strategy, value, self.aggregation, self) - return self._update_cross_replica(update_fn, v, **kwargs) - - return ds_context.get_replica_context().merge_call( - merge_fn, args=(value,), kwargs=kwargs) + return _on_write_update_replica(self, update_fn, value, **kwargs) def scatter_min(self, *args, **kwargs): if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and self._aggregation != vs.VariableAggregation.NONE): - raise NotImplementedError("scatter_min is only supported for mirrored " - "variable (variable created within certain " - "`tf.distribute.Strategy` scope) with NONE or " - "`ONLY_FIRST_REPLICA` aggregation, got: %s" % - self._aggregation) + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_min", aggregation=self._aggregation)) return super(MirroredVariable, self).scatter_min(*args, **kwargs) def scatter_max(self, *args, **kwargs): if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and self._aggregation != vs.VariableAggregation.NONE): - raise NotImplementedError("scatter_max is only supported for mirrored " - "variable (variable created within certain " - "`tf.distribute.Strategy` scope) with NONE or " - "`ONLY_FIRST_REPLICA` aggregation, got: %s" % - self._aggregation) + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_min", aggregation=self._aggregation)) return super(MirroredVariable, self).scatter_max(*args, **kwargs) def scatter_update(self, *args, **kwargs): if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and self._aggregation != vs.VariableAggregation.NONE): - raise NotImplementedError("scatter_update is only supported for mirrored " - "variable (variable created within certain " - "`tf.distribute.Strategy` scope) with NONE or " - "`ONLY_FIRST_REPLICA` aggregation, got: %s" % - self._aggregation) + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_min", aggregation=self._aggregation)) return super(MirroredVariable, self).scatter_update(*args, **kwargs) def _get_cross_replica(self): @@ -893,28 +935,13 @@ class SyncOnReadVariable(DistributedVariable): def _update_replica(self, update_fn, value, **kwargs): return update_fn(self._get_on_device_or_primary(), value, **kwargs) - def _assign_on_each_device(self, assign_func, value, read_value): - update = control_flow_ops.group( - tuple( - assign_func(v.device, v, value) - for v in self._values)) - if not read_value: - return update - with ops.control_dependencies([update] if update else []): - return self.read_value() - # TODO(b/154017756): Make assign behaivor in cross replica context consistent # with MirroredVariable. 
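The cross-replica assign helpers that the `assign*` methods below now delegate to preserve the aggregated reading of the variable. For a SUM-aggregated sync-on-read variable the assigned value is split evenly across the component variables, so summing them on read returns exactly what was assigned; a rough sketch with hypothetical numbers:

```python
# Hypothetical: a SUM-aggregated SyncOnReadVariable backed by 4 devices.
num_devices = 4
assigned_total = 20.0                               # value given to assign() in cross-replica context
per_device_value = assigned_total / num_devices     # 5.0 written to each component variable
read_back = per_device_value * num_devices          # the on-read SUM
assert read_back == assigned_total
```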
def assign_sub(self, value, use_locking=False, name=None, read_value=True): with ds_context.enter_or_assert_strategy(self._distribute_strategy): if ds_context.in_cross_replica_context(): - if self._aggregation == vs.VariableAggregation.SUM: - raise ValueError( - "SyncOnReadVariable does not support `assign_sub` in " - "cross-replica context when aggregation is set to " - "`tf.VariableAggregation.SUM`.") - return self._assign_on_each_device(values_util.assign_sub_on_device, - value, read_value) + return values_util.on_read_assign_sub_cross_replica( + self, value, read_value=read_value) else: return super(SyncOnReadVariable, self).assign_sub(value, use_locking, name, read_value) @@ -922,13 +949,8 @@ class SyncOnReadVariable(DistributedVariable): def assign_add(self, value, use_locking=False, name=None, read_value=True): with ds_context.enter_or_assert_strategy(self._distribute_strategy): if ds_context.in_cross_replica_context(): - if self._aggregation == vs.VariableAggregation.SUM: - raise ValueError( - "SyncOnReadVariable does not support `assign_add` in " - "cross-replica context when aggregation is set to " - "`tf.VariableAggregation.SUM`.") - return self._assign_on_each_device(values_util.assign_add_on_device, - value, read_value) + return values_util.on_read_assign_add_cross_replica( + self, value, read_value=read_value) else: return super(SyncOnReadVariable, self).assign_add(value, use_locking, name, read_value) @@ -936,13 +958,8 @@ class SyncOnReadVariable(DistributedVariable): def assign(self, value, use_locking=False, name=None, read_value=True): with ds_context.enter_or_assert_strategy(self._distribute_strategy): if ds_context.in_cross_replica_context(): - # To preserve the sum across save and restore, we have to divide the - # total across all devices when restoring a variable that was summed - # when saving. - if self._aggregation == vs.VariableAggregation.SUM: - value = math_ops.cast(value / len(self._values), self.dtype) - return self._assign_on_each_device(values_util.assign_on_device, value, - read_value) + return values_util.on_read_assign_cross_replica( + self, value, read_value=read_value) else: return super(SyncOnReadVariable, self).assign(value, use_locking, name, read_value) @@ -987,7 +1004,7 @@ class SyncOnReadVariable(DistributedVariable): with ds_context.enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( - reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), + reduce_util.ReduceOp.from_variable_aggregation(self._aggregation), self, axis=None) @@ -1022,6 +1039,16 @@ class SyncOnReadVariable(DistributedVariable): # Register a conversion functions which reads the value of the variable, # allowing instances of the class to be used as tensors. 
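The registration below uses TensorFlow's generic tensor-conversion hook. As a minimal, self-contained sketch (with a toy `Wrapped` class standing in for `DistributedVariable`, so this is an illustration rather than the real wiring), `tf.register_tensor_conversion_function` is what lets a custom container be passed to ordinary ops:

```python
import tensorflow as tf

class Wrapped(object):
  """Toy container holding a tensor (illustration only)."""

  def __init__(self, value):
    self._value = value

  def read(self):
    return self._value

def _convert_wrapped(value, dtype=None, name=None, as_ref=False):
  del as_ref  # read-only conversion, mirroring the dense read path
  return tf.convert_to_tensor(value.read(), dtype=dtype, name=name)

tf.register_tensor_conversion_function(Wrapped, _convert_wrapped)

w = Wrapped(tf.constant([1.0, 2.0]))
print(tf.reduce_sum(w))  # Wrapped instances now flow into ordinary ops like tensors.
```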
+# DistributedVariable +def _tensor_conversion_distributed_var(var, dtype=None, name=None, + as_ref=False): + return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref) # pylint: disable=protected-access + + +ops.register_tensor_conversion_function(DistributedVariable, + _tensor_conversion_distributed_var) + + # MirroredVariables def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False): return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref) # pylint: disable=protected-access @@ -1048,3 +1075,299 @@ def _tensor_conversion_sync_on_read(var, dtype=None, name=None, as_ref=False): ops.register_tensor_conversion_function(SyncOnReadVariable, _tensor_conversion_sync_on_read) + + +class VariablePolicy(object): + """Policy defining synchronization and aggregation of a distributed variable. + + Given `synchronization` and `aggregation` parameters set on a `tf.Variable` + during variable creation within `tf.distribute` scope, `tf.distribute` creates + an appropriate policy object and assigns it to the distributed variable. All + variable operations are delegated to the respective policy object. + """ + + def __init__(self, aggregation): + self._aggregation = aggregation + + def value(self): + raise NotImplementedError( + "This method should be overridden by sub-classes.") + + def _is_mirrored(self): + raise NotImplementedError( + "This method should be overridden by sub-classes.") + + def _as_graph_element(self, _): + raise NotImplementedError( + "This method should be overridden by sub-classes.") + + def _get_cross_replica(self, var): + raise NotImplementedError( + "This method should be overridden by sub-classes.") + + def _update_replica(self, var, update_fn, value, **kwargs): + raise NotImplementedError( + "This method should be overridden by sub-classes.") + + +class OnReadPolicy(VariablePolicy): + """Policy defined for `tf.VariableSynchronization.ON_READ` synchronization. + + This policy is created when `synchronization` is set to + `tf.VariableSynchronization.ON_READ` and `aggregation` is set to any of the + values allowed by the `tf.VariableAggregation` enum such as `NONE`, `SUM`, + `MEAN` or `ONLY_FIRST_REPLICA`when creating a `tf.Variable` in `tf.distribute` + scope. 
+ """ + + def _is_mirrored(self): + return False + + def value(self, var): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + return var._get_cross_replica() # pylint: disable=protected-access + else: + return var._get_on_device_or_primary().value() # pylint: disable=protected-access + + def _as_graph_element(self, var): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + return ops.convert_to_tensor(var._get_cross_replica()) # pylint: disable=protected-access + return var._get()._as_graph_element() # pylint: disable=protected-access + + def _get_cross_replica(self, var): + if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: + return var._primary # pylint: disable=protected-access + + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + return var.distribute_strategy.reduce( + reduce_util.ReduceOp.from_variable_aggregation(self._aggregation), + var, + axis=None) + + def _update_replica(self, var, update_fn, value, **kwargs): + return update_fn(var._get_on_device_or_primary(), value, **kwargs) # pylint: disable=protected-access + + def _scatter_not_implemented(self, method): + raise NotImplementedError( + "ON_READ variables doesn't support `%s` in cross replica context" % + method) + + def assign_sub(self, var, value, use_locking=False, name=None, + read_value=True): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + return values_util.on_read_assign_sub_cross_replica( + var, value, read_value=read_value) + else: + return values_util.on_write_assign_sub( + var, value, use_locking=use_locking, name=name, + read_value=read_value) + + def assign_add(self, var, value, use_locking=False, name=None, + read_value=True): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + return values_util.on_read_assign_add_cross_replica( + var, value, read_value=read_value) + else: + return values_util.on_write_assign_add( + var, value, use_locking=use_locking, name=name, + read_value=read_value) + + def assign(self, var, value, use_locking=False, name=None, read_value=True): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + return values_util.on_read_assign_cross_replica(var, value, + read_value=read_value) + else: + return values_util.on_write_assign(var, value, + use_locking=use_locking, + name=name, + read_value=read_value) + + def scatter_sub(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_sub") + + def scatter_add(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_add") + + def scatter_mul(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_mul") + + def scatter_div(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_div") + + def scatter_min(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_min") + + def scatter_max(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_max") + + def scatter_update(self, *args, **kwargs): + del args, kwargs + self._scatter_not_implemented("scatter_update") + + def get_saveable(self, var, primary_var, name): + """Create a saveable object for the given variable.""" + # We use a callable so that we don't have to evaluate this expression + 
# in the case where we are trying to restore instead of save. + def tensor(): + strategy = var.distribute_strategy + return strategy.extended.read_var(var) + + spec = saveable_object.SaveSpec( + tensor=tensor, + slice_spec="", + name=name, + dtype=var.dtype, + device=primary_var.device) + + return tensor, [spec] + + def get_restore_ops(self, var, tensor): + """Restore the same value into all variables.""" + # To preserve the sum across save and restore, we have to divide the + # total across all devices when restoring a variable that was summed + # when saving. + if self._aggregation == vs.VariableAggregation.SUM: + tensor = math_ops.cast(tensor / len(var._devices), # pylint: disable=protected-access + var.dtype) + return control_flow_ops.group( + tuple( + values_util.assign_on_device(v.device, v, tensor) + for v in var.values)) + + +class AutoPolicy(VariablePolicy): + """Policy defined for `tf.VariableSynchronization.AUTO` synchronization. + + This policy is created when `synchronization` is set to + `tf.VariableSynchronization.AUTO` and `aggregation` is set to + `tf.VariableAggregation.NONE` when creating a `tf.Variable` in `tf.distribute` + scope. + """ + + def _is_mirrored(self): + return True + + def value(self, var): + return var._get_on_device_or_primary().value() # pylint: disable=protected-access + + def _as_graph_element(self, var): + return var._get_on_device_or_primary()._as_graph_element() # pylint: disable=protected-access + + def _get_cross_replica(self, var): + # Return identity, to avoid directly exposing the variable to the user and + # allowing it to be modified by mistake. + return array_ops.identity(Mirrored._get_cross_replica(var)) # pylint: disable=protected-access + + def _update_replica(self, var, update_fn, value, **kwargs): + return update_fn(var._get_on_device_or_primary(), value, **kwargs) # pylint: disable=protected-access + + def assign(self, var, value, use_locking=False, name=None, read_value=True): + return values_util.on_write_assign(var, value, use_locking=use_locking, + name=name, read_value=read_value) + + def assign_add(self, var, value, use_locking=False, name=None, + read_value=True): + return values_util.on_write_assign_add(var, value, use_locking=use_locking, + name=name, read_value=read_value) + + def assign_sub(self, var, value, use_locking=False, name=None, + read_value=True): + return values_util.on_write_assign_sub(var, value, use_locking=use_locking, + name=name, read_value=read_value) + + def scatter_sub(self, var, sparse_delta, use_locking=False, name=None): + return values_util.scatter_sub(var, sparse_delta, use_locking=use_locking, + name=name) + + def scatter_add(self, var, sparse_delta, use_locking=False, name=None): + return values_util.scatter_add(var, sparse_delta, use_locking=use_locking, + name=name) + + def scatter_mul(self, var, sparse_delta, use_locking=False, name=None): + return values_util.scatter_mul(var, sparse_delta, use_locking=use_locking, + name=name) + + def scatter_div(self, var, sparse_delta, use_locking=False, name=None): + return values_util.scatter_div(var, sparse_delta, use_locking=use_locking, + name=name) + + def scatter_min(self, var, sparse_delta, use_locking=False, name=None): + if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and + self._aggregation != vs.VariableAggregation.NONE): + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_min", aggregation=self._aggregation)) + return values_util.scatter_min(var, sparse_delta, use_locking=use_locking, + 
name=name) + + def scatter_max(self, var, sparse_delta, use_locking=False, name=None): + if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and + self._aggregation != vs.VariableAggregation.NONE): + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_max", aggregation=self._aggregation)) + return values_util.scatter_max(var, sparse_delta, use_locking=use_locking, + name=name) + + def scatter_update(self, var, sparse_delta, use_locking=False, name=None): + if (self._aggregation != vs.VariableAggregation.ONLY_FIRST_REPLICA and + self._aggregation != vs.VariableAggregation.NONE): + raise NotImplementedError(values_util.scatter_error_msg.format( + op_name="scatter_update", aggregation=self._aggregation)) + return values_util.scatter_update(var, sparse_delta, + use_locking=use_locking, + name=name) + + def get_saveable(self, var, primary_var, name): + del var, name + return primary_var, "" + + def get_restore_ops(self, var, tensor): + return control_flow_ops.group( + tuple( + values_util.assign_on_device(v.device, v, tensor) + for v in var.values)) + + +class OnWritePolicy(AutoPolicy): + """Policy defined for `tf.VariableSynchronization.ON_WRITE` synchronization. + + This policy is created when the following `synchronization` and + `aggregation` parameters are specified when creating a `tf.Variable` in + `tf.distribute` scope: + * `synchronization` is equal to `tf.VariableSynchronization.AUTO` and + aggregation can be any of the following `tf.VariableAggregation` enum + values such as `SUM`, `MEAN` or `ONLY_FIRST_REPLICA`. + * `synchronization` is equal to `tf.VariableSynchronization.ON_WRITE` and + aggregation can be any of the following `tf.VariableAggregation` enum + values such as `NONE`, `SUM`, `MEAN` or `ONLY_FIRST_REPLICA`. + """ + + def _update_replica(self, var, update_fn, value, **kwargs): + return _on_write_update_replica(var, update_fn, value, **kwargs) + + +# Utility functions +# Return True if the Value is Mirrored or the Variable is replicated and kept in +# sync. 
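Taken together, the policy docstrings above partition the (synchronization, aggregation) space. A hedged sketch of that mapping, using string names instead of the real classes since the actual selection happens inside the `tf.distribute` strategies rather than in this file:

```python
import tensorflow as tf

def select_policy_name(synchronization, aggregation):
  """Hypothetical helper mirroring the policy docstrings above."""
  if synchronization == tf.VariableSynchronization.ON_READ:
    return "OnReadPolicy"    # ON_READ with any aggregation
  if (synchronization == tf.VariableSynchronization.AUTO and
      aggregation == tf.VariableAggregation.NONE):
    return "AutoPolicy"      # AUTO with no aggregation
  return "OnWritePolicy"     # ON_WRITE (any aggregation), or AUTO with one

assert select_policy_name(tf.VariableSynchronization.ON_READ,
                          tf.VariableAggregation.SUM) == "OnReadPolicy"
```

The `_is_mirrored` and `_is_sync_on_read` helpers that follow lean on the same split: the on-read policy reports `_is_mirrored()` as `False`, while the write-style policies report `True`.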
+def _is_mirrored(val): + if isinstance(val, DistributedVariable): + if val._var_policy: # pylint: disable=protected-access + return val._var_policy._is_mirrored() # pylint: disable=protected-access + return isinstance(val, Mirrored) + + +def _is_sync_on_read(val): + if isinstance(val, DistributedVariable): + if val._var_policy: # pylint: disable=protected-access + return not val._var_policy._is_mirrored() # pylint: disable=protected-access + return not isinstance(val, Mirrored) diff --git a/tensorflow/python/distribute/values_util.py b/tensorflow/python/distribute/values_util.py index c42ac9e4de1..ddb0d2d0401 100644 --- a/tensorflow/python/distribute/values_util.py +++ b/tensorflow/python/distribute/values_util.py @@ -23,9 +23,155 @@ from tensorflow.python.distribute import distribution_strategy_context as ds_con from tensorflow.python.distribute import reduce_util from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs +def on_write_assign(var, value, use_locking=False, name=None, read_value=True): + assign_fn = lambda var, *a, **kw: var.assign(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=assign_fn, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + + +def on_write_assign_add(var, value, use_locking=False, name=None, + read_value=True): + assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=assign_add_fn, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + + +def on_write_assign_sub(var, value, use_locking=False, name=None, + read_value=True): + assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=assign_sub_fn, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + + +def assign_on_each_device(var, assign_func, value, read_value): + update = control_flow_ops.group( + tuple(assign_func(v.device, v, value) for v in var._values)) # pylint: disable=protected-access + if not read_value: + return update + with ops.control_dependencies([update] if update else []): + return var.read_value() + + +def on_read_assign_sub_cross_replica(var, value, read_value=True): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + if var.aggregation == vs.VariableAggregation.SUM: + raise ValueError( + "SyncOnReadVariable does not support `assign_sub` in " + "cross-replica context when aggregation is set to " + "`tf.VariableAggregation.SUM`.") + return assign_on_each_device(var, assign_sub_on_device, + value, read_value) + + +def on_read_assign_add_cross_replica(var, value, read_value=True): + with ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + if var.aggregation == vs.VariableAggregation.SUM: + raise ValueError( + "SyncOnReadVariable does not support `assign_add` in " + "cross-replica context when aggregation is set to " + "`tf.VariableAggregation.SUM`.") + return assign_on_each_device(var, assign_add_on_device, + value, read_value) + + +def on_read_assign_cross_replica(var, value, read_value=True): + """Return the value of the variable in cross replica context.""" + with 
ds_context.enter_or_assert_strategy(var.distribute_strategy): + if ds_context.in_cross_replica_context(): + # To preserve the sum across save and restore, we have to divide the + # total across all devices when restoring a variable that was summed + # when saving. + tensor = value + # TODO(anjs): Should this be over all the replicas in sync since we + # call `reduce` on the variable during read? + if var.aggregation == vs.VariableAggregation.SUM: + tensor = math_ops.cast(tensor / len(var._values), var.dtype) # pylint: disable=protected-access + return assign_on_each_device(var, assign_on_device, tensor, + read_value) + + +def scatter_sub(var, sparse_delta, use_locking=False, name=None): + scatter_sub_fn = lambda var, *a, **kw: var.scatter_sub(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_sub_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_add(var, sparse_delta, use_locking=False, name=None): + scatter_add_fn = lambda var, *a, **kw: var.scatter_add(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_add_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_mul(var, sparse_delta, use_locking=False, name=None): + scatter_mul_fn = lambda var, *a, **kw: var.scatter_mul(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_mul_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_div(var, sparse_delta, use_locking=False, name=None): + scatter_div_fn = lambda var, *a, **kw: var.scatter_div(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_div_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_min(var, sparse_delta, use_locking=False, name=None): + scatter_min_fn = lambda var, *a, **kw: var.scatter_min(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_min_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_max(var, sparse_delta, use_locking=False, name=None): + scatter_max_fn = lambda var, *a, **kw: var.scatter_max(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_max_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + +def scatter_update(var, sparse_delta, use_locking=False, name=None): + scatter_update_fn = lambda var, *a, **kw: var.scatter_update(*a, **kw) + return var._update( # pylint: disable=protected-access + update_fn=scatter_update_fn, + value=sparse_delta, + use_locking=use_locking, + name=name) + + def get_current_replica_id_as_int(): """Returns the current replica ID as an integer, or `None`.""" replica_context = ds_context.get_replica_context() @@ -89,3 +235,9 @@ aggregation_error_msg = ( "`tf.distribute.get_replica_context().merge_call(merge_fn, ..)`." "Inside `merge_fn`, you can then update the {variable_type} " "using `tf.distribute.StrategyExtended.update()`.") + + +scatter_error_msg = ("{op_name} is only supported for mirrored " + "variable (variable created within certain " + "`tf.distribute.Strategy` scope) with NONE or " + "`ONLY_FIRST_REPLICA` aggregation, got: {aggregation}.") From e887f933e4fd4965742944668f94255e9a603533 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Tue, 16 Jun 2020 00:39:41 -0700 Subject: [PATCH 0267/1390] Add the missing versions to runtime version This cl also add a new unit test to ensure new version got reflected in runtime version. 
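The `data: [...]` arrays touched in the FileCheck expectations below hold the `min_runtime_version` metadata buffer of the flatbuffer: a NUL-padded ASCII version string. A quick decoding sketch (assuming the 16-byte layout shown in those tests) makes the change from the old to the new expectation readable:

```python
def decode_min_runtime_version(buffer_bytes):
  """Strip NUL padding and decode the ASCII version string."""
  return bytes(buffer_bytes).rstrip(b"\x00").decode("ascii")

old = [49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
new = [49, 46, 49, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
print(decode_min_runtime_version(old))  # 1.14.0
print(decode_min_runtime_version(new))  # 1.15.0
```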
PiperOrigin-RevId: 316630739 Change-Id: I139ecf3077b2eec9bcc13ea5bb9030199d29203a --- .../lite/tests/mlir2flatbuffer/if_op.mlir | 2 +- .../tests/mlir2flatbuffer/quantization.mlir | 2 +- .../tests/mlir2flatbuffer/tfl_while_op.mlir | 2 +- .../lite/tests/mlir2flatbuffer/while_op.mlir | 2 +- tensorflow/lite/tools/versioning/BUILD | 2 + .../lite/tools/versioning/runtime_version.cc | 98 ++++++++++++++++--- .../lite/tools/versioning/runtime_version.h | 6 ++ .../tools/versioning/runtime_version_test.cc | 23 ++++- 8 files changed, 118 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index b89ba0fe400..7290209cc4a 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -157,7 +157,7 @@ // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 49, 46, 49, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: } ], // CHECK-NEXT: metadata: [ { // CHECK-NEXT: name: "min_runtime_version", diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir index 9bbbdca1c97..dbe10a3f90c 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir @@ -154,7 +154,7 @@ func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: } ], // CHECK-NEXT: metadata: [ { // CHECK-NEXT: name: "min_runtime_version", diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir index 1d3a70f0996..996543cc9c7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir @@ -190,7 +190,7 @@ // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 49, 46, 49, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: } ], // CHECK-NEXT: metadata: [ { // CHECK-NEXT: name: "min_runtime_version", diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir index a76fbbeb871..d69e8f40311 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir @@ -190,7 +190,7 @@ // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 49, 46, 49, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: } ], // CHECK-NEXT: metadata: [ { // CHECK-NEXT: name: "min_runtime_version", diff --git a/tensorflow/lite/tools/versioning/BUILD b/tensorflow/lite/tools/versioning/BUILD index 1ba221d3fa9..34d63bd9645 100644 --- a/tensorflow/lite/tools/versioning/BUILD +++ b/tensorflow/lite/tools/versioning/BUILD @@ -38,6 +38,8 @@ tf_cc_test( ], deps = [ ":versioning", + 
"//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/schema:schema_fbs_with_mutable", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc index 702f9fc7f85..92a7001606f 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/lite/tools/versioning/runtime_version.cc @@ -22,6 +22,12 @@ limitations under the License. #include "tensorflow/lite/schema/mutable/schema_generated.h" namespace tflite { +namespace { +// Use this as the placeholder string if a particular op is not yet included +// in any Tensorflow's RC/Final release source package. Once that op is +// included in the release, please update this with the real version string. +static constexpr char kPendingReleaseVersion[] = "UNKNOWN"; +} // namespace bool CompareRuntimeVersion(const std::string& v1, const std::string& v2) { const std::vector vec1 = absl::StrSplit(v1, '.'); @@ -40,11 +46,8 @@ bool CompareRuntimeVersion(const std::string& v1, const std::string& v2) { return i < vec2.size(); } -void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { - // Use this as the placeholder string if a particular op is not yet included - // in any Tensorflow's RC/Final release source package. Once that op is - // included in the release, please update this with the real version string. - static constexpr char kPendingReleaseOpVersion[] = "UNKNOWN"; +std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, + int op_version) { // A map from the version key of an op to its minimum runtime version. // For example, {{kAveragePool, 1}, "1.5.0"}, means the 1st version of // AveragePool requires a minimum TF Lite runtime version '1.5.0`. 
@@ -53,27 +56,41 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { new std::map, std::string>({ {{BuiltinOperator_AVERAGE_POOL_2D, 1}, "1.5.0"}, {{BuiltinOperator_AVERAGE_POOL_2D, 2}, "1.14.0"}, + {{BuiltinOperator_AVERAGE_POOL_2D, 3}, kPendingReleaseVersion}, + {{BuiltinOperator_BATCH_MATMUL, 1}, kPendingReleaseVersion}, {{BuiltinOperator_CONV_2D, 1}, "1.5.0"}, {{BuiltinOperator_CONV_2D, 2}, "1.14.0"}, {{BuiltinOperator_CONV_2D, 3}, "1.14.0"}, + {{BuiltinOperator_CONV_2D, 4}, kPendingReleaseVersion}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 1}, "1.5.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 2}, "1.12.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 3}, "1.14.0"}, + {{BuiltinOperator_DEPTHWISE_CONV_2D, 4}, "2.2.0"}, + {{BuiltinOperator_DEPTHWISE_CONV_2D, 5}, kPendingReleaseVersion}, {{BuiltinOperator_ADD, 1}, "1.5.0"}, {{BuiltinOperator_ADD, 2}, "1.14.0"}, {{BuiltinOperator_ADD_N, 1}, "1.14.0"}, {{BuiltinOperator_SPACE_TO_BATCH_ND, 1}, "1.6.0"}, {{BuiltinOperator_SPACE_TO_BATCH_ND, 2}, "1.14.0"}, + {{BuiltinOperator_SPACE_TO_BATCH_ND, 3}, kPendingReleaseVersion}, {{BuiltinOperator_SUB, 1}, "1.6.0"}, {{BuiltinOperator_SUB, 2}, "1.14.0"}, + {{BuiltinOperator_SUB, 3}, kPendingReleaseVersion}, + {{BuiltinOperator_DENSIFY, 1}, "2.2.0"}, {{BuiltinOperator_DIV, 1}, "1.6.0"}, - {{BuiltinOperator_DIV, 2}, kPendingReleaseOpVersion}, + {{BuiltinOperator_DIV, 2}, kPendingReleaseVersion}, {{BuiltinOperator_BATCH_TO_SPACE_ND, 1}, "1.6.0"}, {{BuiltinOperator_BATCH_TO_SPACE_ND, 2}, "1.14.0"}, + {{BuiltinOperator_BATCH_TO_SPACE_ND, 3}, kPendingReleaseVersion}, {{BuiltinOperator_CAST, 1}, "1.5.0"}, {{BuiltinOperator_CONCATENATION, 1}, "1.5.0"}, {{BuiltinOperator_CONCATENATION, 2}, "1.14.0"}, + {{BuiltinOperator_CONCATENATION, 3}, kPendingReleaseVersion}, {{BuiltinOperator_DEPTH_TO_SPACE, 1}, "2.1.0"}, + {{BuiltinOperator_EMBEDDING_LOOKUP, 1}, "1.13.0"}, + {{BuiltinOperator_EMBEDDING_LOOKUP, 2}, "1.14.0"}, + {{BuiltinOperator_EMBEDDING_LOOKUP, 3}, "1.14.0"}, + {{BuiltinOperator_EMBEDDING_LOOKUP_SPARSE, 1}, "1.5.0"}, {{BuiltinOperator_FAKE_QUANT, 1}, "1.5.0"}, {{BuiltinOperator_FAKE_QUANT, 2}, "1.10.0"}, {{BuiltinOperator_FULLY_CONNECTED, 1}, "1.5.0"}, @@ -82,10 +99,14 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_FULLY_CONNECTED, 4}, "1.14.0"}, {{BuiltinOperator_FULLY_CONNECTED, 5}, "2.0.0"}, {{BuiltinOperator_FULLY_CONNECTED, 6}, "2.1.0"}, + {{BuiltinOperator_FULLY_CONNECTED, 7}, kPendingReleaseVersion}, + {{BuiltinOperator_FULLY_CONNECTED, 8}, kPendingReleaseVersion}, {{BuiltinOperator_GATHER, 1}, "1.6.0"}, {{BuiltinOperator_GATHER, 2}, "1.14.0"}, {{BuiltinOperator_GATHER, 3}, "1.15.0"}, {{BuiltinOperator_GATHER_ND, 1}, "1.14.0"}, + {{BuiltinOperator_GATHER_ND, 2}, kPendingReleaseVersion}, + {{BuiltinOperator_HASHTABLE_LOOKUP, 1}, "1.5.0"}, {{BuiltinOperator_SVDF, 1}, "1.5.0"}, {{BuiltinOperator_SVDF, 2}, "1.14.0"}, {{BuiltinOperator_SVDF, 3}, "2.2.0"}, @@ -95,13 +116,21 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, 1}, "1.5.0"}, {{BuiltinOperator_MAX_POOL_2D, 1}, "1.5.0"}, {{BuiltinOperator_MAX_POOL_2D, 2}, "1.14.0"}, + {{BuiltinOperator_MAX_POOL_2D, 3}, kPendingReleaseVersion}, {{BuiltinOperator_MAXIMUM, 1}, "1.14.0"}, {{BuiltinOperator_MAXIMUM, 2}, "1.14.0"}, + {{BuiltinOperator_MAXIMUM, 3}, kPendingReleaseVersion}, + {{BuiltinOperator_MAXIMUM, 4}, kPendingReleaseVersion}, {{BuiltinOperator_MINIMUM, 1}, "1.14.0"}, {{BuiltinOperator_MINIMUM, 2}, "1.14.0"}, + 
{{BuiltinOperator_MINIMUM, 3}, kPendingReleaseVersion}, + {{BuiltinOperator_MINIMUM, 4}, kPendingReleaseVersion}, {{BuiltinOperator_MUL, 1}, "1.5.0"}, {{BuiltinOperator_MUL, 2}, "1.14.0"}, {{BuiltinOperator_MUL, 3}, "1.15.0"}, + {{BuiltinOperator_MUL, 4}, kPendingReleaseVersion}, + {{BuiltinOperator_NON_MAX_SUPPRESSION_V4, 1}, "2.1.0"}, + {{BuiltinOperator_NON_MAX_SUPPRESSION_V5, 1}, "2.1.0"}, {{BuiltinOperator_PAD, 1}, "1.5.0"}, {{BuiltinOperator_PAD, 2}, "1.14.0"}, {{BuiltinOperator_TILE, 1}, "1.10.1"}, @@ -111,18 +140,23 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_RESHAPE, 1}, "1.5.0"}, {{BuiltinOperator_SOFTMAX, 1}, "1.5.0"}, {{BuiltinOperator_SOFTMAX, 2}, "1.14.0"}, + {{BuiltinOperator_SOFTMAX, 3}, kPendingReleaseVersion}, {{BuiltinOperator_SPACE_TO_DEPTH, 1}, "1.5.0"}, {{BuiltinOperator_SPACE_TO_DEPTH, 2}, "1.14.0"}, {{BuiltinOperator_TRANSPOSE, 1}, "1.6.0"}, {{BuiltinOperator_TRANSPOSE, 2}, "1.14.0"}, {{BuiltinOperator_TRANSPOSE, 3}, "1.15.0"}, + {{BuiltinOperator_TRANSPOSE, 4}, kPendingReleaseVersion}, {{BuiltinOperator_LSTM, 1}, "1.7.0"}, {{BuiltinOperator_LSTM, 2}, "1.10.0"}, {{BuiltinOperator_LSTM, 3}, "1.14.0"}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 1}, "1.13.1"}, - {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 1}, "1.14.0"}, + {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 2}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 1}, "1.14.0"}, + {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 2}, "1.14.0"}, + {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 3}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, 1}, "1.14.0"}, + {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, 2}, "1.14.0"}, {{BuiltinOperator_MEAN, 1}, "1.6.0"}, {{BuiltinOperator_MEAN, 2}, "1.14.0"}, {{BuiltinOperator_SUM, 1}, "1.10.0"}, @@ -140,14 +174,22 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_RESIZE_BILINEAR, 3}, "2.2.0"}, {{BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, 1}, "1.13.1"}, {{BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, 2}, "1.14.0"}, + {{BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, 3}, + kPendingReleaseVersion}, + {{BuiltinOperator_RNN, 1}, "1.5.0"}, + {{BuiltinOperator_RNN, 2}, "1.14.0"}, + {{BuiltinOperator_SKIP_GRAM, 1}, "1.5.0"}, {{BuiltinOperator_SQUEEZE, 1}, "1.6.0"}, {{BuiltinOperator_SPLIT, 1}, "1.5.0"}, {{BuiltinOperator_SPLIT, 2}, "1.14.0"}, {{BuiltinOperator_SPLIT, 3}, "1.14.0"}, + {{BuiltinOperator_SPLIT, 4}, kPendingReleaseVersion}, {{BuiltinOperator_SPLIT_V, 1}, "1.13.1"}, + {{BuiltinOperator_SPLIT_V, 2}, kPendingReleaseVersion}, {{BuiltinOperator_STRIDED_SLICE, 1}, "1.6.0"}, {{BuiltinOperator_STRIDED_SLICE, 2}, "1.14.0"}, {{BuiltinOperator_STRIDED_SLICE, 3}, "2.1.0"}, + {{BuiltinOperator_STRIDED_SLICE, 4}, "2.2.0"}, {{BuiltinOperator_TOPK_V2, 1}, "1.7.0"}, {{BuiltinOperator_TOPK_V2, 2}, "1.14.0"}, {{BuiltinOperator_ARG_MAX, 1}, "1.9.0"}, @@ -155,40 +197,53 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_ARG_MIN, 1}, "1.9.0"}, {{BuiltinOperator_ARG_MIN, 2}, "1.14.0"}, {{BuiltinOperator_TRANSPOSE_CONV, 1}, "1.9.0"}, + {{BuiltinOperator_TRANSPOSE_CONV, 2}, "2.2.0"}, + {{BuiltinOperator_TRANSPOSE_CONV, 3}, kPendingReleaseVersion}, {{BuiltinOperator_SPARSE_TO_DENSE, 1}, "1.9.0"}, {{BuiltinOperator_SPARSE_TO_DENSE, 2}, "1.14.0"}, {{BuiltinOperator_SPARSE_TO_DENSE, 3}, "1.15.0"}, {{BuiltinOperator_EXPAND_DIMS, 1}, "1.10.0"}, {{BuiltinOperator_PACK, 1}, "1.11.0"}, {{BuiltinOperator_PACK, 2}, "1.14.0"}, + {{BuiltinOperator_PACK, 3}, 
kPendingReleaseVersion}, {{BuiltinOperator_SHAPE, 1}, "1.10.0"}, {{BuiltinOperator_SLICE, 1}, "1.14.0"}, {{BuiltinOperator_SLICE, 2}, "1.14.0"}, {{BuiltinOperator_SLICE, 3}, "1.14.0"}, {{BuiltinOperator_TANH, 1}, "1.14.0"}, {{BuiltinOperator_TANH, 2}, "1.14.0"}, + {{BuiltinOperator_TANH, 3}, kPendingReleaseVersion}, {{BuiltinOperator_ONE_HOT, 1}, "1.11.0"}, {{BuiltinOperator_UNPACK, 1}, "1.11.0"}, {{BuiltinOperator_UNPACK, 2}, "1.14.0"}, {{BuiltinOperator_UNPACK, 3}, "2.2.0"}, + {{BuiltinOperator_UNPACK, 4}, kPendingReleaseVersion}, {{BuiltinOperator_LEAKY_RELU, 1}, "1.13.1"}, + {{BuiltinOperator_LEAKY_RELU, 2}, kPendingReleaseVersion}, {{BuiltinOperator_LOGISTIC, 1}, "1.14.0"}, {{BuiltinOperator_LOGISTIC, 2}, "1.14.0"}, + {{BuiltinOperator_LOGISTIC, 3}, kPendingReleaseVersion}, {{BuiltinOperator_LOG_SOFTMAX, 1}, "1.14.0"}, {{BuiltinOperator_LOG_SOFTMAX, 2}, "1.14.0"}, + {{BuiltinOperator_LSH_PROJECTION, 1}, "1.5.0"}, {{BuiltinOperator_SQUARED_DIFFERENCE, 1}, "1.13.1"}, {{BuiltinOperator_MIRROR_PAD, 1}, "1.13.1"}, + {{BuiltinOperator_MIRROR_PAD, 2}, kPendingReleaseVersion}, {{BuiltinOperator_UNIQUE, 1}, "1.14.0"}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, 1}, "1.14.0"}, + {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, 2}, "1.14.0"}, {{BuiltinOperator_WHERE, 1}, "1.14.0"}, {{BuiltinOperator_DEQUANTIZE, 1}, "1.13.1"}, {{BuiltinOperator_DEQUANTIZE, 2}, "1.14.0"}, {{BuiltinOperator_DEQUANTIZE, 3}, "1.15.0"}, + {{BuiltinOperator_DEQUANTIZE, 4}, "2.2.0"}, {{BuiltinOperator_REVERSE_SEQUENCE, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 2}, "1.14.0"}, + {{BuiltinOperator_EQUAL, 3}, kPendingReleaseVersion}, {{BuiltinOperator_NOT_EQUAL, 1}, "1.14.0"}, {{BuiltinOperator_NOT_EQUAL, 2}, "1.14.0"}, + {{BuiltinOperator_NOT_EQUAL, 3}, kPendingReleaseVersion}, {{BuiltinOperator_GREATER, 1}, "1.14.0"}, {{BuiltinOperator_GREATER, 2}, "1.14.0"}, {{BuiltinOperator_GREATER_EQUAL, 1}, "1.14.0"}, @@ -197,10 +252,12 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_LESS, 2}, "1.14.0"}, {{BuiltinOperator_LESS_EQUAL, 1}, "1.14.0"}, {{BuiltinOperator_LESS_EQUAL, 2}, "1.14.0"}, + {{BuiltinOperator_SCATTER_ND, 1}, "2.1.0"}, {{BuiltinOperator_SEGMENT_SUM, 1}, "2.2.0"}, {{BuiltinOperator_SELECT, 1}, "1.14.0"}, {{BuiltinOperator_SELECT, 2}, "1.14.0"}, {{BuiltinOperator_SELECT_V2, 1}, "2.2.0"}, + {{BuiltinOperator_IF, 1}, "1.15.0"}, {{BuiltinOperator_FLOOR_DIV, 1}, "1.14.0"}, {{BuiltinOperator_FLOOR_DIV, 2}, "1.14.0"}, {{BuiltinOperator_FLOOR, 1}, "1.9.0"}, @@ -208,6 +265,8 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_MATRIX_DIAG, 1}, "1.14.0"}, {{BuiltinOperator_MATRIX_SET_DIAG, 1}, "1.14.0"}, {{BuiltinOperator_ELU, 1}, "1.14.0"}, + {{BuiltinOperator_QUANTIZE, 1}, "1.14.0"}, + {{BuiltinOperator_QUANTIZE, 2}, "1.15.0"}, {{BuiltinOperator_ROUND, 1}, "1.14.0"}, {{BuiltinOperator_RELU, 1}, "1.5.0"}, {{BuiltinOperator_RELU, 2}, "2.1.0"}, @@ -224,17 +283,29 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { {{BuiltinOperator_RANGE, 1}, "1.13.0"}, {{BuiltinOperator_SIN, 1}, "1.9.0"}, {{BuiltinOperator_LOG, 1}, "1.14.0"}, + {{BuiltinOperator_SQRT, 1}, "1.10.0"}, {{BuiltinOperator_RSQRT, 1}, "1.10.0"}, {{BuiltinOperator_SQUARE, 1}, "1.12.0"}, {{BuiltinOperator_ZEROS_LIKE, 1}, "1.12.0"}, {{BuiltinOperator_ABS, 1}, "1.13.0"}, {{BuiltinOperator_HARD_SWISH, 1}, "1.15.0"}, {{BuiltinOperator_FILL, 1}, "1.13.0"}, + {{BuiltinOperator_FILL, 2}, kPendingReleaseVersion}, 
{{BuiltinOperator_REVERSE_V2, 1}, "1.14.0"}, {{BuiltinOperator_REVERSE_V2, 2}, "2.2.0"}, {{BuiltinOperator_RANK, 1}, "1.14.0"}, + {{BuiltinOperator_WHILE, 1}, "1.15.0"}, }); + std::pair version_key = {op_code, op_version}; + auto it = op_version_map->find(version_key); + if (it == op_version_map->end()) { + return std::string(); + } + return it->second; +} + +void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { auto model = GetMutableModel(model_buffer_pointer); std::string model_min_version; auto subgraphs = model->subgraphs(); @@ -244,19 +315,18 @@ void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer) { const Operator* op = subgraph->operators()->Get(j); const OperatorCode* op_code = model->operator_codes()->Get(op->opcode_index()); - std::pair version_key = {op_code->builtin_code(), - op_code->version()}; - auto it = op_version_map->find(version_key); - if (it == op_version_map->end() || - it->second == kPendingReleaseOpVersion) { + std::string runtime_version = FindMinimumRuntimeVersionForOp( + op_code->builtin_code(), op_code->version()); + if (runtime_version.empty() || + runtime_version == kPendingReleaseVersion) { // In case we didn't find the current op in the map, or the operator // doesn't have a minimum runtime version associated, continue. continue; } - if (CompareRuntimeVersion(model_min_version, it->second)) { + if (CompareRuntimeVersion(model_min_version, runtime_version)) { // Current min model runtime version should be bumped if we see a higher // op version. - model_min_version = it->second; + model_min_version = runtime_version; } } } diff --git a/tensorflow/lite/tools/versioning/runtime_version.h b/tensorflow/lite/tools/versioning/runtime_version.h index ad88bd2ab89..64329eb1118 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.h +++ b/tensorflow/lite/tools/versioning/runtime_version.h @@ -18,11 +18,17 @@ limitations under the License. #include #include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/schema/mutable/schema_generated.h" namespace tflite { // Update minimum runtime version of the given TFL flatbuffer model. void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer); +// Find the minimum runtime version of a given op version. Return an empty +// string the version is not registered. +std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, + int op_version); + // Returns true if the first version string precedes the second. // For example, '1.9' should precede '1.14', also '1.14' should precede // '1.14.1'. If two version string is equal, then false will be returned. diff --git a/tensorflow/lite/tools/versioning/runtime_version_test.cc b/tensorflow/lite/tools/versioning/runtime_version_test.cc index c7b70552340..c32de228cc3 100644 --- a/tensorflow/lite/tools/versioning/runtime_version_test.cc +++ b/tensorflow/lite/tools/versioning/runtime_version_test.cc @@ -16,7 +16,8 @@ limitations under the License. #include #include - +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { TEST(OpVersionTest, CompareRuntimeVersion) { @@ -31,4 +32,24 @@ TEST(OpVersionTest, CompareRuntimeVersion) { EXPECT_FALSE(CompareRuntimeVersion("", "")); } +// This test will fail if an op version is added to a builtin op, but not +// registered to runtime version. 
+TEST(OpVersionTest, OpversionMissing) { + tflite::ops::builtin::BuiltinOpResolver resolver; + + for (int id = BuiltinOperator_MIN; id <= BuiltinOperator_MAX; ++id) { + for (int version = 1;; ++version) { + auto op_code = static_cast(id); + if (resolver.FindOp(op_code, version) == nullptr) break; + // Throw error if the version is not registered in runtime version. + std::string runtime_version = + FindMinimumRuntimeVersionForOp(op_code, version); + EXPECT_NE(runtime_version, "") + << "Please add the version " << version << " of " + << tflite::EnumNamesBuiltinOperator()[op_code] + << " runtime_version.cc"; + } + } +} + } // namespace tflite From ec0e105c6fe537969a736ddb546c277ae18b9282 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Tue, 16 Jun 2020 00:47:28 -0700 Subject: [PATCH 0268/1390] Fix build failure of list_flex_ops_main in OSS The cc_binary required --config=monolithic which can't be passed into a native.genrule. Using tf_cc_binary solves the build failure. PiperOrigin-RevId: 316631689 Change-Id: Ia706d532578ccbf5bc8f172f6344f166d05531fb --- tensorflow/lite/tools/BUILD | 7 ++++--- tensorflow/lite/tools/list_flex_ops_test.cc | 23 +++++++++++---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index c34453e0809..89d3da1ec6a 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -1,5 +1,6 @@ load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") load("//tensorflow/lite:build_def.bzl", "tflite_copts") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") package( default_visibility = [ @@ -271,7 +272,7 @@ cc_library( # This tool list flex ops and kernels inside a TFLite file. # It is used to generate header file for selective registration. -cc_binary( +tf_cc_binary( name = "list_flex_ops_main", srcs = ["list_flex_ops_main.cc"], visibility = ["//visibility:public"], @@ -282,7 +283,7 @@ cc_binary( ], ) -cc_test( +tf_cc_test( name = "list_flex_ops_test", srcs = ["list_flex_ops_test.cc"], data = [ @@ -293,7 +294,6 @@ cc_test( "//tensorflow/lite:testdata/test_model_broken.bin", ], tags = [ - "no_oss", # Currently requires --config=monolithic, b/118895218. "tflite_not_portable_android", "tflite_not_portable_ios", ], @@ -301,6 +301,7 @@ cc_test( ":list_flex_ops", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:protobuf", + "//tensorflow/core/platform:resource_loader", "//tensorflow/lite/kernels:test_util", "@com_google_googletest//:gtest", "@flatbuffers", diff --git a/tensorflow/lite/tools/list_flex_ops_test.cc b/tensorflow/lite/tools/list_flex_ops_test.cc index 67ddc06325a..872d7509d0c 100644 --- a/tensorflow/lite/tools/list_flex_ops_test.cc +++ b/tensorflow/lite/tools/list_flex_ops_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/lite/kernels/test_util.h" namespace tflite { @@ -31,8 +32,9 @@ class FlexOpsListTest : public ::testing::Test { protected: FlexOpsListTest() {} - void ReadOps(const string& model_path) { - auto model = FlatBufferModel::BuildFromFile(model_path.data()); + void ReadOps(const string& path) { + std::string full_path = tensorflow::GetDataDependencyFilepath(path); + auto model = FlatBufferModel::BuildFromFile(full_path.data()); AddFlexOpsFromModel(model->GetModel(), &flex_ops_); output_text_ = OpListToJSONString(flex_ops_); } @@ -84,30 +86,29 @@ class FlexOpModel : public SingleOpModel { }; TEST_F(FlexOpsListTest, TestModelsNoFlex) { - ReadOps("third_party/tensorflow/lite/testdata/test_model.bin"); + ReadOps("tensorflow/lite/testdata/test_model.bin"); EXPECT_EQ(output_text_, "[]"); } TEST_F(FlexOpsListTest, TestBrokenModel) { EXPECT_DEATH_IF_SUPPORTED( - ReadOps("third_party/tensorflow/lite/testdata/test_model_broken.bin"), - ""); + ReadOps("tensorflow/lite/testdata/test_model_broken.bin"), ""); } TEST_F(FlexOpsListTest, TestZeroSubgraphs) { - ReadOps("third_party/tensorflow/lite/testdata/0_subgraphs.bin"); + ReadOps("tensorflow/lite/testdata/0_subgraphs.bin"); EXPECT_EQ(output_text_, "[]"); } TEST_F(FlexOpsListTest, TestFlexAdd) { - ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); + ReadOps("tensorflow/lite/testdata/multi_add_flex.bin"); EXPECT_EQ(output_text_, "[[\"Add\", \"BinaryOp>\"]]"); } TEST_F(FlexOpsListTest, TestTwoModel) { - ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); - ReadOps("third_party/tensorflow/lite/testdata/softplus_flex.bin"); + ReadOps("tensorflow/lite/testdata/multi_add_flex.bin"); + ReadOps("tensorflow/lite/testdata/softplus_flex.bin"); EXPECT_EQ(output_text_, "[[\"Add\", \"BinaryOp>\"],\n[\"Softplus\", \"SoftplusOp>\"]]"); } From d43a0150f891b938dfa4247744e4d18e2e696e06 Mon Sep 17 00:00:00 2001 From: Jing Pu Date: Tue, 16 Jun 2020 00:57:18 -0700 Subject: [PATCH 0269/1390] Add a pattern to legalize hlo.reduce to tf.Min. PiperOrigin-RevId: 316632939 Change-Id: I7fbc90c1a75e5cc8bafb6b87475284de6ebe91a7 --- .../mlir/tensorflow/tests/legalize_hlo.mlir | 21 +++++++ .../tensorflow/transforms/legalize_hlo.cc | 57 +++++++++++++++---- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 6a32bcb5254..c8542ab3bae 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -773,6 +773,20 @@ func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } + +func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { + // "0x7F800000" represents INF for f32. 
+ %0 = xla_hlo.constant dense<0x7F800000> : tensor + %1 = "xla_hlo.reduce"(%arg0, %0) ( { + ^bb0(%arg1: tensor, %arg2: tensor): + %2 = xla_hlo.minimum %arg1, %arg2 : tensor + "xla_hlo.return"(%2) : (tensor) -> () + }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x256xf32>, tensor) -> tensor<1xf32> + return %1 : tensor<1xf32> +} + + + // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // CHECK-LABEL: func @biasAdd_NHWC( @@ -1689,3 +1703,10 @@ func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // CHECK: [[VAL_418:%.*]] = "tf.Max"([[VAL_416:%.*]], [[VAL_417:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return [[VAL_418]] : tensor<1xf32> // CHECK: } + +// CHECK-LABEL: func @convert_reduce_to_min( +// CHECK-SAME: [[VAL_419:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: [[VAL_420:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: [[VAL_421:%.*]] = "tf.Min"([[VAL_419:%.*]], [[VAL_420:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return [[VAL_421]] : tensor<1xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index ad7abc08d94..9bb23213919 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -489,9 +489,9 @@ LogicalResult MatchReduceOpInput(xla_hlo::ReduceOp reduce_op) { return success(); } -// TODO(b/157192370): This "xla_hlo::ReduceOp" can corresponds to many TF ops -// with different ops in reduce_op.body. Now we only match to "tf.Max" and -// "tf.Sum". +// TODO(jingpu): This "xla_hlo::ReduceOp" can corresponds to many TF ops +// with different ops in reduce_op.body. Now we only match to "tf.Max", "tf.Min" +// and "tf.Sum". class ConvertReduceOpToTfSum : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -504,15 +504,13 @@ class ConvertReduceOpToTfSum : public OpConversionPattern { Operation *first_op = &reduce_op.body().front().front(); if (!llvm::isa(first_op)) return failure(); - // In `MatchReduceOpInput` function, we only match that the + // In `MatchReduceOpInput` function, we already match that the // "xla_hlo::ReduceOp" only has one input, one init_value and one result. auto input = reduce_op.operands()[0]; // Get reduction dimension. DenseIntElementsAttr dimension = reduce_op.dimensions(); SmallVector reduce_dims; - const int64_t input_rank = input.getType().cast().getRank(); for (const int64_t &dim : dimension.getValues()) { - if (dim < 0 || dim >= input_rank) return failure(); reduce_dims.emplace_back(dim); } @@ -545,15 +543,13 @@ class ConvertReduceOpToTfMax : public OpConversionPattern { Operation *first_op = &reduce_op.body().front().front(); if (!llvm::isa(first_op)) return failure(); - // In `MatchReduceOpInput` function, we only match that the + // In `MatchReduceOpInput` function, we already match that the // "xla_hlo::ReduceOp" only has one input, one init_value and one result. auto input = reduce_op.operands()[0]; // Get reduction dimension. 
DenseIntElementsAttr dimension = reduce_op.dimensions(); SmallVector reduce_dims; - const int64_t input_rank = input.getType().cast().getRank(); for (const int64_t &dim : dimension.getValues()) { - if (dim < 0 || dim >= input_rank) return failure(); reduce_dims.emplace_back(dim); } @@ -576,6 +572,47 @@ class ConvertReduceOpToTfMax : public OpConversionPattern { }; }; +class ConvertReduceOpToTfMin : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + xla_hlo::ReduceOp reduce_op, ArrayRef args, + ConversionPatternRewriter &rewriter) const final { + if (failed(MatchReduceOpInput(reduce_op))) return failure(); + + Operation *first_op = &reduce_op.body().front().front(); + if (!llvm::isa(first_op)) return failure(); + + // In `MatchReduceOpInput` function, we already match that the + // "xla_hlo::ReduceOp" only has one input, one init_value and one result. + Value input = reduce_op.operands()[0]; + // Get reduction dimension. + DenseIntElementsAttr dimension = reduce_op.dimensions(); + SmallVector reduce_dims; + for (const int64_t &dim : dimension.getValues()) { + reduce_dims.emplace_back(dim); + } + + // Check initial value is +INF. + DenseFPElementsAttr init_value; + if (!matchPattern(reduce_op.init_values()[0], m_Constant(&init_value)) || + !init_value.isSplat() || + !init_value.getSplatValue().isInfinity() || + init_value.getSplatValue().isNegative()) + return failure(); + + auto dim_type = RankedTensorType::get( + {static_cast(reduce_dims.size())}, rewriter.getI64Type()); + auto reduction_indices = rewriter.create( + reduce_op.getLoc(), dim_type, rewriter.getI64TensorAttr(reduce_dims)); + rewriter.replaceOpWithNewOp( + reduce_op, reduce_op.getType(0), input, reduction_indices, + /*keep_dim=*/rewriter.getBoolAttr(false)); + return success(); + }; +}; + class LegalizeHloToTf : public PassWrapper { public: LegalizeHloToTf() = default; @@ -709,7 +746,7 @@ void LegalizeHloToTf::runOnFunction() { OwningRewritePatternList patterns; populateWithGenerated(&context, &patterns); patterns.insert(&context); + ConvertReduceOpToTfMin, ConvertReduceOpToTfSum>(&context); ConversionTarget target(context); target.addLegalDialect(); From 44db81e3241f98e61d386aeb8b1ee0dee33e04b6 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Tue, 16 Jun 2020 01:16:13 -0700 Subject: [PATCH 0270/1390] Fix broken hyperlinks in guide docs PiperOrigin-RevId: 316635046 Change-Id: I604b94075e2e10520bbfb2885089e534f3a649cd --- tensorflow/lite/g3doc/guide/ops_select.md | 2 +- tensorflow/lite/g3doc/performance/best_practices.md | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/ops_select.md b/tensorflow/lite/g3doc/guide/ops_select.md index 2226d86b1c9..0fa608cfa96 100644 --- a/tensorflow/lite/g3doc/guide/ops_select.md +++ b/tensorflow/lite/g3doc/guide/ops_select.md @@ -53,7 +53,7 @@ partially supported by TensorFlow Lite, and one would like to avoid those limitations. The following example shows how to use this feature in the -[`TFLiteConverter`](./convert/python_api.md) Python API. +[`TFLiteConverter`](../convert/python_api.md) Python API. 
```python
import tensorflow as tf

diff --git a/tensorflow/lite/g3doc/performance/best_practices.md b/tensorflow/lite/g3doc/performance/best_practices.md
index 94436865e48..e4abb564b26 100644
--- a/tensorflow/lite/g3doc/performance/best_practices.md
+++ b/tensorflow/lite/g3doc/performance/best_practices.md
@@ -20,8 +20,9 @@ accuracy and latency tradeoffs for some common image classification models.

 One example of models optimized for mobile devices are
 [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile
-vision applications. [Hosted models](../models/hosted.md) lists several other
-models that have been optimized specifically for mobile and embedded devices.
+vision applications. [Hosted models](../guide/hosted_models.md) lists several
+other models that have been optimized specifically for mobile and embedded
+devices.

 You can retrain the listed models on your own dataset by using transfer
 learning. Check out our transfer learning tutorial for

From aa99cf218c8bf13aeb15e64ec4c62ea14ecb5753 Mon Sep 17 00:00:00 2001
From: Terry Heo
Date: Tue, 16 Jun 2020 01:27:48 -0700
Subject: [PATCH 0271/1390] Enable flex delegate on tensorflow.lite.Interpreter Python package

Usually, the flex delegate is enabled by a symbol override of the
AcquireFlexDelegate() function. But this approach doesn't work well with
shared libraries.

Since pywrap_tensorflow_internal.so is available in the TensorFlow pip
package, I've made the following changes to enable the flex delegate.

- Included the flex delegate module in pywrap_tensorflow_internal.so. This
  file already contains most TF internal logic, and adding the TFLite flex
  delegate increases the output size by about 72K.
- Added a new function, TF_AcquireFlexDelegate(), in the delegate module.
- Updated the logic in AcquireFlexDelegate() of interpreter_builder.cc to
  check the availability of pywrap_tensorflow_internal.so and look up the
  TF_AcquireFlexDelegate() symbol to enable the flex delegate.

Also updated python/lite_flex_test.py since the flex delegate is now
supported with the Python API.

PiperOrigin-RevId: 316636275
Change-Id: I13a3246f27860ac0551fb04d81a84d4e82997ebc
---
 tensorflow/lite/delegates/flex/delegate.cc | 7 +++
 tensorflow/lite/interpreter_builder.cc | 17 ++++++
 tensorflow/lite/python/BUILD | 3 +-
 tensorflow/lite/python/lite_flex_test.py | 61 +++++++++++++---------
 tensorflow/python/BUILD | 1 +
 5 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
index 4741bddc2f5..b8b0d4e6d01 100644
--- a/tensorflow/lite/delegates/flex/delegate.cc
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -136,3 +136,10 @@ TfLiteStatus FlexDelegate::CopyFromBufferHandle(
 }

 }  // namespace tflite
+
+// Exported C interface function which is used by AcquireFlexDelegate() in
+// interpreter_builder.cc. To export the function name globally, the name
+// must match the patterns in tf_version_script.lds
+extern "C" tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() {
+  return tflite::AcquireFlexDelegate();
+}
diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc
index 43d81ef0770..d73b298e595 100644
--- a/tensorflow/lite/interpreter_builder.cc
+++ b/tensorflow/lite/interpreter_builder.cc
@@ -14,6 +14,9 @@ limitations under the License.
==============================================================================*/ #include "tensorflow/lite/interpreter_builder.h" +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) +#include +#endif #include #include #include @@ -114,6 +117,20 @@ const char* kEmptyTensorName = ""; // For flex delegate, see also the strong override in // lite/delegates/flex/delegate.cc. TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() { +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) + // If _pywrap_tensorflow_internal.so is available, use + // TF_AcquireFlexDelegate() to initialize flex delegate. + void* lib_tf_internal = + dlopen("_pywrap_tensorflow_internal.so", RTLD_NOW | RTLD_LOCAL); + if (lib_tf_internal) { + auto TF_AcquireFlexDelegate = + reinterpret_cast( + dlsym(lib_tf_internal, "TF_AcquireFlexDelegate")); + if (TF_AcquireFlexDelegate) { + return TF_AcquireFlexDelegate(); + } + } +#endif return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); } diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index c1f37c81b7f..b0f605ed50d 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -194,8 +194,7 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - # TODO(b/111881877): Enable in oss after resolving op registry issues. - "no_oss", + "no_mac", # TODO(b/159077703): Enable Python API Flex support on MacOS. "no_windows", ], deps = [ diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py index 26bee206d27..ffc157c2128 100644 --- a/tensorflow/lite/python/lite_flex_test.py +++ b/tensorflow/lite/python/lite_flex_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from absl.testing import parameterized +import numpy as np from tensorflow.lite.python import lite from tensorflow.lite.python.interpreter import Interpreter @@ -41,8 +42,7 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): ('DisableMlirConverter', False)) # disable mlir def testFlexMode(self, enable_mlir): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) + in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -54,19 +54,22 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. 
interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) def testDeprecatedFlags(self): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) + in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -83,14 +86,18 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) class FromConcreteFunctionTest(test_util.TensorFlowTestCase, @@ -114,14 +121,18 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase, converter.experimental_new_converter = enable_mlir tflite_model = converter.convert() - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. 
interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([4.0], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([24.0], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) if __name__ == '__main__': diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 87048ba9d40..343a95b85e9 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -6046,6 +6046,7 @@ pywrap_tensorflow_macro( "//tensorflow/core/profiler/internal:print_model_analysis", "//tensorflow/core/profiler/internal/cpu:python_tracer", "//tensorflow/tools/graph_transforms:transform_graph_lib", + "//tensorflow/lite/delegates/flex:delegate", "//tensorflow/lite/toco/python:toco_python_api", "//tensorflow/python/eager:pywrap_tfe_lib", "//tensorflow/core/util/tensor_bundle", From 8950c470bb11a9b94c0dd08d73156008dfac60c9 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Tue, 16 Jun 2020 01:54:26 -0700 Subject: [PATCH 0272/1390] Remove automatic control dep wrapping from layers in v2. PiperOrigin-RevId: 316638920 Change-Id: Iad14b1a4b0b14052f34784401b375a14b49a7641 --- tensorflow/python/keras/engine/base_layer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index a0ee25417c0..628e74db27a 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -40,7 +40,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import execute from tensorflow.python.eager import function from tensorflow.python.eager import monitoring -from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -1105,17 +1104,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): try: with ops.enable_auto_cast_variables(self._compute_dtype_object): - # Add auto_control_deps in V2 when they are not already added by - # a `tf.function`. - if (ops.executing_eagerly_outside_functions() and - not base_layer_utils.is_in_eager_or_tf_function()): - with auto_control_deps.AutomaticControlDependencies() as acd: - outputs = call_fn(cast_inputs, *args, **kwargs) - # Wrap Tensors in `outputs` in `tf.identity` to avoid - # circular dependencies. - outputs = base_layer_utils.mark_as_return(outputs, acd) - else: - outputs = call_fn(cast_inputs, *args, **kwargs) + outputs = call_fn(cast_inputs, *args, **kwargs) except errors.OperatorNotAllowedInGraphError as e: raise TypeError('You are attempting to use Python control ' From e2b5397f126ba9cbc76a840ea0a46331e0f10897 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 02:02:06 -0700 Subject: [PATCH 0273/1390] Update GraphDef version to 434. 
PiperOrigin-RevId: 316639748
Change-Id: I2f62575a1ffdf72dbbafd5a2d6a10ae2a64d4b7c
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index e08b166df2c..8e3c66edfc2 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0

-#define TF_GRAPH_DEF_VERSION 433 // Updated: 2020/6/15
+#define TF_GRAPH_DEF_VERSION 434 // Updated: 2020/6/16

 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From d2cba310e80fc545cb0f8075d32335c170d547f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 16 Jun 2020 02:02:10 -0700
Subject: [PATCH 0274/1390] compat: Update forward compatibility horizon to 2020-06-16

PiperOrigin-RevId: 316639760
Change-Id: I5bfbc17f255457595771a2a4636abd59ee03feb1
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 33cd3404b4d..314acfdd38f 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 15)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 16)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None

From 83f19c6a9e84fc6971ad0a7df5874603237a595f Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Mon, 15 Jun 2020 17:16:56 +0000
Subject: [PATCH 0275/1390] Fix unknown output shape issue in autograph for tf.equal

This PR tries to address the issue raised in 40471 where the output shape
of an autograph function consisting of tf.equal could not be inferred
correctly.

Specifically, `x.shape == [None, 10, 1]` and `y.shape == [None, 1, 4]`
only yield `shape == None` (it should be `shape == [None, 10, 4]`).

The reason was that the shape inference function for equal didn't capture
the case where both x's and y's dims are None.

This PR fixes the issue.

This PR fixes 40471.

Signed-off-by: Yong Tang
---
 tensorflow/core/framework/common_shape_fns.cc | 3 +++
 tensorflow/core/ops/math_ops_test.cc | 2 +-
 .../python/autograph/operators/logical_test.py | 14 ++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 113adbdd432..7567db03c23 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1936,6 +1936,7 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
       // in C++ op code, we must still assert that the unknown dim is either 1
       // or the same as the known dim.
       // - If either dimension is 1, the other dimension is the output.
+ // - If both are unknown then dimension is unknown if (c->Value(dim_x) > 1) { if (!incompatible_shape_error) { *out = c->UnknownShape(); @@ -1954,6 +1955,8 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c, dims.push_back(dim_x); } else if (dim_y.SameHandle(dim_x)) { dims.push_back(dim_x); + } else if (!c->ValueKnown(dim_x) && !c->ValueKnown(dim_y)) { + dims.push_back(c->UnknownDim()); } else { if (!incompatible_shape_error) { *out = c->UnknownShape(); diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc index 5c69a2a7f1c..a2837d88bde 100644 --- a/tensorflow/core/ops/math_ops_test.cc +++ b/tensorflow/core/ops/math_ops_test.cc @@ -120,7 +120,7 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) { INFER_OK(op, "[1];[?]", "[d1_0]"); INFER_OK(op, "[?];[2]", incompatible_shape_error ? "[d1_0]" : "?"); INFER_OK(op, "[2];[?]", incompatible_shape_error ? "[d0_0]" : "?"); - INFER_OK(op, "[?];[?]", incompatible_shape_error ? "[?]" : "?"); + INFER_OK(op, "[?];[?]", "[?]"); INFER_OK(op, "[];[?]", "[d1_0]"); INFER_OK(op, "[?];[]", "[d0_0]"); diff --git a/tensorflow/python/autograph/operators/logical_test.py b/tensorflow/python/autograph/operators/logical_test.py index e22f39932d1..0eab302a825 100644 --- a/tensorflow/python/autograph/operators/logical_test.py +++ b/tensorflow/python/autograph/operators/logical_test.py @@ -19,7 +19,9 @@ from __future__ import division from __future__ import print_function from tensorflow.python.autograph.operators import logical +from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.platform import test @@ -83,6 +85,18 @@ class LogicalOperatorsTest(test.TestCase): t = logical.not_(self._tf_false()) self.assertEqual(self.evaluate(t), True) + # Test case for GitHub issue 40471 + def test_equal_output_shapes(self): + + @def_function.function(input_signature=[ + tensor_spec.TensorSpec([None, 10, 1]), + tensor_spec.TensorSpec([None, 1, 4])]) + def f(x, y): + z = x == y + return z + + self.assertAllEqual(f.get_concrete_function().output_shapes, [None, 10, 4]) + if __name__ == '__main__': test.main() From d52f3465f56882ad169759a942448843d1b4b589 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 03:57:26 -0700 Subject: [PATCH 0276/1390] Remove automatic control dep wrapping from layers in v2. 
PiperOrigin-RevId: 316652071 Change-Id: I90d3568fa727c8370de1f20e35742efbd9d615ac --- tensorflow/python/keras/engine/base_layer.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 628e74db27a..a0ee25417c0 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -40,6 +40,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import execute from tensorflow.python.eager import function from tensorflow.python.eager import monitoring +from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -1104,7 +1105,17 @@ class Layer(module.Module, version_utils.LayerVersionSelector): try: with ops.enable_auto_cast_variables(self._compute_dtype_object): - outputs = call_fn(cast_inputs, *args, **kwargs) + # Add auto_control_deps in V2 when they are not already added by + # a `tf.function`. + if (ops.executing_eagerly_outside_functions() and + not base_layer_utils.is_in_eager_or_tf_function()): + with auto_control_deps.AutomaticControlDependencies() as acd: + outputs = call_fn(cast_inputs, *args, **kwargs) + # Wrap Tensors in `outputs` in `tf.identity` to avoid + # circular dependencies. + outputs = base_layer_utils.mark_as_return(outputs, acd) + else: + outputs = call_fn(cast_inputs, *args, **kwargs) except errors.OperatorNotAllowedInGraphError as e: raise TypeError('You are attempting to use Python control ' From a5ebf37c1d67340559b13be2b622d288af47368b Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 16 Jun 2020 04:20:01 -0700 Subject: [PATCH 0277/1390] Allow escaping return values in kernel lowering. For simple cases where the launch only has function arguments and results as operands, the kernel signature will now be rewritten to have its operands ordered in the order of the function arguments followed by results. This only works for simple functions without control flow or multiple launches and is intended for cases where the kernel is meant to be extracted. If the host-side is also used, signature rewriting should be disabled via the provided flag. PiperOrigin-RevId: 316654295 Change-Id: Ie3e78b78bab2191610875acb79506e5986a821af --- .../xla/service/mlir_gpu/kernel_lowering.cc | 81 +++++++++++++++---- .../xla/service/mlir_gpu/kernel_lowering.h | 2 +- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index f1e01bba27e..b0cbddcdb92 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -43,6 +43,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/BufferPlacement.h" // from @llvm-project @@ -278,15 +279,34 @@ struct MoveScalarComputationsIntoGpuLaunch } }; -// TODO(herhut): Make this a proper thing. 
-struct FixKernelFunctionSignatures
-    : mlir::PassWrapper {
+// Sort the operands to the kernel for a deterministic order. First come
+// operands that are defined by function arguments, followed by operands that
+// are returned from the function. This only works for simple functions without
+// control flow and can be used in cases where the kernel is extracted and used
+// independently of the host-side code.
+struct RewriteKernelSignature
+    : mlir::PassWrapper {
   void runOnFunction() override {
     mlir::FuncOp func = getFunction();
     mlir::ModuleOp module = func.getParentOfType();
     getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) {
       mlir::gpu::GPUFuncOp kernel =
           module.lookupSymbol(launchOp.kernel());
+
+      if (kernel.getNumFuncArguments() !=
+          func.getNumArguments() + func.getNumResults()) {
+        kernel.emitError()
+            << "number of kernel arguments does not match the number "
+            << "of arguments and results of the surrounding function";
+        signalPassFailure();
+        return;
+      }
+      if (func.getBlocks().size() != 1) {
+        func.emitError() << "surrounding function has more than one block";
+        signalPassFailure();
+        return;
+      }
+
       // Compute a map from function arguments to kernel function operands.
       mlir::BlockAndValueMapping func_to_kernel;
       for (mlir::BlockArgument arg : func.getArguments()) {
@@ -297,27 +317,54 @@ struct FixKernelFunctionSignatures
           }
         }
       }
+      // Also add function results that are computed by the launch.
+      mlir::Operation* returnOp = func.getBody().back().getTerminator();
+      for (mlir::Value result : returnOp->getOperands()) {
+        for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) {
+          if (launchOp.getKernelOperand(i) == result) {
+            func_to_kernel.map(result, kernel.getArgument(i));
+            break;
+          }
+        }
+      }

-      // Create a new kernel function with modified signature. We know that it
-      // will have the same signature as the original function, so just reuse it
-      // here.
+      // Create a new kernel function with modified signature. It will have the
+      // parameters and result types of the original function as its parameter
+      // types and otherwise will be void.
       auto gpu_module = kernel.getParentOfType();
       mlir::OpBuilder kernel_builder(gpu_module.body());
+      auto operand_types = llvm::to_vector<4>(llvm::concat(
+          func.getType().getInputs(), func.getType().getResults()));
       auto new_kernel = kernel_builder.create(
-          kernel.getLoc(), kernel.getName(), func.getType());
+          kernel.getLoc(), kernel.getName(),
+          kernel_builder.getFunctionType(operand_types, {}));
       new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(),
                          kernel_builder.getUnitAttr());

       // Create a map from old kernel argument to new one.
mlir::BlockAndValueMapping old_kernel_to_new; - for (int i = 0, e = kernel.getNumFuncArguments(); i < e; ++i) { + for (int i = 0, e = func.getNumArguments(); i < e; ++i) { mlir::Value func_arg = func.getArgument(i); mlir::Value new_kernel_arg = new_kernel.getArgument(i); mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); if (!old_kernel_arg) { kernel.emitOpError() << "argument " << i - << "to kernel is not an argument to the containing function"; + << " to containing function is not an argument to the kernel"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + for (int i = 0, e = returnOp->getNumOperands(); i < e; ++i) { + mlir::Value ret_op = returnOp->getOperand(i); + mlir::Value new_kernel_arg = + new_kernel.getArgument(func.getNumArguments() + i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(ret_op); + if (!old_kernel_arg) { + kernel.emitOpError() + << "result " << i + << " of containing function is not an argument to the kernel"; signalPassFailure(); return; } @@ -328,13 +375,16 @@ struct FixKernelFunctionSignatures kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); kernel_builder.create( new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); - // Now create a new launchOp calling the new kernel. We can just forward - // the arguments of the function to the launch, as we fixed the - // signature. + // Now create a new launchOp calling the new kernel. We need to forward + // the arguments of the surrounding function and operands to the return. + mlir::SmallVector new_operands; + new_operands.reserve(new_kernel.getNumFuncArguments()); + new_operands.append(func.args_begin(), func.args_end()); + new_operands.append(returnOp->operand_begin(), returnOp->operand_end()); mlir::OpBuilder launch_builder(launchOp); launch_builder.create( launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), - launchOp.getBlockSizeOperandValues(), func.getArguments()); + launchOp.getBlockSizeOperandValues(), new_operands); // Launch does not have results, so we can just erase it. And the kernel // also needs to go. launchOp.erase(); @@ -418,7 +468,6 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Transform LHLO operations to LinAlg. pm.addPass(::mlir::xla_lhlo::createLegalizeLhloToLinalgPass()); // Fuse linalg operations. - // TODO(herhut): Make tiling conigurable. pm.addPass(::mlir::xla_lhlo::createLhloFuseLinalg(/*use_parallel_loops=*/true, tiling_for_unrolling)); // Legalize reduce operations directly to GPU dialect. 
@@ -462,8 +511,8 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's // signature - if (options.fix_signature) { - pm.addPass(absl::make_unique()); + if (options.rewrite_signature) { + pm.addPass(absl::make_unique()); } if (failed(pm.run(module))) { return InternalError("Lowering to GPU kernels failed."); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index 7b5d5c35c05..77cf75b9e47 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -27,7 +27,7 @@ struct LowerLHLOToGPUOptions { llvm::ArrayRef tile_sizes = {16, 64}; llvm::ArrayRef unroll_factors = {}; bool collapse_parallel_loops = true; - bool fix_signature = true; + bool rewrite_signature = true; }; Status LowerLHLOToGPU(mlir::ModuleOp module, From fb7fe20ec732f4c13cf9a5e52697f0ae4e8e17b6 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 16 Jun 2020 05:20:07 -0700 Subject: [PATCH 0278/1390] Extend lowering from hlo to lhlo to also support buffer allocation with escaping result buffers. This is now a flag to the pass (defaults to the current preallocation behavior). PiperOrigin-RevId: 316660810 Change-Id: I89e46b494d09acf2dbe14b300ee5b9df431ab09c --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 323 +++++++++--------- .../xla/transforms/hlo_legalize_to_lhlo.cc | 36 +- .../compiler/mlir/xla/transforms/passes.h | 10 +- 3 files changed, 203 insertions(+), 166 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 56429249d99..f3ce29f1bd2 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,12 +1,13 @@ -// RUN: xla-opt -hlo-legalize-to-lhlo -buffer-placement -split-input-file %s -o - | FileCheck %s +// RUN: xla-opt -hlo-legalize-to-lhlo -buffer-placement -split-input-file %s -o - | FileCheck --check-prefixes=PRE,BOTH %s +// RUN: xla-opt -hlo-legalize-to-lhlo=results-escape-function=true -buffer-placement -split-input-file %s -o - | FileCheck --check-prefixes=ESC,BOTH %s -// CHECK-LABEL: func @attrs +// BOTH-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.exponential"(%tensor_operand) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.exponential"(%{{.*}}, %{{.*}}) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} + // BOTH: "xla_lhlo.exponential"(%{{.*}}, %{{.*}}) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -16,13 +17,16 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { func @return_func(%arg0: tensor<4xf32>) -> tensor<4xf32> { return %arg0 : tensor<4xf32> } -// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) -// CHECK-NEXT: "xla_lhlo.copy"(%[[ARG0]], %[[RESULT]]) : ([[TYPE]], [[TYPE]]) -> () -// CHECK-NEXT: return +// PRE: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) +// PRE-NEXT: "xla_lhlo.copy"(%[[ARG0]], %[[RESULT]]) : ([[TYPE]], [[TYPE]]) -> () +// PRE-NEXT: return +// ESC: 
(%[[ARG0:.*]]: [[TYPE:.*]]) -> [[TYPE]] +// ESC-NOT: "xla_lhlo.copy" +// ESC-NEXT: return %[[ARG0]] // ----- -// CHECK-LABEL: func @func_op_long +// BOTH-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %1 = xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> %2 = xla_hlo.add %arg0, %1 : tensor<4xf32> @@ -31,89 +35,91 @@ func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> %5 = xla_hlo.multiply %2, %4 : tensor<4xf32> return %5 : tensor<4xf32> } -// CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) -// CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) -// CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) -// CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> -// CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) -// CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) -// CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> -// CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) -// CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> -// CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> -// CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () -// CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> -// CHECK-NEXT: return +// PRE: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) +// ESC: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>) -> memref<4xf32> +// BOTH-NEXT: %[[MAX_RESULT:.*]] = alloc() : memref<4xf32> +// BOTH-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) +// BOTH-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<4xf32> +// BOTH-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) +// BOTH-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> +// BOTH-NEXT: %[[MIN_RESULT:.*]] = alloc() : memref<4xf32> +// BOTH-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) +// BOTH-NEXT: %[[SUB_RESULT:.*]] = alloc() : memref<4xf32> +//  BOTH-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) +// BOTH-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> +// BOTH-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<4xf32> +// BOTH-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) +// BOTH-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> +// BOTH-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> +// PRE-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () +// PRE-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> +// PRE-NEXT: return +// ESC-NEXT: return %[[MUL_RESULT]] : memref<4xf32> // ----- -// CHECK-LABEL: func @fusion +// BOTH-LABEL: func @fusion func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { - // CHECK: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<2x2xf32> + // BOTH: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) + 
// BOTH-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_summand_1 = tensor_load %summand_1 : memref<2x2xf32> %tensor_summand_2 = tensor_load %summand_2 : memref<2x2xf32> %sum = "xla_hlo.add"(%tensor_summand_1, %tensor_summand_2) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %[[ADD_RESULT]]) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<2x2xf32> + // BOTH-NEXT: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %[[ADD_RESULT]]) + // BOTH-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_multiplier = tensor_load %multiplier : memref<2x2xf32> %tensor_result = "xla_hlo.multiply"(%sum, %tensor_multiplier) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> - // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) + // BOTH-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) + // BOTH-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> + // BOTH-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) tensor_store %tensor_result, %result : memref<2x2xf32> - // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> - // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () - "xla_lhlo.terminator"() : () -> () + // BOTH-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> + // BOTH-NEXT: return + return } // ----- -// CHECK-LABEL: func @copy +// BOTH-LABEL: func @copy func @copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.copy"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @exp +// BOTH-LABEL: func @exp func @exp(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.exponential"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.exponential"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.exponential"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @log +// BOTH-LABEL: func @log func @log(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.log"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.log"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.log"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @select +// BOTH-LABEL: func @select func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_pred = tensor_load %pred : memref<2x2xi1> @@ -121,34 +127,34 @@ func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, %tensor_rhs = tensor_load %rhs : memref<2x2xf32> %tensor_result = "xla_hlo.select"(%tensor_pred, %tensor_lhs, %tensor_rhs) : (tensor<2x2xi1>, tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.select"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.select"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @compare +// BOTH-LABEL: func @compare func @compare(%lhs: memref<2x2xf32>, %rhs: 
memref<2x2xf32>, %result: memref<2x2xi1>) { %tensor_lhs = tensor_load %lhs : memref<2x2xf32> %tensor_rhs = tensor_load %rhs : memref<2x2xf32> %tensor_result = "xla_hlo.compare"(%tensor_lhs, %tensor_rhs) {comparison_direction = "EQ"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xi1> - // CHECK: "xla_lhlo.compare"(%{{.*}}, %{{.*}}, %{{.*}}) {comparison_direction = "EQ"} + // BOTH: "xla_lhlo.compare"(%{{.*}}, %{{.*}}, %{{.*}}) {comparison_direction = "EQ"} tensor_store %tensor_result, %result : memref<2x2xi1> return } // ----- -// CHECK-LABEL: func @broadcast +// BOTH-LABEL: func @broadcast func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { %tensor_operand = tensor_load %operand : memref<5xf32> %tensor_result = "xla_hlo.broadcast_in_dim"(%tensor_operand) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<10x5xf32> - // CHECK: "xla_lhlo.broadcast_in_dim"(%{{.*}}, %{{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // BOTH: "xla_lhlo.broadcast_in_dim"(%{{.*}}, %{{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} tensor_store %tensor_result, %result : memref<10x5xf32> return } @@ -157,55 +163,55 @@ func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { func @external_func() -> tensor<3xi64> -// CHECK: #[[MAP:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0 + d1 * s1)> +// BOTH: #[[MAP:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0 + d1 * s1)> -// CHECK-LABEL: func @dyn_broadcast +// BOTH-LABEL: func @dyn_broadcast func @dyn_broadcast(%operand: memref) { - // CHECK-SAME: (%[[OPERAND:.*]]: memref) + // BOTH-SAME: (%[[OPERAND:.*]]: memref) %tensor_operand = tensor_load %operand : memref %shape = call @external_func() : () -> tensor<3xi64> %tensor_result = "xla_hlo.dynamic_broadcast_in_dim"(%tensor_operand, %shape) { broadcast_dimensions = dense<[1, 2]> : tensor<2xi64> } : (tensor, tensor<3xi64>) -> tensor - // CHECK: %[[SHAPE:.*]] = call @external_func() - // CHECK: %[[C0:.*]] = constant 0 : index - // CHECK: %[[EL0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<3xi64> - // CHECK: %[[IC0:.*]] = index_cast %[[EL0]] : i64 to index - // CHECK: %[[C1:.*]] = constant 1 : index - // CHECK: %[[EL1:.*]] = extract_element %[[SHAPE]][%[[C1]]] : tensor<3xi64> - // CHECK: %[[IC1:.*]] = index_cast %[[EL1]] : i64 to index - // CHECK: %[[C2:.*]] = constant 2 : index - // CHECK: %[[EL2:.*]] = extract_element %[[SHAPE]][%[[C2]]] : tensor<3xi64> - // CHECK: %[[IC2:.*]] = index_cast %[[EL2]] : i64 to index - // CHECK: %[[RESULT:.*]] = alloc(%[[IC0]], %[[IC1]], %[[IC2]]) + // BOTH: %[[SHAPE:.*]] = call @external_func() + // BOTH: %[[C0:.*]] = constant 0 : index + // BOTH: %[[EL0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<3xi64> + // BOTH: %[[IC0:.*]] = index_cast %[[EL0]] : i64 to index + // BOTH: %[[C1:.*]] = constant 1 : index + // BOTH: %[[EL1:.*]] = extract_element %[[SHAPE]][%[[C1]]] : tensor<3xi64> + // BOTH: %[[IC1:.*]] = index_cast %[[EL1]] : i64 to index + // BOTH: %[[C2:.*]] = constant 2 : index + // BOTH: %[[EL2:.*]] = extract_element %[[SHAPE]][%[[C2]]] : tensor<3xi64> + // BOTH: %[[IC2:.*]] = index_cast %[[EL2]] : i64 to index + // BOTH: %[[RESULT:.*]] = alloc(%[[IC0]], %[[IC1]], %[[IC2]]) - // CHECK: %[[C0_:.*]] = constant 0 : index - // CHECK: %[[C1_:.*]] = constant 1 : index + // BOTH: %[[C0_:.*]] = constant 0 : index + // BOTH: %[[C1_:.*]] = constant 1 : index - // CHECK: %[[C1__:.*]] = constant 1 : index - // CHECK: %[[EL1_:.*]] = extract_element %[[SHAPE]]{{\[}}%[[C1__]]] : tensor<3xi64> - // CHECK: 
%[[C0___:.*]] = constant 0 : index - // CHECK: %[[OPERAND_DIM_0:.*]] = dim %[[OPERAND]], %[[C0___]] : memref - // CHECK: %[[RESULT_DIM_1:.*]] = index_cast %[[EL1_]] : i64 to index - // CHECK: %[[EXPAND_0:.*]] = cmpi "slt", %[[OPERAND_DIM_0]], %[[RESULT_DIM_1]] - // CHECK: %[[STRIDE_0:.*]] = select %[[EXPAND_0]], %[[C0_]], %[[C1_]] : index + // BOTH: %[[C1__:.*]] = constant 1 : index + // BOTH: %[[EL1_:.*]] = extract_element %[[SHAPE]]{{\[}}%[[C1__]]] : tensor<3xi64> + // BOTH: %[[C0___:.*]] = constant 0 : index + // BOTH: %[[OPERAND_DIM_0:.*]] = dim %[[OPERAND]], %[[C0___]] : memref + // BOTH: %[[RESULT_DIM_1:.*]] = index_cast %[[EL1_]] : i64 to index + // BOTH: %[[EXPAND_0:.*]] = cmpi "slt", %[[OPERAND_DIM_0]], %[[RESULT_DIM_1]] + // BOTH: %[[STRIDE_0:.*]] = select %[[EXPAND_0]], %[[C0_]], %[[C1_]] : index - // CHECK: %[[C2_:.*]] = constant 2 : index - // CHECK: %[[EL2_:.*]] = extract_element %[[SHAPE]]{{\[}}%[[C2_]]] : tensor<3xi64> - // CHECK: %[[C1___:.*]] = constant 1 : index - // CHECK: %[[OPERAND_DIM_1:.*]] = dim %[[OPERAND]], %[[C1___]] : memref - // CHECK: %[[RESULT_DIM_2:.*]] = index_cast %[[EL2_]] : i64 to index - // CHECK: %[[EXPAND_1:.*]] = cmpi "slt", %[[OPERAND_DIM_1]], %[[RESULT_DIM_2]] - // CHECK: %[[STRIDE_1:.*]] = select %[[EXPAND_1]], %[[C0_]], %[[C1_]] : index + // BOTH: %[[C2_:.*]] = constant 2 : index + // BOTH: %[[EL2_:.*]] = extract_element %[[SHAPE]]{{\[}}%[[C2_]]] : tensor<3xi64> + // BOTH: %[[C1___:.*]] = constant 1 : index + // BOTH: %[[OPERAND_DIM_1:.*]] = dim %[[OPERAND]], %[[C1___]] : memref + // BOTH: %[[RESULT_DIM_2:.*]] = index_cast %[[EL2_]] : i64 to index + // BOTH: %[[EXPAND_1:.*]] = cmpi "slt", %[[OPERAND_DIM_1]], %[[RESULT_DIM_2]] + // BOTH: %[[STRIDE_1:.*]] = select %[[EXPAND_1]], %[[C0_]], %[[C1_]] : index - // CHECK: %[[TRANSFORMED_MEMREF:.*]] = xla_lhlo.dynamic_memref_cast - // CHECK-SAME: %[[OPERAND]](%[[RESULT_DIM_1]], %[[RESULT_DIM_2]]) - // CHECK-SAME: {{\[}}%[[STRIDE_0]], %[[STRIDE_1]]] - // CHECK-SAME: : memref -> memref + // BOTH: %[[TRANSFORMED_MEMREF:.*]] = xla_lhlo.dynamic_memref_cast + // BOTH-SAME: %[[OPERAND]](%[[RESULT_DIM_1]], %[[RESULT_DIM_2]]) + // BOTH-SAME: {{\[}}%[[STRIDE_0]], %[[STRIDE_1]]] + // BOTH-SAME: : memref -> memref - // CHECK: "xla_lhlo.broadcast_in_dim"(%[[TRANSFORMED_MEMREF]], %[[RESULT]]) { - // CHECK-SAME: broadcast_dimensions = dense<[1, 2]> : tensor<2xi64> - // CHECK-SAME: } : (memref, memref) -> () + // BOTH: "xla_lhlo.broadcast_in_dim"(%[[TRANSFORMED_MEMREF]], %[[RESULT]]) { + // BOTH-SAME: broadcast_dimensions = dense<[1, 2]> : tensor<2xi64> + // BOTH-SAME: } : (memref, memref) -> () // Do not store the value back to avoid the tensor-store being rewritten to // a copy into the pre-allocated argument. 
@@ -214,7 +220,7 @@ func @dyn_broadcast(%operand: memref) { // ----- -// CHECK-LABEL: func @complex +// BOTH-LABEL: func @complex func @complex(%real: memref<2x2xf32>, %imag: memref<2x2xf32>, %result: memref<2x2xcomplex>) { @@ -222,164 +228,164 @@ func @complex(%real: memref<2x2xf32>, %tensor_imag = tensor_load %imag : memref<2x2xf32> %tensor_result = "xla_hlo.complex"(%tensor_real, %tensor_imag) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xcomplex> - // CHECK: "xla_lhlo.complex"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.complex"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xcomplex> return } // ----- -// CHECK-LABEL: func @real +// BOTH-LABEL: func @real func @real(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xcomplex> %tensor_result = "xla_hlo.real"(%tensor_operand) : (tensor<2x2xcomplex>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.real"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.real"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @imag +// BOTH-LABEL: func @imag func @imag(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xcomplex> %tensor_result = "xla_hlo.imag"(%tensor_operand) : (tensor<2x2xcomplex>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.imag"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.imag"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @iota +// BOTH-LABEL: func @iota func @iota(%result: memref<10xi32>) { %tensor_result = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<10xi32> - // CHECK: "xla_lhlo.iota"(%{{.*}}) {iota_dimension = 0 : i64} + // BOTH: "xla_lhlo.iota"(%{{.*}}) {iota_dimension = 0 : i64} tensor_store %tensor_result, %result : memref<10xi32> return } // ----- -// CHECK-LABEL: func @abs +// BOTH-LABEL: func @abs func @abs(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.abs"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.abs"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.abs"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @ceil +// BOTH-LABEL: func @ceil func @ceil(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.ceil"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.ceil"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.ceil"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @convert +// BOTH-LABEL: func @convert func @convert(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.convert"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) - // CHECK-NOT: tensor_store + // BOTH: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) + // BOTH-NOT: tensor_store tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @cos +// BOTH-LABEL: func @cos func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.cosine"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.cosine"(%{{.*}}, 
%{{.*}}) + // BOTH: "xla_lhlo.cosine"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @neg +// BOTH-LABEL: func @neg func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.negate"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.negate"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.negate"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @rsqrt +// BOTH-LABEL: func @rsqrt func @rsqrt(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.rsqrt"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.rsqrt"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.rsqrt"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @sign +// BOTH-LABEL: func @sign func @sign(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.sign"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.sign"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.sign"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @sqrt +// BOTH-LABEL: func @sqrt func @sqrt(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.sqrt"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.sqrt"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.sqrt"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @tanh +// BOTH-LABEL: func @tanh func @tanh(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.tanh"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.tanh"(%{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.tanh"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @remainder +// BOTH-LABEL: func @remainder func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_lhs = tensor_load %lhs : memref<2x2xf32> %tensor_rhs = tensor_load %rhs : memref<2x2xf32> %tensor_result = "xla_hlo.remainder"(%tensor_lhs, %tensor_rhs) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK: "xla_lhlo.remainder"(%{{.*}}, %{{.*}}, %{{.*}}) + // BOTH: "xla_lhlo.remainder"(%{{.*}}, %{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -387,76 +393,79 @@ func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x // ----- // Dynamic shape binary element-wise operation. 
-// CHECK-LABEL: func @add_dyn +// BOTH-LABEL: func @add_dyn func @add_dyn(%lhs: tensor, %rhs: tensor) { %result = "xla_hlo.add"(%lhs, %rhs) : (tensor, tensor) -> tensor - // CHECK: %[[C0:.*]] = constant 0 : index - // CHECK: %[[DIM0:.*]] = dim %arg0, %[[C0]] : memref - // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 - // CHECK: %[[C1:.*]] = constant 1 : index - // CHECK: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref - // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> - // CHECK: %[[C0_:.*]] = constant 0 : index - // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> - // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index - // CHECK: %[[C1_:.*]] = constant 1 : index - // CHECK: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1_]]] : tensor<2xi64> - // CHECK: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index - // CHECK: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) - // CHECK: "xla_lhlo.add"(%arg0, %arg1, %[[RESULT]]) : (memref, memref, memref) -> () + // BOTH: %[[C0:.*]] = constant 0 : index + // BOTH: %[[DIM0:.*]] = dim %arg0, %[[C0]] : memref + // BOTH: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 + // BOTH: %[[C1:.*]] = constant 1 : index + // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref + // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 + // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[C0_:.*]] = constant 0 : index + // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> + // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index + // BOTH: %[[C1_:.*]] = constant 1 : index + // BOTH: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1_]]] : tensor<2xi64> + // BOTH: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index + // BOTH: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) + // BOTH: "xla_lhlo.add"(%arg0, %arg1, %[[RESULT]]) : (memref, memref, memref) -> () return } // ----- // Dynamic shape unary element-wise operation. 
-// CHECK-LABEL: func @tanh_dyn +// BOTH-LABEL: func @tanh_dyn func @tanh_dyn(%arg0: tensor) { %result = "xla_hlo.tanh"(%arg0) : (tensor) -> tensor - // CHECK: %[[C0:.*]] = constant 0 : index - // CHECK: %[[DIM0:.*]] = dim %arg0, %[[C0]] : memref - // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 - // CHECK: %[[C1:.*]] = constant 1 : index - // CHECK: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref - // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> - // CHECK: %[[C0_:.*]] = constant 0 : index - // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> - // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index - // CHECK: %[[C1_:.*]] = constant 1 : index - // CHECK: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1_]]] : tensor<2xi64> - // CHECK: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index - // CHECK: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) - // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () + // BOTH: %[[C0:.*]] = constant 0 : index + // BOTH: %[[DIM0:.*]] = dim %arg0, %[[C0]] : memref + // BOTH: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 + // BOTH: %[[C1:.*]] = constant 1 : index + // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref + // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 + // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[C0_:.*]] = constant 0 : index + // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> + // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index + // BOTH: %[[C1_:.*]] = constant 1 : index + // BOTH: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1_]]] : tensor<2xi64> + // BOTH: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index + // BOTH: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) + // BOTH: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } // ----- -// CHECK-LABEL: func @dot +// BOTH-LABEL: func @dot func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { -// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], -// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) -// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () +// PRE-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) +// ESC-SAME: (%[[ARG0:.*]]: [[TYPE:.*]]) -> [[TYPE]] +// BOTH-NEXT: %[[ALLOC:.*]] = alloc +// BOTH: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %[[ALLOC]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () %dot = "xla_hlo.dot"(%arg0, %arg0) : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> +// PRE: "xla_lhlo.copy"(%[[ALLOC]], %[[RESULT]]) +// ESC: return %[[ALLOC]] return %dot : tensor<1024x1024xf32> } // ----- -// CHECK-LABEL: func @conv +// BOTH-LABEL: func @conv func @conv(%input: tensor<3x5x5x3xf32>, %filter : tensor<2x2x3x4xf32>) -> tensor<3x5x5x4xf32> { %c0 = constant 0 : index - // CHECK: %[[OUT:.*]] = alloc() : memref<3x5x5x4xf32> - // CHECK: "xla_lhlo.convolution"(%{{.+}}, %{{.+}}, %[[OUT]]) - // CHECK-SAME: padding = dense<[ - // CHECK-SAME: [0, 1], [0, 1]]> : tensor<2x2xi64> - // CHECK-SAME: rhs_dilation = dense<[1, 2]> - // CHECK-SAME: window_strides = dense<[2, 1]> + // BOTH: %[[OUT:.*]] = alloc() : memref<3x5x5x4xf32> + // BOTH: "xla_lhlo.convolution"(%{{.+}}, %{{.+}}, %[[OUT]]) + // BOTH-SAME: padding = dense<[ + // BOTH-SAME: [0, 1], [0, 1]]> : tensor<2x2xi64> + // BOTH-SAME: rhs_dilation = dense<[1, 2]> + // BOTH-SAME: window_strides = dense<[2, 1]> %out = 
"xla_hlo.convolution"(%filter, %input) { batch_group_count = 1 : i64, dimension_numbers = { diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index f966ce765fc..1cfe0c12e20 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -368,6 +368,15 @@ class HloToLhloTensorStoreOpConverter struct HloLegalizeToLhlo : public PassWrapper> { + public: + HloLegalizeToLhlo() = default; + HloLegalizeToLhlo(const HloLegalizeToLhlo& o) { + this->results_escape_function = o.results_escape_function.getValue(); + } + explicit HloLegalizeToLhlo(bool results_escape_function) { + this->results_escape_function.setValue(results_escape_function); + } + void runOnOperation() override { OwningRewritePatternList patterns; auto& context = getContext(); @@ -398,10 +407,28 @@ struct HloLegalizeToLhlo OwningRewritePatternList patterns; populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, &converter, &patterns); + if (results_escape_function) { + populateWithBufferAssignmentOpConversionPatterns< + mlir::ReturnOp, mlir::ReturnOp, xla_lhlo::CopyOp, + /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment, + &converter, &patterns); + } else { + populateWithBufferAssignmentOpConversionPatterns< + mlir::ReturnOp, mlir::ReturnOp, xla_lhlo::CopyOp, + /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment, + &converter, &patterns); + } return WalkResult( applyPartialConversion(func, target, patterns, &converter)); }); } + + private: + Option results_escape_function{ + *this, "results-escape-function", + llvm::cl::desc( + "Allocate the results of functions within the functions body"), + llvm::cl::init(false)}; }; } // namespace @@ -446,14 +473,11 @@ void populateHLOToLHLOConversionPattern( HloToLhloTensorStoreOpConverter >(context, bufferAssignment, converter); // clang-format on - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, xla_lhlo::CopyOp, - /*allowMemrefFunctionResults=*/false>(context, bufferAssignment, - converter, patterns); } -std::unique_ptr> createLegalizeToLhloPass() { - return absl::make_unique(); +std::unique_ptr> createLegalizeToLhloPass( + bool results_escape_function) { + return absl::make_unique(results_escape_function); } static PassRegistration legalize_pass( diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 9b9c799b2f0..f0c2d9b7372 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -59,9 +59,13 @@ std::unique_ptr> createLegalizeControlFlowPass(); /// Lowers from HLO dialect to Standard dialect. std::unique_ptr> createLegalizeToStdPass(); -// Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary -// buffers if necessary. -std::unique_ptr> createLegalizeToLhloPass(); +/// Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary +/// buffers if necessary. If `results_escape_functions` is set to true, +/// allocated buffers for function results will be returned and escape the +/// function. Otherwise, the signature is rewritten with extra arguments for the +/// buffers that are to be used for results. +std::unique_ptr> createLegalizeToLhloPass( + bool results_escape_functions = false); // Lowers from HLO dialect to Linalg dialect. 
std::unique_ptr> createLegalizeHloToLinalgPass(); From 2614035897d5c29fe54a3c45a605f11703db7097 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 05:39:27 -0700 Subject: [PATCH 0279/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/9b72b47ed633 PiperOrigin-RevId: 316662757 Change-Id: I94751da57f9731bd60aae6dd09186f3e173e5c89 --- .../lhlo-legalize-select-and-scatter.mlir | 30 ++++++++----------- .../lhlo-legalize-to-parallel-loops.mlir | 10 ++----- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir index a8c8f5e73c5..9887860ca26 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir @@ -38,15 +38,15 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // CHECK-SAME: [[RESULT_BUF:%.*]]: memref<112x112xf32>) { // Constants. -// CHECK: [[C56:%.*]] = constant 56 : index -// CHECK: [[C1:%.*]] = constant 1 : index -// CHECK: [[C0_F32:%.*]] = constant 0.000000e+00 : f32 -// CHECK: [[CFALSE:%.*]] = constant false -// CHECK: [[C3:%.*]] = constant 3 : index -// CHECK: [[C2:%.*]] = constant 2 : index -// CHECK: [[C0:%.*]] = constant 0 : index -// CHECK: [[C112:%.*]] = constant 112 : index -// CHECK: [[CTRUE:%.*]] = constant true +// CHECK-DAG: [[C56:%.*]] = constant 56 : index +// CHECK-DAG: [[C0:%.*]] = constant 0 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C0_F32:%.*]] = constant 0.000000e+00 : f32 +// CHECK-DAG: [[CFALSE:%.*]] = constant false +// CHECK-DAG: [[C3:%.*]] = constant 3 : index +// CHECK-DAG: [[C2:%.*]] = constant 2 : index +// CHECK-DAG: [[C112:%.*]] = constant 112 : index +// CHECK-DAG: [[CTRUE:%.*]] = constant true // Parallel loop to initialize the output buffer. // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref @@ -80,23 +80,17 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Compute index I of the ARG buffer and check whether it is in padding area. // CHECK: [[START_I:%.*]] = muli [[II]], [[C2]] : index -// CHECK: [[OFFSET_I:%.*]] = subi [[WIN_I]], [[C0]] : index -// CHECK: [[ARG_I:%.*]] = addi [[START_I]], [[OFFSET_I]] : index +// CHECK: [[ARG_I:%.*]] = addi [[START_I]], [[WIN_I]] : index // CHECK: [[ARG_I_FITS:%.*]] = cmpi "ult", [[ARG_I]], [[C112]] : index -// Update `INBOUNDS`, i.e. whether or not ARG indices are inside the boundaries -// of the buffer or they are in the padding area. -// CHECK: [[INBOUNDS_0:%.*]] = and [[ARG_I_FITS]], [[CTRUE]] : i1 - // Compute index J of the ARG buffer and check whether it is in padding area. // CHECK: [[START_J:%.*]] = muli [[JJ]], [[C2]] : index -// CHECK: [[OFFSET_J:%.*]] = subi [[WIN_J]], [[C0]] : index -// CHECK: [[ARG_J:%.*]] = addi [[START_J]], [[OFFSET_J]] : index +// CHECK: [[ARG_J:%.*]] = addi [[START_J]], [[WIN_J]] : index // CHECK: [[ARG_J_FITS:%.*]] = cmpi "ult", [[ARG_J]], [[C112]] : index // Update `INBOUNDS`, i.e. whether or not ARG indices are inside the boundaries // of the buffer or they are in the padding area. 
-// CHECK: [[INBOUNDS_1:%.*]] = and [[INBOUNDS_0]], [[ARG_J_FITS]] : i1 +// CHECK: [[INBOUNDS_1:%.*]] = and [[ARG_I_FITS]], [[ARG_J_FITS]] : i1 // If ARG ivs are in the padding area, then 'select' function does not have to // be applied, current selected ivs (SEL_I, SEL_J) and value (SEL_VAL) are diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir index 8c22c035edd..5127bcfcd8f 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -151,7 +151,6 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-SAME: [[OPERAND_BUF:%.*]]: memref<112x112xf32>, // CHECK-SAME: [[INIT_BUF:%.*]]: memref, // CHECK-SAME: [[RESULT_BUF:%.*]]: memref<56x56xf32>) { -// CHECK-DAG: [[IN_BOUNDS:%.*]] = constant true // CHECK-DAG: [[C0:%.*]] = constant 0 : index // CHECK-DAG: [[C1:%.*]] = constant 1 : index // CHECK-DAG: [[C2:%.*]] = constant 2 : index @@ -167,16 +166,13 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-SAME: init ([[INIT]]) -> f32 { // CHECK: [[START_I:%.*]] = muli [[I]], [[C2]] : index -// CHECK: [[OFFSET_I:%.*]] = subi [[IW]], [[C0]] : index -// CHECK: [[INDEX_I:%.*]] = addi [[START_I]], [[OFFSET_I]] : index +// CHECK: [[INDEX_I:%.*]] = addi [[START_I]], [[IW]] : index // CHECK: [[INDEX_I_FITS:%.*]] = cmpi "ult", [[INDEX_I]], [[C112]] -// CHECK: [[IN_BOUNDS_0:%.*]] = and [[INDEX_I_FITS]], [[IN_BOUNDS]] // CHECK: [[START_J:%.*]] = muli [[J]], [[C2]] : index -// CHECK: [[OFFSET_J:%.*]] = subi [[JW]], [[C0]] : index -// CHECK: [[INDEX_J:%.*]] = addi [[START_J]], [[OFFSET_J]] : index +// CHECK: [[INDEX_J:%.*]] = addi [[START_J]], [[JW]] : index // CHECK: [[INDEX_J_FITS:%.*]] = cmpi "ult", [[INDEX_J]], [[C112]] -// CHECK: [[IN_BOUNDS_1:%.*]] = and [[IN_BOUNDS_0]], [[INDEX_J_FITS]] +// CHECK: [[IN_BOUNDS_1:%.*]] = and [[INDEX_I_FITS]], [[INDEX_J_FITS]] // CHECK: [[ELEM_TO_REDUCE:%.*]] = scf.if [[IN_BOUNDS_1]] -> (f32) { // CHECK: [[OPERAND_ELEM:%.*]] = From 55a1f169c3d2db146da385722213e8b4dfcfa6c3 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 16 Jun 2020 06:09:10 -0700 Subject: [PATCH 0280/1390] Fix same_shape constraint handling in cubin generator. PiperOrigin-RevId: 316665956 Change-Id: Icd582db4e0c4c19b7f4b0c563a3323df0cda6836 --- .../mlir/tools/kernel_gen/cubin_creator.cc | 39 ++++++++++++------- tensorflow/core/kernels/cubin_headers/BUILD | 1 + 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 30b60e8079f..b534b5a5604 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -125,7 +125,8 @@ Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { pm.addNestedPass( absl::make_unique()); pm.addNestedPass(absl::make_unique()); - pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass()); + pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass( + /*results_escape_functions=*/true)); pm.addNestedPass(mlir::xla_lhlo::createLhloCopyRemovalPass()); if (failed(pm.run(module))) { @@ -148,7 +149,12 @@ struct PropagateStaticKnowledge // We do not change the signature so that we keep a somewhat stable ABI // that is easy to undertand by tools. 
mlir::LLVM::LLVMFuncOp func = getOperation(); + + // This only works if the function is local and we can rewrite it. + if (func.isExternal()) return; + mlir::OpBuilder b(func.getBody()); + // Steal the LLVM representation of the index type from the third argument. auto index_type = func.getArgument(3).getType(); mlir::Value one = b.create( func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); @@ -156,10 +162,21 @@ struct PropagateStaticKnowledge func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); uint32_t arg_pos = 0; std::vector positions; - for (mlir::Type arg_type : func_type.getInputs()) { + // Collect the agument and return types of the surrounding function. + auto arg_types = llvm::to_vector<4>(llvm::concat( + func_type.getInputs(), func_type.getResults())); + for (mlir::Type arg_type : arg_types) { + if (!arg_type.isa()) { + func.emitError() << "argument of surrounding func is not ranked memref"; + signalPassFailure(); + return; + } positions.push_back(arg_pos); + // Replace the offset with zero. Offset is argument number 3. func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); - arg_pos += 3 + arg_type.cast().getRank() * 2; + // Forward over base_ptr, aligned_ptr, offset, size and stride arguments. + arg_pos += 3 + arg_type.cast().getRank() * 2; + // Replace the last stride with constant 1. func.getArgument(arg_pos - 1).replaceAllUsesWith(one); } @@ -169,17 +186,17 @@ struct PropagateStaticKnowledge if (!same_shape.empty()) { auto first = same_shape.front(); auto first_offset = positions.at(first); - mlir::ShapedType first_type = - func_type.getInput(first).cast(); + auto first_type = arg_types[first].cast(); uint32_t rank = first_type.getRank(); for (auto same : same_shape.drop_front(1)) { uint32_t same_offset = positions.at(same); - auto same_type = func_type.getInput(same).cast(); + auto same_type = arg_types[same].cast(); if (same_type.getRank() != rank) { func.emitOpError() << "same shape constraints on arguments with " "non-matching shapes: #" << first << " and #" << same; signalPassFailure(); + continue; } for (uint32_t i = 0; i < 2 * rank; ++i) { @@ -245,14 +262,8 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); } TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); - // TODO(b/156985522): Figure out why we get a segfault when generating Tanh - // with 'same_shape' containing {0, 1}. We would also get the crash if we - // unconditionally call PropagateStaticShapeKnowledgeToKernel while - // 'same_shape' is empty. 
- if (!same_shape.empty()) { - TF_RETURN_IF_ERROR( - PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); - } + TF_RETURN_IF_ERROR( + PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); mlir::OwningModuleRef kernel_module = xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD index 49ab1b8a911..b8ba164fbc3 100644 --- a/tensorflow/core/kernels/cubin_headers/BUILD +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -30,6 +30,7 @@ gen_kernel_library( gen_kernel_library( name = "tanh", + same_shape = "0,1", tile_size = "256", types = [ "f16", From 35fc2312816ffbb577843e2741568049639021e7 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Tue, 16 Jun 2020 06:15:38 -0700 Subject: [PATCH 0281/1390] Internal change PiperOrigin-RevId: 316666669 Change-Id: I39909bfda1b991755cc4f7a69b00659fa947b8f4 --- third_party/mlir/BUILD | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 06e0ed8d4b4..cb0b2f9dc8e 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -707,14 +707,8 @@ gentbl( cc_library( name = "Shape", - srcs = glob( - [ - "lib/Dialect/Shape/IR/*.cpp", - ], - ), - hdrs = glob([ - "include/mlir/Dialect/Shape/IR/*.h", - ]), + srcs = glob(["lib/Dialect/Shape/IR/*.cpp"]), + hdrs = ["include/mlir/Dialect/Shape/IR/Shape.h"], includes = ["include"], deps = [ ":CallOpInterfaces", @@ -737,22 +731,16 @@ cc_library( "lib/Conversion/ShapeToStandard/*.cpp", "lib/Conversion/ShapeToStandard/*.h", ]) + ["lib/Conversion/PassDetail.h"], - hdrs = glob([ - "include/mlir/Conversion/ShapeToStandard/*.h", - ]), + hdrs = ["include/mlir/Conversion/ShapeToStandard/ShapeToStandard.h"], includes = ["include"], deps = [ - ":Affine", ":ConversionPassIncGen", - ":IR", ":Pass", ":SCFDialect", ":Shape", ":StandardOps", ":Support", ":Transforms", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", ], ) @@ -793,18 +781,15 @@ cc_library( "lib/Dialect/Shape/Transforms/*.cpp", "lib/Dialect/Shape/Transforms/*.h", ]), - hdrs = glob(["include/mlir/Dialect/Shape/Transforms/*.h"]), + hdrs = ["include/mlir/Dialect/Shape/Transforms/Passes.h"], includes = ["include"], deps = [ - #":Analysis", - #":ControlFlowInterfaces", ":IR", ":Pass", ":Shape", ":ShapeTransformsPassIncGen", ":Support", ":Transforms", - #"@llvm-project//llvm:Support", ], ) @@ -820,9 +805,7 @@ cc_library( hdrs = glob([ "include/mlir/Dialect/StandardOps/IR/*.h", "include/mlir/Dialect/StandardOps/EDSC/*.h", - ]) + [ - "include/mlir/Transforms/InliningUtils.h", - ], + ]) + ["include/mlir/Transforms/InliningUtils.h"], includes = ["include"], deps = [ ":CallOpInterfaces", From b8b6bc58a1954353b6fbd49e6d7a0d1f6ad18072 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Tue, 16 Jun 2020 06:21:33 -0700 Subject: [PATCH 0282/1390] [XLA] Keep op metadata when replacing with bitcast. 
PiperOrigin-RevId: 316667446 Change-Id: I3618b5eafba269eb087ff7607bdad1cb3873ffbc --- tensorflow/compiler/xla/service/algebraic_simplifier.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index f88f08b9fa2..98e3229b062 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -573,6 +573,7 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction, auto bitcast = computation_->AddInstruction( HloInstruction::CreateBitcast(instruction->shape(), operand)); + bitcast->set_metadata(instruction->metadata()); TF_CHECK_OK(ReplaceInstruction(instruction, bitcast)); } From d9532e652600fcda0fee32b14cd62a4404a662fe Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 16 Jun 2020 06:27:29 -0700 Subject: [PATCH 0283/1390] Bump open source llvm revision to 9b72b47ed63351ee5ceff4c44ccd9a71dc7dad27 PiperOrigin-RevId: 316668225 Change-Id: Ie49a7f46a51674c854e300629f4563c5be5e6448 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 31daf2249a0..acba1598d1f 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "1a7f115dce22b2c09fdd4f7f79d24da5de6eaef8" - LLVM_SHA256 = "3ace55744a86211c9c837915b88c18e1ed3e5cd839aaeade6aa88b02bc86e47e" + LLVM_COMMIT = "9b72b47ed63351ee5ceff4c44ccd9a71dc7dad27" + LLVM_SHA256 = "03ce1e00901936e7259c6ee465773b7f231ca1724925460b71909868f5a61e11" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 572442eb16bb989c1ae9665549c2c2d9daac7952 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 16 Jun 2020 06:56:05 -0700 Subject: [PATCH 0284/1390] [PJRT] Fix potential misuse of `PjRtBuffer::FromHostBuffer`. Add a new `PjRtBuffer::HostBufferSemantics` enum that describes the possible contracts between caller and runtime. * Change `FromHostBuffer(..., force_copy, ...)` to `FromHostBuffer(..., host_buffer_semantics, ...)`. We were seeing some data races between modifications to a NumPy array and JAX on CPU, due to unintended buffer aliasing. This change allows clients to control whether they want zero-copy behavior or not. 
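For callers, the practical difference between the semantics shows up at the `FromHostBuffer` call site. Below is a minimal sketch, assuming a live `PjRtClient`, `Device`, `Shape` and a host array the caller keeps valid; the helper names are illustrative, and the call pattern mirrors the updated gpu_multistream_test:

#include <memory>

#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

// Sketch only: the caller owns `client`, `device`, `shape` and `data`.
// With kImmutableUntilTransferCompletes the runtime may stage a copy, so
// `data` only needs to stay valid until the device transfer completes.
xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> CopyToDevice(
    xla::PjRtClient* client, xla::Device* device, const xla::Shape& shape,
    const float* data) {
  return xla::PjRtBuffer::FromHostBuffer(
      data, shape,
      xla::PjRtBuffer::HostBufferSemantics::kImmutableUntilTransferCompletes,
      /*buffer_reference=*/nullptr, client, device);
}

// Zero-copy variant: on CPU the returned buffer may alias `data`, so the
// caller must not mutate or free `data` while the buffer is alive.
xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> AliasOnDevice(
    xla::PjRtClient* client, xla::Device* device, const xla::Shape& shape,
    const float* data) {
  return xla::PjRtBuffer::FromHostBuffer(
      data, shape, xla::PjRtBuffer::HostBufferSemantics::kZeroCopy,
      /*buffer_reference=*/nullptr, client, device);
}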
PiperOrigin-RevId: 316672280 Change-Id: Ibee296305005e0aa306a2c0aacf4b35a3d6c3ac1 --- tensorflow/compiler/xla/pjrt/cpu_device.cc | 1 + .../compiler/xla/pjrt/gpu_multistream_test.cc | 17 +- .../compiler/xla/pjrt/interpreter_device.cc | 1 + .../compiler/xla/pjrt/nvidia_gpu_device.cc | 1 + tensorflow/compiler/xla/pjrt/pjrt_client.cc | 162 +++++++++++++----- tensorflow/compiler/xla/pjrt/pjrt_client.h | 49 ++++-- tensorflow/compiler/xla/python/py_client.cc | 9 +- tensorflow/compiler/xla/python/py_client.h | 3 +- tensorflow/compiler/xla/python/xla.cc | 11 +- tensorflow/compiler/xla/python/xla_client.py | 1 + .../compiler/xla/python/xla_client_test.py | 7 +- 11 files changed, 195 insertions(+), 67 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.cc b/tensorflow/compiler/xla/pjrt/cpu_device.cc index 75c3bfc1277..be70c16fc12 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc @@ -59,6 +59,7 @@ StatusOr> GetCpuClient(bool asynchronous) { return std::make_shared( kCpuPlatformName, client, std::move(devices), /*host_id=*/0, /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, + /*should_stage_host_to_device_transfers=*/false, /*gpu_run_options=*/nullptr); } diff --git a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc index 2db7de3720d..d54be61fbb8 100644 --- a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc +++ b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc @@ -72,18 +72,21 @@ TEST(GpuMultiStream, Basics) { TF_ASSERT_OK_AND_ASSIGN( auto dummy_buffer, PjRtBuffer::FromHostBuffer( - dummy_inputs.data(), dummy_shape, /*force_copy=*/false, + dummy_inputs.data(), dummy_shape, + PjRtBuffer::HostBufferSemantics::kImmutableUntilTransferCompletes, /*buffer_reference=*/nullptr, client.get(), device)); TF_ASSERT_OK_AND_ASSIGN( auto in_buffer0, - PjRtBuffer::FromHostBuffer(inputs.data(), shape, /*force_copy=*/false, - /*buffer_reference=*/nullptr, client.get(), - device)); + PjRtBuffer::FromHostBuffer( + inputs.data(), shape, + PjRtBuffer::HostBufferSemantics::kImmutableUntilTransferCompletes, + /*buffer_reference=*/nullptr, client.get(), device)); TF_ASSERT_OK_AND_ASSIGN( auto in_buffer1, - PjRtBuffer::FromHostBuffer(inputs.data(), shape, /*force_copy=*/false, - /*buffer_reference=*/nullptr, client.get(), - device)); + PjRtBuffer::FromHostBuffer( + inputs.data(), shape, + PjRtBuffer::HostBufferSemantics::kImmutableUntilTransferCompletes, + /*buffer_reference=*/nullptr, client.get(), device)); // The execution may be enqueued before the transfers complete, requiring // adequate device-side synchronization. 
ExecuteOptions options; diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.cc b/tensorflow/compiler/xla/pjrt/interpreter_device.cc index 63254d4aa70..f7138a8c181 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.cc +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.cc @@ -53,6 +53,7 @@ StatusOr> GetInterpreterClient() { return std::make_shared( kInterpreterPlatformName, client, std::move(devices), /*host_id=*/0, /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, + /*should_stage_host_to_device_transfers=*/false, /*gpu_run_options=*/nullptr); } diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index 4863e5e8165..de760af8fd9 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -316,6 +316,7 @@ StatusOr> GetNvidiaGpuClient( "gpu", xla_client, std::move(devices), /*node_id=*/node_id, std::move(allocator), std::move(host_memory_allocator), + /*should_stage_host_to_device_transfers=*/true, /*gpu_run_options=*/std::move(gpu_run_options)); return pyclient; } diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index c1b433845b2..ccb72b7ce30 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -95,6 +95,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" @@ -154,18 +155,35 @@ StatusOr DevicesToDeviceAssignment( return xla_assignment; } +class CpuAllocator : public tensorflow::Allocator { + public: + CpuAllocator() = default; + + string Name() override { return "cpu"; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + return tensorflow::port::AlignedMalloc(num_bytes, alignment); + } + void DeallocateRaw(void* ptr) override { + return tensorflow::port::AlignedFree(ptr); + } +}; + PjRtClient::PjRtClient( std::string platform_name, LocalClient* client, std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, + bool should_stage_host_to_device_transfers, std::unique_ptr gpu_run_options) : platform_name_(std::move(platform_name)), client_(client), + host_memory_allocator_(std::move(host_memory_allocator)), devices_(std::move(devices)), host_id_(host_id), owned_allocator_(std::move(allocator)), - host_memory_allocator_(std::move(host_memory_allocator)), + should_stage_host_to_device_transfers_( + should_stage_host_to_device_transfers), gpu_run_options_(std::move(gpu_run_options)), h2d_transfer_pool_(tensorflow::Env::Default(), "py_xla_h2d_transfer", client->device_count()) { @@ -175,6 +193,10 @@ PjRtClient::PjRtClient( allocator_ = client_->backend().memory_allocator(); } + if (!host_memory_allocator_) { + host_memory_allocator_ = std::make_unique(); + } + for (const std::unique_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device.get()}).second) << "Duplicate device id: " << device->id(); @@ -526,7 +548,8 @@ void PjRtBuffer::ScopedHold::AddToInput( /* static */ StatusOr> PjRtBuffer::FromHostBuffer( - const void* data, const Shape& shape, bool force_copy, + const void* data, const Shape& shape, + HostBufferSemantics host_buffer_semantics, std::shared_ptr 
buffer_reference, PjRtClient* client, Device* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostBuffer"); @@ -537,34 +560,63 @@ StatusOr> PjRtBuffer::FromHostBuffer( } TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, device->GetLocalDeviceState()); - - // If we are on the host platform and the input buffer is sufficiently - // aligned, we can simply point to the input array's data without any further - // copies. At the time of writing we require a 16-byte alignment because XLA - // may generate code which requires it. - if (!force_copy && - ((absl::bit_cast(data) & - (cpu_function_runtime::kMinAlign - 1)) == 0) && - local_device->executor()->platform()->id() == se::host::kHostPlatformId) { - std::function on_delete_callback = - [buffer_reference{std::move(buffer_reference)}]() { - // Frees buffer_reference. - }; - se::DeviceMemoryBase buffer(const_cast(data), - ShapeUtil::ByteSizeOf(shape)); - absl::Span> definition_events; - auto device_buffer = std::make_shared( - /*allocator=*/nullptr, local_device->device_ordinal(), - std::initializer_list{buffer}, definition_events, - std::move(on_delete_callback)); - return absl::make_unique(shape, shape, std::move(device_buffer), - client, device); - } + int64 size = ShapeUtil::ByteSizeOf(shape); TransferManager* transfer_manager = client->client()->backend().transfer_manager(); TF_ASSIGN_OR_RETURN(Shape compact_shape, transfer_manager->ChooseCompactLayoutForShape(shape)); + + // The CPU platform is special because the "host" and the "device" are in the + // same memory space. If the input shape is in the correct layout and we don't + // want to defer the copy onto a thread, we can use the following fast + // path. + bool is_cpu_platform = + local_device->executor()->platform()->id() == se::host::kHostPlatformId; + if (is_cpu_platform) { + // If we are on the host platform and the input buffer is sufficiently + // aligned, we can simply point to the input array's data without any + // further copies. At the time of writing we require a 16-byte alignment + // because XLA may generate code which requires it. + bool can_use_zero_copy = + host_buffer_semantics == HostBufferSemantics::kZeroCopy && + ((absl::bit_cast(data) & + (cpu_function_runtime::kMinAlign - 1)) == 0); + if (shape.layout() == compact_shape.layout() && + (host_buffer_semantics == + HostBufferSemantics::kImmutableOnlyDuringCall || + can_use_zero_copy)) { + std::function on_delete_callback; + se::DeviceMemoryBase buffer; + // If we are on the host platform and the input buffer is sufficiently + // aligned, we can simply point to the input array's data without any + // further copies. At the time of writing we require a 16-byte alignment + // because XLA may generate code which requires it. + if (can_use_zero_copy) { + on_delete_callback = [buffer_reference{std::move(buffer_reference)}]() { + // Frees buffer_reference. 
+ }; + buffer = se::DeviceMemoryBase(const_cast(data), size); + } else { + void* staging_buffer = client->host_memory_allocator()->AllocateRaw( + cpu_function_runtime::kMinAlign, size); + on_delete_callback = [staging_buffer, client]() { + client->host_memory_allocator()->DeallocateRaw(staging_buffer); + }; + buffer = se::DeviceMemoryBase(staging_buffer, size); + std::memcpy(staging_buffer, data, size); + } + absl::Span> + definition_events; + auto device_buffer = std::make_shared( + /*allocator=*/nullptr, local_device->device_ordinal(), + std::initializer_list{buffer}, + definition_events, std::move(on_delete_callback)); + return absl::make_unique( + shape, shape, std::move(device_buffer), client, device); + } + } + TF_ASSIGN_OR_RETURN( std::unique_ptr py_buffer, AllocateDestinationBuffer(compact_shape, device, local_device, @@ -573,17 +625,41 @@ StatusOr> PjRtBuffer::FromHostBuffer( ScopedHold device_buffer(py_buffer->GetBufferWithUsageHold()); CHECK(device_buffer.ok()); + // If necessary, allocate a host-side buffer for staging host-to-device + // transfers. On GPU this is a buffer in pinned memory. + std::shared_ptr staging_buffer; + if (host_buffer_semantics == HostBufferSemantics::kImmutableOnlyDuringCall || + client->should_stage_host_to_device_transfers()) { + void* ptr = client->host_memory_allocator()->AllocateRaw( + tensorflow::Allocator::kAllocatorAlignment, size); + staging_buffer = std::shared_ptr(ptr, [client](void* ptr) { + client->host_memory_allocator()->DeallocateRaw(ptr); + }); + } + + // Copy the buffer into a staging buffer before returning control to the + // caller if the caller only guaranteed that the buffer is valid for the + // duration of the call. Otherwise, we stage (if necessary) on a separate + // thread. + if (host_buffer_semantics == HostBufferSemantics::kImmutableOnlyDuringCall) { + std::memcpy(staging_buffer.get(), data, size); + buffer_reference.reset(); + data = nullptr; + } + // The host to device transfer is performed on a thread pool, mostly because // it includes linearization that may be slow. It is OK to capture the // py_buffer pointer because the py_buffer can't be deleted until all the // usage holds have gone away. // TODO(misard) assess if it would be preferable to introduce a heuristic to // put the transfer into the calling thread for small literals. - auto transfer_h2d = [client, transfer_manager, local_device, - movable_device_buffer{device_buffer.ToClosure()}, data, - shape, py_buffer{py_buffer.get()}, compact_shape, + auto transfer_h2d = [client, transfer_manager, local_device, data, size, + movable_device_buffer{device_buffer.ToClosure()}, shape, + py_buffer{py_buffer.get()}, compact_shape, on_device_shape{py_buffer->on_device_shape()}, - buffer_reference{std::move(buffer_reference)}]() { + staging_buffer{std::move(staging_buffer)}, + buffer_reference{std::move(buffer_reference)}, + host_buffer_semantics]() { ScopedHold device_buffer(movable_device_buffer); // This function uses TF_CHECK_OK and ValueOrDie() since we have no way // to report failures from a callback. However, the operations here are @@ -593,20 +669,16 @@ StatusOr> PjRtBuffer::FromHostBuffer( ShapedBuffer buffer = device_buffer->AsShapedBuffer( compact_shape, on_device_shape, client->client()->platform()); - - std::shared_ptr staging_buffer; - // If applicable on the backend, stage the transfer via host memory // allocated via the host_memory_allocator. On GPU, this is pinned // memory. 
- if (client->host_memory_allocator()) { - int64 size = ShapeUtil::ByteSizeOf(shape); - void* ptr = client->host_memory_allocator()->AllocateRaw( - tensorflow::Allocator::kAllocatorAlignment, size); - staging_buffer = std::shared_ptr(ptr, [client](void* ptr) { - client->host_memory_allocator()->DeallocateRaw(ptr); - }); - std::memcpy(ptr, data, size); + if (staging_buffer) { + // If we didn't already copy the input buffer into the staging buffer, + // do so now. + if (host_buffer_semantics != + HostBufferSemantics::kImmutableOnlyDuringCall) { + std::memcpy(staging_buffer.get(), data, size); + } BorrowingLiteral literal(static_cast(staging_buffer.get()), shape); TF_CHECK_OK(transfer_manager->TransferLiteralToDeviceAsync( @@ -626,9 +698,15 @@ StatusOr> PjRtBuffer::FromHostBuffer( local_device->ThenRelease( local_device->host_to_device_stream(), - std::make_pair(buffer_reference, std::move(staging_buffer))); + std::make_pair(std::move(buffer_reference), std::move(staging_buffer))); }; - client->h2d_transfer_pool()->Schedule(transfer_h2d); + if (is_cpu_platform) { + // Using the h2d_transfer_pool would be a double thread hop; the code + // already defers its work onto a stream (= thread on CPU). + transfer_h2d(); + } else { + client->h2d_transfer_pool()->Schedule(transfer_h2d); + } return py_buffer; } diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index c609abbf6fd..754eb19bec6 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -128,6 +128,7 @@ class PjRtClient { std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, + bool should_stage_host_to_device_transfers, std::unique_ptr gpu_run_options); virtual ~PjRtClient() = default; @@ -153,6 +154,9 @@ class PjRtClient { tensorflow::Allocator* host_memory_allocator() const { return host_memory_allocator_.get(); } + bool should_stage_host_to_device_transfers() const { + return should_stage_host_to_device_transfers_; + } GpuExecutableRunOptions* gpu_run_options() const { return gpu_run_options_.get(); @@ -190,6 +194,9 @@ class PjRtClient { std::string platform_name_; LocalClient* client_; + // Allocator to be used for staging memory transfers to devices. + std::unique_ptr host_memory_allocator_; + // Includes all devices, including non-local devices on multi-host platforms. std::vector> devices_; // Maps Device::id() to the corresponding Device. Includes all devices. @@ -201,10 +208,10 @@ class PjRtClient { se::DeviceMemoryAllocator* allocator_; std::unique_ptr owned_allocator_; - // Allocator to be used for staging memory transfers to devices. Optional; - // only used on GPU where it is more efficient to copy buffers to and from the - // device via a staging area of pinned memory. - std::unique_ptr host_memory_allocator_; + // Should we always prefer to stage host-to-device transfers via memory + // allocated on host_memory_allocator_? True only on GPU, where we prefer to + // transfer via pinned memory. + bool should_stage_host_to_device_transfers_; std::unique_ptr gpu_run_options_; @@ -396,13 +403,35 @@ class PjRtBuffer { StatusOr> buffer_or_; }; - // If `force_copy` is true, forces a copy of the input buffer on CPU. - // Otherwise the library is free to alias the output buffer with `data`. - // `buffer_reference` is an optional shared pointer that should be kept alive - // by the runtime as long as the contents of `data` may still be accessed by - // the runtime (may be nullptr). 
+ // Describes the semantics the caller to FromHostBuffer expects from the + // runtime, in a total order from most restrictive to least restrictive. + enum class HostBufferSemantics { + // The runtime may not hold references to `data` after the call to + // `FromHostBuffer` completes. The caller promises that `data` is immutable + // and will not be freed only for the duration of the FromHostBuffer call. + // `buffer_reference` will be freed by the time `FromHostBuffer` returns. + kImmutableOnlyDuringCall, + + // The runtime may hold onto `data` after the call to `FromHostBuffer` + // returns while the runtime completes a transfer to the device. The caller + // promises not to mutate or free `data` until the transfer completes, at + // which point the runtime will release `buffer_reference`. It is also + // correct to wait on the host (directly or indirectly) for the buffer's + // definition event to complete. + kImmutableUntilTransferCompletes, + + // The PjRtBuffer may alias `data` internally and the runtime may use the + // `data` contents as long as the buffer is alive. + // The caller promises to keep `data` alive and not to mutate its contents + // as long as the buffer is alive; to notify the caller that the buffer may + // be freed, the runtime will release its `buffer_reference` when the + // PjRtBuffer is freed. On non-CPU platforms this acts identically to + // kImmutableUntilTransferCompletes. + kZeroCopy, + }; static StatusOr> FromHostBuffer( - const void* data, const Shape& shape, bool force_copy, + const void* data, const Shape& shape, + HostBufferSemantics host_buffer_semantics, std::shared_ptr buffer_reference, PjRtClient* client, Device* device); diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index a5779f3f8ee..bc7244cfc64 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -84,7 +84,8 @@ PyClient::GetDefaultDeviceAssignment1D(int num_replicas) { } StatusOr> PyClient::BufferFromPyal( - const pybind11::object& argument, Device* device, bool force_copy) { + const pybind11::object& argument, Device* device, bool force_copy, + PjRtBuffer::HostBufferSemantics host_buffer_semantics) { if (device == nullptr) { TF_RET_CHECK(!pjrt_client_->local_devices().empty()); device = pjrt_client_->local_devices().front(); @@ -111,9 +112,9 @@ StatusOr> PyClient::BufferFromPyal( { py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN( - buffer, PjRtBuffer::FromHostBuffer(c->buf_ptr, c->shape, force_copy, - std::move(py_buffer_ref), - pjrt_client_.get(), device)); + buffer, PjRtBuffer::FromHostBuffer( + c->buf_ptr, c->shape, host_buffer_semantics, + std::move(py_buffer_ref), pjrt_client_.get(), device)); } auto traceback = Traceback::Get(); return std::make_unique(shared_from_this(), std::move(buffer), diff --git a/tensorflow/compiler/xla/python/py_client.h b/tensorflow/compiler/xla/python/py_client.h index 76419742c57..c94a206a926 100644 --- a/tensorflow/compiler/xla/python/py_client.h +++ b/tensorflow/compiler/xla/python/py_client.h @@ -120,7 +120,8 @@ class PyClient : public std::enable_shared_from_this { } StatusOr> BufferFromPyal( - const pybind11::object& argument, Device* device, bool force_copy); + const pybind11::object& argument, Device* device, bool force_copy, + PjRtBuffer::HostBufferSemantics host_buffer_semantics); StatusOr> Compile( const XlaComputation& computation, CompileOptions options); diff --git a/tensorflow/compiler/xla/python/xla.cc 
b/tensorflow/compiler/xla/python/xla.cc index 2c323783961..c0a440aa4bd 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -509,6 +509,13 @@ PYBIND11_MODULE(xla_extension, m) { .value("PLATFORM", GpuAllocatorConfig::Kind::kPlatform) .value("BFC", GpuAllocatorConfig::Kind::kBFC); + py::enum_(m, "HostBufferSemantics") + .value("IMMUTABLE_ONLY_DURING_CALL", + PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall) + .value("IMMUTABLE_UNTIL_TRANSFER_COMPLETES", + PjRtBuffer::HostBufferSemantics::kImmutableUntilTransferCompletes) + .value("ZERO_COPY", PjRtBuffer::HostBufferSemantics::kZeroCopy); + py::class_> py_local_client(m, "Client"); py_local_client.def_property_readonly("platform", &PyClient::platform_name) .def("device_count", &PyClient::device_count) @@ -527,7 +534,9 @@ PYBIND11_MODULE(xla_extension, m) { .def("create_host_to_device_channel_handle", &PyClient::CreateHostToDeviceChannelHandle) .def("buffer_from_pyval", &PyClient::BufferFromPyal, py::arg("argument"), - py::arg("device") = nullptr, py::arg("force_copy") = false) + py::arg("device") = nullptr, py::arg("force_copy") = false, + py::arg("host_buffer_semantics") = + PjRtBuffer::HostBufferSemantics::kZeroCopy) .def("compile", &PyClient::Compile, py::arg("computation"), py::arg("compile_options") = CompileOptions()) .def("heap_profile", &PyClient::HeapProfile); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 8f176507215..38c55c6fe5d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -304,6 +304,7 @@ def computation_count(): Device = _xla.Device CompileOptions = _xla.CompileOptions +HostBufferSemantics = _xla.HostBufferSemantics # An Executable is a C++ class that duck types with the following API: # class Executable(object): diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 6a316044734..49c57a27ac0 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -1986,7 +1986,8 @@ def TestFactory(xla_backend, cloud_tpu=False): def testRoundTrip(self, dtype, shape): x = np.array(np.random.rand(*shape) * 100, dtype=dtype) x_ptr = x.__array_interface__["data"][0] - buffer = self.backend.buffer_from_pyval(x) + buffer = self.backend.buffer_from_pyval( + x, host_buffer_semantics=xla_client.HostBufferSemantics.ZERO_COPY) y = np.array(buffer, copy=False) y_ptr = y.__array_interface__["data"][0] np.testing.assert_array_equal(x, y) @@ -1995,7 +1996,9 @@ def TestFactory(xla_backend, cloud_tpu=False): self.assertTrue((x_ptr & 15) != 0 or x_ptr == y_ptr) self.assertEqual(y_ptr, buffer.unsafe_buffer_pointer()) - buffer2 = self.backend.buffer_from_pyval(x, force_copy=True) + during_call = xla_client.HostBufferSemantics.IMMUTABLE_ONLY_DURING_CALL + buffer2 = self.backend.buffer_from_pyval( + x, host_buffer_semantics=during_call) z = np.array(buffer2, copy=False) self.assertNotEqual(x.__array_interface__["data"][0], z.__array_interface__["data"][0]) From 2537f3d413a30ffd55187e795b256f29a1ce9d97 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 06:56:09 -0700 Subject: [PATCH 0285/1390] Enabled support for negative edge padding for XlaPad. The underlying HLO pad already supports negative edge padding. We just removed some checks preventing negative edge padding, and added a test. 
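Since the change only removes the non-negativity checks, negative values in `padding_low`/`padding_high` now flow through to the HLO pad, where they crop the operand instead of padding it. A minimal sketch of the same configuration expressed directly with the XLA C++ builder (the helper name is illustrative; the constants match the new Python test):

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

// Sketch only: builds the same pad as the new testPadNegative case, i.e. a
// [2,3] int32 operand padded with value 7, low = {0, -1}, high = {1, -2},
// interior = {1, 2}. Negative edge padding trims rows/columns of the result.
xla::XlaOp NegativePadExample(xla::XlaBuilder* builder) {
  xla::XlaOp operand =
      xla::ConstantR2<int32_t>(builder, {{0, 1, 2}, {3, 4, 5}});
  xla::XlaOp pad_value = xla::ConstantR0<int32_t>(builder, 7);
  xla::PaddingConfig config;
  const int64_t lows[] = {0, -1}, highs[] = {1, -2}, interiors[] = {1, 2};
  for (int dim = 0; dim < 2; ++dim) {
    auto* d = config.add_dimensions();
    d->set_edge_padding_low(lows[dim]);
    d->set_edge_padding_high(highs[dim]);
    d->set_interior_padding(interiors[dim]);
  }
  return xla::Pad(operand, pad_value, config);
}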
PiperOrigin-RevId: 316672291 Change-Id: I38e55f86727d90f66aadb549a587b43df6588571 --- tensorflow/compiler/tests/xla_ops_test.py | 18 ++++++++++++++++++ .../compiler/tf2xla/kernels/xla_pad_op.cc | 8 -------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index f3e915daa67..35d36315464 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -211,6 +211,24 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [7, 7, 7, 7, 7], [7, 2, 3, 7, 7], [7, 7, 7, 7, 7]], dtype=dtype)) + def testPadNegative(self): + for dtype in self.numeric_types: + + def pad_fn(x): + return xla.pad( + x, + padding_value=7, + padding_low=[0, -1], + padding_high=[1, -2], + padding_interior=[1, 2]) + + self._assertOpOutputMatchesExpected( + pad_fn, + args=(np.arange(6, dtype=np.int32).astype(dtype).reshape([2, 3]),), + expected=np.array( + [[7, 7, 1, 7], [7, 7, 7, 7], [7, 7, 4, 7], [7, 7, 7, 7]], + dtype=dtype)) + @test_util.disable_mlir_bridge('Not supported yet') def testReduce(self): for dtype in set(self.numeric_types).intersection( diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc index a3c2eef993c..d35101a771a 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc @@ -64,14 +64,6 @@ class XlaPadOp : public XlaOpKernel { padding_interior.size(), " vs. ", rank, ")")); auto non_negative = [](int64 x) { return x >= 0; }; - OP_REQUIRES( - context, absl::c_all_of(padding_low, non_negative), - errors::InvalidArgument("padding_low must be non-negative, got [", - absl::StrJoin(padding_low, ","), "]")); - OP_REQUIRES( - context, absl::c_all_of(padding_high, non_negative), - errors::InvalidArgument("padding_high must be non-negative, got [", - absl::StrJoin(padding_high, ","), "]")); OP_REQUIRES( context, absl::c_all_of(padding_interior, non_negative), errors::InvalidArgument("padding_interior must be non-negative, got [", From 770251a7008c1d89e8141b37191e74008e01253d Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 16 Jun 2020 07:34:49 -0700 Subject: [PATCH 0286/1390] Support escaping result memrefs in lhlo_fuse_linalg. So far, we have identified the root computation to fuse into by it writing into a function argument. Now writing into a buffer that is returned also qualifies. 
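This matters in particular for pipelines that lower with escaping results, where the root linalg.generic writes into an allocated buffer that is returned rather than into a function argument. A rough sketch of such a pipeline is below; `createLhloFuseLinalgPass` (and its namespace) is an assumed name for the factory behind the `lhlo-fuse-linalg` pass and may differ from the actual registration:

#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"

// Sketch only: lower HLO to LHLO with result buffers returned from the
// function (results escape), then fuse the produced linalg.generic ops. With
// this change, returned buffers are also treated as fusion roots.
void BuildLoweringPipeline(mlir::PassManager& pm) {
  pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass(
      /*results_escape_functions=*/true));
  // Assumed factory name for the pass registered as "lhlo-fuse-linalg".
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_lhlo::createLhloFuseLinalgPass());
}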
PiperOrigin-RevId: 316677942 Change-Id: I7c3912419606555946c9111d12c4086d086d9456 --- tensorflow/compiler/mlir/xla/BUILD | 1 + .../mlir/xla/tests/lhlo-fuse-linalg.mlir | 81 ++++++++++++++++--- .../mlir/xla/transforms/lhlo_fuse_linalg.cc | 16 +++- 3 files changed, 82 insertions(+), 16 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 8f0f000b26a..43458aab2d3 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -377,6 +377,7 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:TransformUtils", ], alwayslink = 1, diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 063487c00d8..b04c97f42d7 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -1,13 +1,12 @@ -// RUN: xla-opt -lhlo-fuse-linalg %s -o - | FileCheck %s --dump-input=always -// RUN: xla-opt -lhlo-fuse-linalg=tile-sizes=2,3 %s -o - | FileCheck %s -check-prefix=TILED -// RUN: xla-opt -lhlo-fuse-linalg=use-parallel-loops %s -o - | FileCheck %s -check-prefix=PLOOP - +// RUN: xla-opt -lhlo-fuse-linalg %s -split-input-file | FileCheck %s --dump-input=always +// RUN: xla-opt -lhlo-fuse-linalg=tile-sizes=2,3 %s -split-input-file | FileCheck %s -check-prefix=TILED +// RUN: xla-opt -lhlo-fuse-linalg=use-parallel-loops %s -split-input-file | FileCheck %s -check-prefix=PLOOP #map0 = affine_map<(d0, d1) -> (d0, d1)> #pointwise_2d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, %summand_2: memref<6x6xf32>, %result: memref<6x6xf32>) { - %temp_result = alloc() {temp = true} : memref<6x6xf32> + %temp_result = alloc() : memref<6x6xf32> linalg.generic #pointwise_2d_trait %summand_1, %summand_2, %temp_result { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 @@ -19,7 +18,7 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, linalg.yield %out : f32 } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> dealloc %temp_result : memref<6x6xf32> - "xla_lhlo.terminator"() : () -> () + return } // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 @@ -53,10 +52,12 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // PLOOP: linalg.generic // PLOOP: mulf +// ----- + func @fusion_of_three(%arg0: memref<100x10xf32>, %arg1: memref<100xf32>, %arg2: memref<100x10xf32>) { - %0 = alloc() {temp = true} : memref<100x10xf32> + %0 = alloc() : memref<100x10xf32> linalg.generic { args_in = 1 : i64, args_out = 1 : i64, @@ -66,7 +67,7 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, ^bb0(%arg3: f32, %arg4: f32): // no predecessors linalg.yield %arg3 : f32 }: memref<100xf32>, memref<100x10xf32> - %1 = alloc() {temp = true} : memref<100x10xf32> + %1 = alloc() : memref<100x10xf32> linalg.generic { args_in = 2 : i64, args_out = 1 : i64, @@ -126,11 +127,13 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP: linalg.generic // PLOOP: exp -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#pointwise_4d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} +// ----- + +#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#pointwise_4d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32>, %summand_2: memref<6x6x6x6xf32>, %result: memref<6x6x6x6xf32>) { - %temp_result = alloc() {temp = true} : memref<6x6x6x6xf32> + %temp_result = alloc() : memref<6x6x6x6xf32> linalg.generic #pointwise_4d_trait %summand_1, %summand_2, %temp_result { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 @@ -142,7 +145,7 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 linalg.yield %out : f32 } : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>, memref<6x6x6x6xf32> dealloc %temp_result : memref<6x6x6x6xf32> - "xla_lhlo.terminator"() : () -> () + return } // CHECK-LABEL: func @fusion_4d // CHECK: %[[C1:.*]] = constant 1 @@ -177,3 +180,57 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // PLOOP: addf // PLOOP: linalg.generic // PLOOP: mulf + +// ----- + +#map0 = affine_map<(d0, d1) -> (d0, d1)> +#pointwise_2d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} +func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, + %summand_2: memref<6x6xf32>) -> memref<6x6xf32> { + %temp_result = alloc() : memref<6x6xf32> + linalg.generic #pointwise_2d_trait %summand_1, %summand_2, %temp_result { + ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): + %out = addf %summand_1_in, %summand_2_in : f32 + linalg.yield %out : f32 + } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + %result = alloc() : memref<6x6xf32> + linalg.generic #pointwise_2d_trait %temp_result, %multiplier, %result { + ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): + %out = mulf %temp_result_in, %multiplier_in : f32 + linalg.yield %out : f32 + } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + dealloc %temp_result : memref<6x6xf32> + return %result : memref<6x6xf32> +} + +// CHECK-LABEL: func @fusion +// CHECK: %[[C1:.*]] = constant 1 +// CHECK-NOT: linalg.generic +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for +// CHECK: linalg.generic +// CHECK: addf +// CHECK: linalg.generic +// CHECK: mulf + +// TILED-LABEL: func @fusion +// TILED-DAG: %[[C2:.*]] = constant 2 +// TILED-DAG: %[[C3:.*]] = constant 3 +// TILED-NOT: linalg.generic +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for +// TILED: linalg.generic +// TILED: addf +// TILED: linalg.generic +// TILED: mulf + +// PLOOP-LABEL: func @fusion +// PLOOP-NOT: linalg.generic +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel +// PLOOP: linalg.generic +// PLOOP: addf +// PLOOP: linalg.generic +// PLOOP: mulf diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index ddbb672c70a..e16ab571b4d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/transforms/passes.h" @@ -52,10 +53,17 @@ class LhloFuseLinalg : public PassWrapper { // The fusion in Linalg is currently possible only when the consumer op is // tiled. In order to greedily fuse the ops, we have to start from the tiled // root linalg ops, i.e. linalg ops that write to output buffers of the - // function. - llvm::SmallDenseSet func_args; + // function or are returned in case of escaping allocations. + llvm::SmallDenseSet result_buffers; for (auto func_arg : func.getArguments()) { - func_args.insert(func_arg); + result_buffers.insert(func_arg); + } + for (auto& block : func.getBlocks()) { + auto returnOp = mlir::dyn_cast(block.getTerminator()); + if (!returnOp) continue; + for (auto operand : returnOp.getOperands()) { + result_buffers.insert(operand); + } } MLIRContext* ctx = func.getContext(); OpBuilder b(func); @@ -68,7 +76,7 @@ class LhloFuseLinalg : public PassWrapper { } auto op = cast(generic_op.getOperation()); for (const Value result : op.getOutputBuffers()) { - if (!func_args.count(result)) continue; + if (!result_buffers.count(result)) continue; if (tileGenericOp(op, tile_sizes, &b)) { generic_op.erase(); return; From 426f62af5eb80e5f0c3b660451dac6f953b4ca0c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 07:36:58 -0700 Subject: [PATCH 0287/1390] Add an explicit distribution_strategy attribute to TrackableHandler. Since it's treated as a weight by Keras, it needs to pass DistStrat checks during compile. PiperOrigin-RevId: 316678228 Change-Id: I132168f1ca3dd3729d7a499cef3564c5e04abb34 --- tensorflow/python/keras/engine/base_layer_utils.py | 1 + .../layers/preprocessing/index_lookup_distribution_test.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index cb2b9ed87f1..de67080af66 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -774,6 +774,7 @@ class TrackableWeightHandler(object): if not isinstance(trackable, tracking.Trackable): raise ValueError('%s is not a Trackable object.' % (trackable,)) self._trackable = trackable + self._distribute_strategy = distribution_strategy_context.get_strategy() # TODO(b/141682913): Figure out why this is private and fix it. 
saveables = trackable._gather_saveables_for_checkpoint().values() # pylint: disable=protected-access diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py index 098e67f5f6b..c593cd41c85 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py @@ -44,7 +44,7 @@ def get_layer_class(): @combinations.generate( combinations.combine( distribution=strategy_combinations.all_strategies, - mode=["eager", "graph"])) + mode=["eager"])) # Eager-only, no graph: b/158793009 class IndexLookupDistributionTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -74,6 +74,7 @@ class IndexLookupDistributionTest( layer.adapt(vocab_dataset) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) + model.compile(loss="mse") output_dataset = model.predict(input_dataset) self.assertAllEqual(expected_output, output_dataset) From cb60e1c14b1470b41f60a905ced1176763dfc9ee Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 07:41:44 -0700 Subject: [PATCH 0288/1390] Introduces a new experimental package that: - Defines a schema for configuring delegates - Defines a C++ plugin mechanism using the schema, so that code can support configuring arbitrary delegates without a build-time dependency PiperOrigin-RevId: 316678829 Change-Id: I36ce8a6175b550d83dfe9cf1f237a04173fb8b16 --- .../acceleration/configuration/BUILD | 165 ++++++++++++++ .../configuration/configuration.proto | 208 ++++++++++++++++++ .../configuration/delegate_registry.cc | 60 +++++ .../configuration/delegate_registry.h | 95 ++++++++ .../acceleration/configuration/gpu_plugin.cc | 62 ++++++ .../configuration/hexagon_plugin.cc | 73 ++++++ .../configuration/nnapi_plugin.cc | 93 ++++++++ .../configuration/nnapi_plugin_test.cc | 175 +++++++++++++++ .../configuration/proto_to_flatbuffer.cc | 58 +++++ .../configuration/proto_to_flatbuffer.h | 32 +++ 10 files changed, 1021 insertions(+) create mode 100644 tensorflow/lite/experimental/acceleration/configuration/BUILD create mode 100644 tensorflow/lite/experimental/acceleration/configuration/configuration.proto create mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h create mode 100644 tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc create mode 100644 tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h diff --git a/tensorflow/lite/experimental/acceleration/configuration/BUILD b/tensorflow/lite/experimental/acceleration/configuration/BUILD new file mode 100644 index 00000000000..1bfd2494fe1 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/BUILD @@ -0,0 +1,165 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library", "flatbuffer_java_library", "flatc_path") + +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +genrule( + name = "configuration_schema", + srcs = ["configuration.proto"], + outs = ["configuration.fbs"], + # We rename the namespace since otherwise the proto classes and flatbuffer + # classes would have the same names. + cmd = """ + $(location {}) --proto -o $(@D) $(location :configuration.proto) + perl -p -i -e 's/tflite.proto/tflite/' $(@D)/configuration.fbs + """.format(flatc_path), + tools = [ + flatc_path, + ], +) + +genrule( + name = "configuration_fbs_contents_cc", + srcs = ["configuration.fbs"], + outs = ["configuration_fbs_contents-inl.h"], + cmd = """ + echo 'constexpr char configuration_fbs_contents[] = R"Delimiter(' > $(@) + cat < $(<) >> $(@) + echo ')Delimiter";' >> $(@) + """, +) + +proto_library( + name = "configuration_proto", + srcs = [ + "configuration.proto", + ], +) + +cc_proto_library( + name = "configuration_cc_proto", + deps = [":configuration_proto"], +) + +java_lite_proto_library( + name = "configuration_java_proto_lite", + deps = [":configuration_proto"], +) + +flatbuffer_cc_library( + name = "configuration_fbs", + srcs = [":configuration.fbs"], +) + +flatbuffer_java_library( + name = "configuration_fbs_java", + srcs = [":configuration.fbs"], +) + +cc_library( + name = "proto_to_flatbuffer", + srcs = [ + "configuration_fbs_contents-inl.h", + "proto_to_flatbuffer.cc", + ], + hdrs = ["proto_to_flatbuffer.h"], + deps = [ + ":configuration_cc_proto", + ":configuration_fbs", + "//tensorflow/core/platform:protobuf", + "//tensorflow/lite:minimal_logging", + "@flatbuffers", + ], +) + +cc_library( + name = "delegate_registry", + srcs = ["delegate_registry.cc"], + hdrs = ["delegate_registry.h"], + deps = [ + ":configuration_fbs", + "//tensorflow/lite/c:common", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "nnapi_plugin", + srcs = ["nnapi_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, # For registration to always run. 
+) + +cc_test( + name = "nnapi_plugin_test", + srcs = ["nnapi_plugin_test.cc"], + tags = [ + "no_mac", + "no_windows", + "tflite_not_portable_ios", + ], + deps = [ + ":configuration_fbs", + ":delegate_registry", + ":nnapi_plugin", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate_mock_test", + "//tensorflow/lite/kernels:test_util", + "@com_google_googletest//:gtest_main", + "@flatbuffers", + ], +) + +cc_library( + name = "hexagon_plugin", + srcs = ["hexagon_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "@com_google_absl//absl/memory", + ] + select({ + "//tensorflow:android": [ + "//tensorflow/lite/delegates/hexagon:hexagon_delegate", + ], + "//conditions:default": [], + }), + alwayslink = 1, # For registration to always run. +) + +cc_library( + name = "gpu_plugin", + srcs = ["gpu_plugin.cc"], + deps = [ + ":configuration_fbs", + ":delegate_registry", + "//tensorflow/lite/delegates/gpu:delegate", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, # For registration to always run. +) diff --git a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto new file mode 100644 index 00000000000..e1c49f02856 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto @@ -0,0 +1,208 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This schema defines how to configure TFLite for delegation. These +// definitions can be used in multiple ways: as output of a compatibility list, +// in benchmarking tools and to decouple delegate instantiation from code. +// +// The schema is work-in-progress, covering the most broadly used delegates and +// options. + +syntax = "proto2"; + +package tflite.proto; + +// ExecutionPreference is used to match accelerators against the preferences of +// the current application or usecase. Some of the values here can appear both +// in the compatibility list and as input, some only as input. +// +// These are separate from NNAPIExecutionPreference - the compatibility list +// design doesn't assume a one-to-one mapping between which usecases +// compatibility list entries have been developed for and what settings are used +// for NNAPI. +enum ExecutionPreference { + // Match any selected preference. Whitelist (semantically - value is same as + // on input). + ANY = 0; + // Match low latency preference. Both compatibility list and input. + LOW_LATENCY = 1; + // Math low power preference. Both compatibility list and input. + LOW_POWER = 2; + // Never accelerate. Can be used for input to compatibility list or for + // standalone Acceleration configuration. + FORCE_CPU = 3; +} + +// TFLite delegate to use. 
+enum Delegate { + NONE = 0; + NNAPI = 1; + GPU = 2; + HEXAGON = 3; + XNNPACK = 4; + // TODO(b/157893534): Support exposing edgetpu tflite delegate creation + // options. + EDGETPU = 5; +} + +enum NNAPIExecutionPreference { + // Undefined. + UNDEFINED = 0; + // Prefer executing in a way that minimizes battery drain. + NNAPI_LOW_POWER = 1; + // Prefer returning a single answer as fast as possible, even if this causes + // more power consumption. + NNAPI_FAST_SINGLE_ANSWER = 2; + // Prefer maximizing the throughput of successive frames, for example when + // processing successive frames coming from the camera. + NNAPI_SUSTAINED_SPEED = 3; +} + +// One possible acceleration configuration. +message ComputeSettings { + // Which preference to use this accelerator for. + optional ExecutionPreference preference = 1; + // How to configure TFLite + optional TFLiteSettings tflite_settings = 2; + // Identifiers to use for instrumentation and telemetry. + optional string model_namespace_for_statistics = 3; + optional string model_identifier_for_statistics = 4; +} + +// NNAPI delegate settings. +message NNAPISettings { + // Which instance (NNAPI accelerator) to use. One driver may provide several + // accelerators (though a driver may also hide several back-ends behind one + // name, at the choice of the driver vendor). + // Note that driver introspection is only available in Android Q and later. + optional string accelerator_name = 1; + + // NNAPI model compilation caching settings to be passed to + // tflite::StatefulNnApiDelegate + optional string cache_directory = 2; + optional string model_token = 3; + + // NNAPI execution preference to pass. See + // https://developer.android.com/ndk/reference/group/neural-networks.html + optional NNAPIExecutionPreference execution_preference = 4; + + // Number of instances to cache for the same model (for input size + // changes). This is mandatory for getting reasonable performance in that + // case. + optional int32 no_of_nnapi_instances_to_cache = 5; + + // Whether to automatically fall back to TFLite CPU path. + optional FallbackSettings fallback_settings = 6; + + // Whether to allow use of NNAPI CPU (nnapi-reference accelerator) on Android + // 10+ when an accelerator name is not specified. The NNAPI CPU typically + // performs less well than the TfLite built-in kernels; but allowing allows a + // model to be partially accelerated which may be a win. + optional bool allow_nnapi_cpu_on_android_10_plus = 7; +} + +// Which GPU backend to select. Default behaviour on Android is to try OpenCL +// and if it's not available fall back to OpenGL. +enum GPUBackend { + UNSET = 0; + OPENCL = 1; + OPENGL = 2; + // Not yet supported. + // VULKAN = 3; + // METAL = 4; +} + +// GPU Delegate settings. +// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/gpu/delegate.h +message GPUSettings { + optional bool is_precision_loss_allowed = 1; + optional bool enable_quantized_inference = 2 [default = true]; + optional GPUBackend force_backend = 3; + // TODO(b/152019007): add remaining options. +} + +// Hexagon Delegate settings. +// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/hexagon/hexagon_delegate.h +message HexagonSettings { + optional int32 debug_level = 1; + optional int32 powersave_level = 2; + optional bool print_graph_profile = 3; + optional bool print_graph_debug = 4; +} + +// XNNPack Delegate settings. 
+// +// See +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h +message XNNPackSettings { + optional int32 num_threads = 1; +} + +message CPUSettings { + optional int32 num_threads = 1; +} + +// How to configure TFLite. +message TFLiteSettings { + // Which delegate to use. + optional Delegate delegate = 1; + + // How to configure the chosen delegate. + // (In principle we would like to use 'oneof', but flatc turns that into an + // nested anonymous table rather than a union. See + // https://github.com/google/flatbuffers/issues/4628). + optional NNAPISettings nnapi_settings = 2; + optional GPUSettings gpu_settings = 3; + optional HexagonSettings hexagon_settings = 4; + optional XNNPackSettings xnnpack_settings = 5; + + // How to configure CPU execution. + optional CPUSettings cpu_settings = 6; + + // Shared delegation settings. + optional int32 max_delegated_partitions = 7; +} + +// Whether to automatically fallback to TFLite CPU path on delegation errors. +// +// Typically fallback is enabled in production use but disabled in tests and +// benchmarks to ensure they test the intended path. +message FallbackSettings { + // Whether to allow automatically falling back to TfLite CPU path on + // compilation failure. Default is not allowing automatic fallback. + // + // This is useful in naive production usecases where the caller would prefer + // for the model to run even if it's not accelerated. More advanced users will + // implement fallback themselves; e.g., by using a different model on CPU. + // + // Note that compilation errors may occur either at initial + // ModifyGraphWithDelegate() time, or when calling AllocateTensors() after + // resizing. + optional bool allow_automatic_fallback_on_compilation_error = 7; + // Whether to allow automatically falling back to TfLite CPU path on + // execution error. Default is not allowing automatic fallback. + // + // Experimental, use with care (only when you have complete control over the + // client code). + // + // The caveat above for compilation error holds. Additionally, execution-time + // errors are harder to handle automatically as they require invalidating the + // TfLite interpreter which most client code has not been designed to deal + // with. + optional bool allow_automatic_fallback_on_execution_error = 8; +} diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc new file mode 100644 index 00000000000..b8d80342d5f --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +#include "absl/synchronization/mutex.h" + +namespace tflite { +namespace delegates { + +void DelegatePluginRegistry::RegisterImpl( + const std::string& name, + std::function< + std::unique_ptr(const TFLiteSettings&)> + creator_function) { + absl::MutexLock lock(&mutex_); + factories_[name] = creator_function; +} + +std::unique_ptr DelegatePluginRegistry::CreateImpl( + const std::string& name, const TFLiteSettings& settings) { + absl::MutexLock lock(&mutex_); + auto it = factories_.find(name); + if (it != factories_.end()) { + return it->second(settings); + } else { + return nullptr; + } +} + +DelegatePluginRegistry* DelegatePluginRegistry::GetSingleton() { + static auto* instance = new DelegatePluginRegistry(); + return instance; +} + +std::unique_ptr DelegatePluginRegistry::CreateByName( + const std::string& name, const TFLiteSettings& settings) { + auto* const instance = DelegatePluginRegistry::GetSingleton(); + return instance->CreateImpl(name, settings); +} + +DelegatePluginRegistry::Register::Register(const std::string& name, + CreatorFunction creator_function) { + auto* const instance = DelegatePluginRegistry::GetSingleton(); + instance->RegisterImpl(name, creator_function); +} + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h new file mode 100644 index 00000000000..c86759dcc3f --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ + +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" + +// Defines an interface for TFLite delegate plugins. +// +// The acceleration library aims to support all TFLite delegates based on +// configuration expressed as data (flatbuffers). However, consumers tend to +// care about size and also use a subset of delegates. Hence we don't want to +// statically build against all delegates. +// +// This interface allows plugins to handle specific delegates. +// +// Goal of this interface is not to abstract away all the differences between +// delegates. The goal is only to avoid static linking. +// +// Note to implementers: this interface may change if new delegates don't fit +// into the same design. 
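+//
+// Rough usage sketch (MyPlugin and `settings` are illustrative placeholders;
+// `settings` stands for a const TFLiteSettings& parsed from the flatbuffer):
+//
+//   // In the plugin's translation unit, register a factory under a name:
+//   TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(MyPlugin, MyPlugin::New);
+//
+//   // In client code, look the plugin up by name and create the delegate:
+//   auto plugin = DelegatePluginRegistry::CreateByName("MyPlugin", settings);
+//   if (plugin != nullptr) {
+//     TfLiteDelegatePtr delegate = plugin->Create();
+//   }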
+namespace tflite { +namespace delegates { + +// Same w/ Interpreter::TfLiteDelegatePtr to avoid pulling +// tensorflow/lite/interpreter.h dependency +using TfLiteDelegatePtr = + std::unique_ptr; + +class DelegatePluginInterface { + public: + virtual TfLiteDelegatePtr Create() = 0; + virtual int GetDelegateErrno(TfLiteDelegate* from_delegate) = 0; + virtual ~DelegatePluginInterface() = default; +}; + +// A stripped-down registry that allows delegate plugins to be created by name. +// +// Limitations: +// - Doesn't allow deregistration. +// - Doesn't check for duplication registration. +// +class DelegatePluginRegistry { + public: + typedef std::function( + const TFLiteSettings&)> + CreatorFunction; + // Returns a DelegatePluginInterface registered with `name` or nullptr if no + // matching plugin found. + // TFLiteSettings is per-plugin, so that the corresponding delegate options + // data lifetime is maintained. + static std::unique_ptr CreateByName( + const std::string& name, const TFLiteSettings& settings); + + // Struct to be statically allocated for registration. + struct Register { + Register(const std::string& name, CreatorFunction creator_function); + }; + + private: + void RegisterImpl(const std::string& name, CreatorFunction creator_function); + std::unique_ptr CreateImpl( + const std::string& name, const TFLiteSettings& settings); + static DelegatePluginRegistry* GetSingleton(); + std::unordered_map factories_; + absl::Mutex mutex_; +}; + +} // namespace delegates +} // namespace tflite + +#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f) \ + static auto* g_delegate_plugin_##name##_ = \ + new DelegatePluginRegistry::Register(#name, f); +#define TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(name, f) \ + TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION_VNAME(name, f); + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ diff --git a/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc new file mode 100644 index 00000000000..25b8171c5ea --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/gpu_plugin.cc @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/gpu/delegate.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +namespace tflite { +namespace delegates { +class GpuPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { + return TfLiteDelegatePtr(TfLiteGpuDelegateV2Create(&options_), + TfLiteGpuDelegateV2Delete); + } + int GetDelegateErrno(TfLiteDelegate* from_delegate) override { return 0; } + static std::unique_ptr New( + const TFLiteSettings& acceleration) { + return absl::make_unique(acceleration); + } + explicit GpuPlugin(const TFLiteSettings& tflite_settings) + : options_(TfLiteGpuDelegateOptionsV2Default()) { + const auto* gpu_settings = tflite_settings.gpu_settings(); + if (gpu_settings) { + options_.inference_priority1 = + gpu_settings->is_precision_loss_allowed() + ? TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY + : TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; + if (gpu_settings->enable_quantized_inference()) { + options_.experimental_flags |= + TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + } + if (gpu_settings->force_backend() == GPUBackend_OPENCL) { + options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY; + } else if (gpu_settings->force_backend() == GPUBackend_OPENGL) { + options_.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY; + } + } + } + + private: + TfLiteGpuDelegateOptionsV2 options_; +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(GpuPlugin, GpuPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc new file mode 100644 index 00000000000..7f2674604b0 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/hexagon_plugin.cc @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +#if defined(__ARM_ARCH) +#include "tensorflow/lite/delegates/hexagon/hexagon_delegate.h" +#endif + +namespace tflite { +namespace delegates { +class HexagonPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { +#if defined(__ARM_ARCH) + TfLiteHexagonInit(); + auto* delegate_ptr = TfLiteHexagonDelegateCreate(&options_); + TfLiteDelegatePtr delegate(delegate_ptr, [](TfLiteDelegate* delegate) { + TfLiteHexagonDelegateDelete(delegate); + TfLiteHexagonTearDown(); + }); + return delegate; +#else // !defined(__ARM_ARCH) + return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); +#endif // defined(__ARM_ARCH) + } + int GetDelegateErrno(TfLiteDelegate* /* from_delegate */) override { + return 0; + } + static std::unique_ptr New( + const TFLiteSettings& tflite_settings) { + return absl::make_unique(tflite_settings); + } + explicit HexagonPlugin(const TFLiteSettings& tflite_settings) { + const HexagonSettings* settings = tflite_settings.hexagon_settings(); +#if defined(__ARM_ARCH) + options_ = TfLiteHexagonDelegateOptions({0}); + if (settings) { + options_.debug_level = settings->debug_level(); + options_.powersave_level = settings->powersave_level(); + options_.print_graph_profile = settings->print_graph_profile(); + options_.print_graph_debug = settings->print_graph_debug(); + } +#else + (void)settings; +#endif + } + + private: +#if defined(__ARM_ARCH) + TfLiteHexagonDelegateOptions options_; +#endif +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(HexagonPlugin, HexagonPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc new file mode 100644 index 00000000000..7301983a815 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin.cc @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" + +namespace tflite { +namespace delegates { + +inline tflite::StatefulNnApiDelegate::Options::ExecutionPreference +ConvertExecutionPrefence( + NNAPIExecutionPreference from_compatibility_preference) { + using TflitePreference = + tflite::StatefulNnApiDelegate::Options::ExecutionPreference; + switch (from_compatibility_preference) { + case NNAPIExecutionPreference_NNAPI_LOW_POWER: + return TflitePreference::kLowPower; + case NNAPIExecutionPreference_NNAPI_FAST_SINGLE_ANSWER: + return TflitePreference::kFastSingleAnswer; + case NNAPIExecutionPreference_NNAPI_SUSTAINED_SPEED: + return TflitePreference::kSustainedSpeed; + default: + return TflitePreference::kUndefined; + } +} + +class NnapiPlugin : public DelegatePluginInterface { + public: + TfLiteDelegatePtr Create() override { + auto nnapi_delegate = + absl::make_unique(options_); + return TfLiteDelegatePtr( + nnapi_delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + } + int GetDelegateErrno(TfLiteDelegate* from_delegate) override { + auto nnapi_delegate = + reinterpret_cast(from_delegate); + return nnapi_delegate->GetNnApiErrno(); + } + static std::unique_ptr New( + const TFLiteSettings& tflite_settings) { + return absl::make_unique(tflite_settings); + } + explicit NnapiPlugin(const TFLiteSettings& tflite_settings) { + const NNAPISettings* nnapi_settings = tflite_settings.nnapi_settings(); + if (!nnapi_settings) return; + if (nnapi_settings->accelerator_name() && + nnapi_settings->accelerator_name()->Length() != 0) { + accelerator_ = nnapi_settings->accelerator_name()->str(); + options_.accelerator_name = accelerator_.c_str(); + } + if (nnapi_settings->cache_directory() && + nnapi_settings->cache_directory()->Length() != 0) { + cache_dir_ = nnapi_settings->cache_directory()->str(); + options_.cache_dir = cache_dir_.c_str(); + } + if (nnapi_settings->model_token() && + nnapi_settings->model_token()->Length() != 0) { + model_token_ = nnapi_settings->model_token()->str(); + options_.model_token = model_token_.c_str(); + } + options_.execution_preference = + ConvertExecutionPrefence(nnapi_settings->execution_preference()); + options_.disallow_nnapi_cpu = + !nnapi_settings->allow_nnapi_cpu_on_android_10_plus(); + } + + private: + std::string accelerator_, cache_dir_, model_token_; + tflite::StatefulNnApiDelegate::Options options_; +}; + +TFLITE_REGISTER_DELEGATE_FACTORY_FUNCTION(NnapiPlugin, NnapiPlugin::New); + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc new file mode 100644 index 00000000000..4f9f5dd08c1 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/nnapi_plugin_test.cc @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" +#include "tensorflow/lite/experimental/acceleration/configuration/delegate_registry.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/test_util.h" + +// Tests for checking that the NNAPI Delegate plugin correctly handles all the +// options from the flatbuffer. +// +// Checking done at NNAPI call level, as that is where we have a mockable +// layer. +namespace tflite { +namespace { + +using delegate::nnapi::NnApiMock; + +class SingleAddOpModel : tflite::SingleOpModel { + public: + void Build() { + int input = AddInput({tflite::TensorType_FLOAT32, {1, 2, 2}}); + int constant = AddConstInput({tflite::TensorType_FLOAT32, {1, 2, 2}}, + {1.0f, 1.0f, 1.0f, 1.0f}); + AddOutput({tflite::TensorType_FLOAT32, {}}); + + SetBuiltinOp(tflite::BuiltinOperator_ADD, tflite::BuiltinOptions_AddOptions, + tflite::CreateAddOptions(builder_).Union()); + BuildInterpreter({GetShape(input), GetShape(constant)}); + } + + tflite::Interpreter* Interpreter() const { return interpreter_.get(); } +}; + +class NNAPIPluginTest : public ::testing::Test { + protected: + NNAPIPluginTest() : delegate_(nullptr, [](TfLiteDelegate*) {}) {} + void SetUp() override { + nnapi_ = const_cast(NnApiImplementation()); + nnapi_mock_ = absl::make_unique(nnapi_); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + return 0; + }; + model_.Build(); + } + template + void CheckExecutionPreference() { + // Note - this uses a template since the NNAPI functions are C function + // pointers rather than lambdas so can't capture variables. + nnapi_->ANeuralNetworksCompilation_setPreference = + [](ANeuralNetworksCompilation* compilation, int32_t preference) { + return preference - output; + }; + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, input)); + // Since delegation succeeds, the model becomes immutable and hence can't + // reuse it. 
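+    // A fresh model is therefore built for every checked (input, output)
+    // pair. The mocked setPreference above returns `preference - output`,
+    // so the kTfLiteOk expectation below holds only if the plugin mapped
+    // `input` to the expected NNAPI preference value.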
+ SingleAddOpModel model; + model.Build(); + EXPECT_EQ(model.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk) + << " given input: " << input << " expected output: " << output; + } + + void CreateDelegate(flatbuffers::Offset settings) { + settings_ = flatbuffers::GetTemporaryPointer( + fbb_, CreateTFLiteSettings(fbb_, tflite::Delegate_NNAPI, settings)); + + plugin_ = delegates::DelegatePluginRegistry::CreateByName("NnapiPlugin", + *settings_); + delegate_ = plugin_->Create(); + } + + NnApi* nnapi_; + std::unique_ptr nnapi_mock_; + SingleAddOpModel model_; + flatbuffers::FlatBufferBuilder fbb_; + const TFLiteSettings* settings_ = nullptr; + delegates::TfLiteDelegatePtr delegate_; + std::unique_ptr plugin_; +}; + +TEST_F(NNAPIPluginTest, PassesAcceleratorName) { + // Fails with non-existent "foo". + CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("foo"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteDelegateError); + + // Succeeds with "test-device" supported by the mock. + CreateDelegate(CreateNNAPISettings(fbb_, fbb_.CreateString("test-device"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesExecutionPreference) { + CheckExecutionPreference(); + CheckExecutionPreference(); + CheckExecutionPreference(); + CheckExecutionPreference(); +} + +TEST_F(NNAPIPluginTest, PassesCachingParameters) { + nnapi_->ANeuralNetworksCompilation_setCaching = + [](ANeuralNetworksCompilation* compilation, const char* cacheDir, + const uint8_t* token) -> int { + if (std::string(cacheDir) != "d") return 1; + // Token is hashed with other bits, just check that it's not empty. + if (std::string(reinterpret_cast(token)).empty()) return 2; + return 0; + }; + CreateDelegate(CreateNNAPISettings(fbb_, 0, fbb_.CreateString("d"), + fbb_.CreateString("t"))); + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesFalseNNAPICpuFlag) { + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, + NNAPIExecutionPreference_UNDEFINED, 0, 0, + /* allow CPU */ false)); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + // Since no CPU, should only pass one device. + return numDevices - 1; + }; + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +TEST_F(NNAPIPluginTest, PassesTrueNNAPICpuFlag) { + CreateDelegate(CreateNNAPISettings(fbb_, 0, 0, 0, + NNAPIExecutionPreference_UNDEFINED, 0, 0, + /* allow CPU */ true)); + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices = + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + supportedOps[0] = true; + // With CPU allowed, should pass two devices. 
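+      // (A zero result is the NNAPI success code, so this effectively
+      // asserts that exactly two devices were passed.)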
+ return numDevices - 2; + }; + EXPECT_EQ(model_.Interpreter()->ModifyGraphWithDelegate(delegate_.get()), + kTfLiteOk); +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc new file mode 100644 index 00000000000..709bb70ca70 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h" + +#include + +#include "flatbuffers/idl.h" // from @flatbuffers +#include "flatbuffers/util.h" // from @flatbuffers +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/lite/minimal_logging.h" + +namespace tflite { + +namespace { +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_fbs_contents-inl.h" +} + +const ComputeSettings* ConvertFromProto( + flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings) { + std::string json; + tensorflow::protobuf::util::JsonPrintOptions options; + options.preserve_proto_field_names = true; + options.always_print_primitive_fields = true; // For catching problems. + auto status = tensorflow::protobuf::util::MessageToJsonString(proto_settings, + &json, options); + if (!status.ok()) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to convert to Json: %s", + status.ToString().c_str()); + return nullptr; + } + if (!parser->Parse(configuration_fbs_contents)) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse schema: %s", + parser->error_.c_str()); + return nullptr; + } + parser->SetRootType("tflite.ComputeSettings"); + if (!parser->Parse(json.c_str())) { + TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Failed to parse json: %s", + parser->error_.c_str()); + return nullptr; + } + return flatbuffers::GetRoot( + parser->builder_.GetBufferPointer()); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h new file mode 100644 index 00000000000..3b69e8465a5 --- /dev/null +++ b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ + +#include "flatbuffers/idl.h" // from @flatbuffers +#include "tensorflow/lite/experimental/acceleration/configuration/configuration.pb.h" +#include "tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h" + +namespace tflite { + +// Converts the protobuf version ComputeSettings to the flatbuffer version, via +// json. The parser is used for state - the returned pointer is valid only as +// long as the parser is kept alive and unmutated. +const ComputeSettings* ConvertFromProto( + flatbuffers::Parser* parser, const proto::ComputeSettings& proto_settings); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_CONFIGURATION_PROTO_TO_FLATBUFFER_H_ From bedc750c7d06c79ca61b53c6eeaba727ba7e886a Mon Sep 17 00:00:00 2001 From: Meteorix Date: Tue, 16 Jun 2020 23:38:53 +0800 Subject: [PATCH 0289/1390] fix LaunchDepthwiseConvBackpropFilterOp --- tensorflow/core/kernels/depthwise_conv_op_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h index f712d9c48c0..f69878bea89 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h @@ -1763,7 +1763,7 @@ void LaunchDepthwiseConvBackpropFilterOp::operator()( int num_filter_backprop = args.filter_rows * args.filter_cols * args.out_depth; se::DeviceMemoryBase filter_bp_ptr(filter_backprop, num_filter_backprop); - stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); + stream->ThenMemZero(&filter_bp_ptr, num_filter_backprop * sizeof(T)); if (args.filter_rows == 3 && args.filter_cols == 3) { OP_REQUIRES_OK( From 878ac5ae83120cd5f2d013ee3dd08b8d1cd14040 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Tue, 16 Jun 2020 08:54:22 -0700 Subject: [PATCH 0290/1390] Add fused/non-fused inference/training overhead benchmarks for BatchNormalization. 
PiperOrigin-RevId: 316690878 Change-Id: I36a0c8595b973657ae3cb8f95c11ba797cc4dcab --- .../benchmark/eager_microbenchmarks_test.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/benchmark/eager_microbenchmarks_test.py b/tensorflow/python/keras/benchmark/eager_microbenchmarks_test.py index 9c755286ee0..b4d0837c326 100644 --- a/tensorflow/python/keras/benchmark/eager_microbenchmarks_test.py +++ b/tensorflow/python/keras/benchmark/eager_microbenchmarks_test.py @@ -246,10 +246,40 @@ class MicroBenchmarksBase(test.Benchmark): self._run(fn, 10000) - def benchmark_layers_normalization_batch_normalization_overhead(self): + def benchmark_layers_batch_norm_fused_inf(self): - layer = normalization.BatchNormalization() - x = array_ops.ones((1, 1)) + layer = normalization.BatchNormalization(fused=True) + x = array_ops.ones((1, 1, 1, 1)) + + def fn(): + layer(x) + + self._run(fn, 10000) + + def benchmark_layers_batch_norm_fused_train(self): + + layer = normalization.BatchNormalization(fused=True) + x = array_ops.ones((1, 1, 1, 1)) + + def fn(): + layer(x, training=True) + + self._run(fn, 10000) + + def benchmark_layers_batch_norm_nonfused_inf(self): + + layer = normalization.BatchNormalization(fused=False) + x = array_ops.ones((1, 1, 1, 1)) + + def fn(): + layer(x) + + self._run(fn, 10000) + + def benchmark_layers_batch_norm_nonfused_train(self): + + layer = normalization.BatchNormalization(fused=False) + x = array_ops.ones((1, 1, 1, 1)) def fn(): layer(x, training=True) From 8a081946857e0951fd206daf69d159df8cdca34b Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Tue, 16 Jun 2020 08:55:28 -0700 Subject: [PATCH 0291/1390] Add verifier for tf.BatchToSpace. The verifier checks for input, output and crops for whether the op is valid or not. If the contents of crops can be determined, its values will be used directly in checking the input and output shapes. PiperOrigin-RevId: 316691063 Change-Id: Idd495d2102604e267cdaa5f45a21c0bfa3dcbcb0 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 4 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 143 ++++++++++++++++++ .../mlir/tensorflow/tests/tf-ops.mlir | 137 +++++++++++++++++ 3 files changed, 284 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index c51e9f4d026..7db3539fcef 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -820,6 +820,10 @@ followed by cropping along the `height` and `width` dimensions. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_BatchToSpaceNDOp : TF_Op<"BatchToSpaceND", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index f4e5dc05eb0..8410929a19f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -695,6 +695,149 @@ void BatchMatMulV2Op::getCanonicalizationPatterns( results.insert(context); } +//===----------------------------------------------------------------------===// +// BatchToSpaceOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BatchToSpaceOp op) { + // Op already has a constraint that block_size >= 2. 
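+  // With block_size b and crops [[crop_top, crop_bottom], [crop_left,
+  // crop_right]], the checks below enforce, for statically known dims:
+  //   output_batch  == input_batch / (b * b)
+  //   output_height == input_height * b - crop_top - crop_bottom
+  //   output_width  == input_width * b - crop_left - crop_right
+  //   output_depth  == input_depth
+  // e.g. a 36x8x8x8 input with b = 3 and crops [[1, 2], [3, 4]] verifies
+  // against a 4x21x17x8 output (see the tests added in tf-ops.mlir).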
+ int64_t block_size = op.block_size().getSExtValue(); + + llvm::SmallVector input_shape(4, ShapedType::kDynamicSize); + auto input_type = op.input().getType().cast(); + if (input_type.hasRank()) { + if (input_type.getRank() != 4) + return op.emitOpError() + << "requires input to be a 4D tensor, but got " << input_type; + + int64_t input_batch = input_type.getDimSize(0); + if (input_batch != ShapedType::kDynamicSize && + input_batch % (block_size * block_size) != 0) { + return op.emitOpError() + << "requires input batch (dimension 0) to be evenly divisible " + "by (block_size * block_size), but got input batch " + << input_batch << " and block_size " << block_size; + } + + input_shape.assign(input_type.getShape().begin(), + input_type.getShape().end()); + } + + auto crops_type = op.crops().getType().cast(); + if (crops_type.hasRank()) { + if (crops_type.getRank() != 2) + return op.emitOpError() + << "requires crops to be a 2D tensor, but got " << crops_type; + + auto dim_of_size = [&](int64_t dim, int64_t size) { + if (crops_type.isDynamicDim(dim)) return true; + return crops_type.getDimSize(dim) == size; + }; + if (!dim_of_size(0, 2) || !dim_of_size(1, 2)) + return op.emitOpError() + << "requires crops to be a tensor<2x2>, but got " << crops_type; + } + + DenseIntElementsAttr crops_attr; + // Crops are defined as [[crop_top, crop_bottom], [crop_left, crop_right]], + // and flattened as [crop_top, crop_bottom, crop_left, crop_right] + llvm::SmallVector crops_values; + if (matchPattern(op.crops(), m_Constant(&crops_attr))) { + assert(crops_attr.getNumElements() == 4 && + "tf.BatchToSpace crops must have 4 elements"); + + auto crops_range = crops_attr.getIntValues(); + for (const auto &crops_value : crops_range) { + int64_t crops_value_int = crops_value.getSExtValue(); + if (crops_value_int < 0) + return op.emitOpError() + << "requires all crop values to be nonnegative, but got " + << crops_attr; + + crops_values.push_back(crops_value_int); + } + } + + auto output_type = op.output().getType().cast(); + if (output_type.hasRank()) { + if (output_type.getRank() != 4) + return op.emitOpError() + << "requires output to be a 4D tensor, but got " << output_type; + + auto static_dims = [](int64_t dim_a, int64_t dim_b) { + return dim_a != ShapedType::kDynamicSize && + dim_b != ShapedType::kDynamicSize; + }; + + auto output_shape = output_type.getShape(); + + // output batch = input batch / (block_size * block_size). + int64_t input_batch = input_shape[0]; + int64_t output_batch = output_shape[0]; + if (static_dims(input_batch, output_batch) && + (output_batch * block_size * block_size) != input_batch) + return op.emitOpError() + << "requires output batch (dimension 0) to be equal to input " + "batch (dimension 0) / (block_size * block_size), but got " + "output batch " + << output_batch << ", input batch " << input_batch + << ", and block_size " << block_size; + + auto check_spatial_dim = [&](int64_t spatial_dim_index, + llvm::StringRef dim_name, + llvm::StringRef crop_a_name, + llvm::StringRef crop_b_name) -> LogicalResult { + int64_t input_dim = input_shape[spatial_dim_index]; + int64_t output_dim = output_shape[spatial_dim_index]; + if (!static_dims(input_dim, output_dim)) return success(); + + int64_t input_dim_pad = input_dim * block_size; + // If crops are unknown, the maximum output spatial dim size is input + // spatial dim size * block_size, as crops can be minimum 0. 
+ if (crops_values.empty() && output_dim > input_dim * block_size) + return op.emitOpError() + << "requires output " << dim_name << " (dimension " + << spatial_dim_index << ") to be less than or equal to input " + << dim_name << " (dimension " << spatial_dim_index + << ") * block_size, but got output " << dim_name << " " + << output_dim << ", input " << dim_name << " " << input_dim + << ", and block_size " << block_size; + + if (!crops_values.empty()) { + // output spatial dim = input spatial dim * block_size - crops. + int64_t crop_a = crops_values[2 * (spatial_dim_index - 1)]; + int64_t crop_b = crops_values[2 * (spatial_dim_index - 1) + 1]; + if (output_dim != input_dim_pad - crop_a - crop_b) + return op.emitOpError() + << "requires output " << dim_name << " (dimension " + << spatial_dim_index << ") to be equal to input " << dim_name + << " (dimension " << spatial_dim_index << ") * block_size - " + << crop_a_name << " - " << crop_b_name << ", but got output " + << dim_name << " " << output_dim << ", input " << dim_name + << " " << input_dim << ", " << crop_a_name << " " << crop_a + << ", " << crop_b_name << " " << crop_b << ", and block_size " + << block_size; + } + + return success(); + }; + + if (failed(check_spatial_dim(1, "height", "crop_top", "crop_bottom")) || + failed(check_spatial_dim(2, "width", "crop_left", "crop_right"))) + return failure(); + + int64_t input_depth = input_shape[3]; + int64_t output_depth = output_shape[3]; + if (static_dims(input_depth, output_depth) && output_depth != input_depth) + return op.emitOpError() + << "requires output depth (dimension 3) to be equal to input " + "depth (dimension 3), but got output depth " + << output_depth << " and input depth " << input_depth; + } + + return success(); +} + //===----------------------------------------------------------------------===// // BiasAddOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 058585b41d7..4ba2e83300b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -2872,4 +2872,141 @@ func @testSendTPUEmbeddingGradients(%x: tensor<512x256xf32>) { return } +// ----- +//===--------------------------------------------------------------------===// +// tf.BatchToSpace +//===--------------------------------------------------------------------===// + +func @testBatchToSpaceDynamic(%arg0: tensor<*xf32>, %arg1: tensor<*xi32>) { + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor<*xi32>) -> tensor<*xf32> + return +} + +func @testBatchToSpaceRankedInput(%arg0: tensor, %arg1: tensor<*xi32>) { + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor, tensor<*xi32>) -> tensor<*xf32> + return +} + +func @testBatchToSpaceRankedCrops(%arg0: tensor<*xf32>, %arg1: tensor) { + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor) -> tensor<*xf32> + return +} + +func @testBatchToSpaceRankedOutput(%arg0: tensor<*xf32>, %arg1: tensor<*xi32>) { + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor<*xi32>) -> tensor + return +} + +func @testBatchToSpaceStatic(%arg0: tensor<36x8x8x8xf32>) { + %crops = "tf.Const"() {value = dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> + %0 = "tf.BatchToSpace"(%arg0, %crops) {block_size = 3 : i64} : (tensor<36x8x8x8xf32>, 
tensor<2x2xi32>) -> tensor<4x21x17x8xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidInputRank(%arg0: tensor<8xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires input to be a 4D tensor, but got 'tensor<8xf32>'}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<8xf32>, tensor<*xi32>) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidInputBatch(%arg0: tensor<2x4x6x8xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires input batch (dimension 0) to be evenly divisible by (block_size * block_size), but got input batch 2 and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<2x4x6x8xf32>, tensor<*xi32>) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidCropsRank(%arg0: tensor<*xf32>, %arg1: tensor) { + // expected-error @+1 {{'tf.BatchToSpace' op requires crops to be a 2D tensor, but got 'tensor'}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidCropsFirstDim(%arg0: tensor<*xf32>, %arg1: tensor<3x?xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires crops to be a tensor<2x2>, but got 'tensor<3x?xi32>'}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor<3x?xi32>) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidCropsSecondDim(%arg0: tensor<*xf32>, %arg1: tensor) { + // expected-error @+1 {{'tf.BatchToSpace' op requires crops to be a tensor<2x2>, but got 'tensor'}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceBadCropValues(%arg0: tensor<*xf32>) { + %crops = "tf.Const"() {value = dense<[[-1, -2], [-3, -4]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> + // expected-error @+1 {{'tf.BatchToSpace' op requires all crop values to be nonnegative, but got dense<[[-1, -2], [-3, -4]]> : tensor<2x2xi32>}} + %0 = "tf.BatchToSpace"(%arg0, %crops) {block_size = 2 : i64} : (tensor<*xf32>, tensor<2x2xi32>) -> tensor<*xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputRank(%arg0: tensor<*xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires output to be a 4D tensor, but got 'tensor<8xf32>'}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<*xf32>, tensor<*xi32>) -> tensor<8xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputBatch(%arg0: tensor<16x8x8x3xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires output batch (dimension 0) to be equal to input batch (dimension 0) / (block_size * block_size), but got output batch 8, input batch 16, and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<*xi32>) -> tensor<8x8x8x3xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputHeight(%arg0: tensor<16x8x8x3xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires output height (dimension 1) to be less than or equal to input height (dimension 1) * block_size, but got output height 17, input height 8, and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<*xi32>) -> tensor<4x17x8x3xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputHeightCrops(%arg0: 
tensor<16x8x8x3xf32>) { + %crops = "tf.Const"() {value = dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> + // expected-error @+1 {{'tf.BatchToSpace' op requires output height (dimension 1) to be equal to input height (dimension 1) * block_size - crop_top - crop_bottom, but got output height 8, input height 8, crop_top 1, crop_bottom 2, and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %crops) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<2x2xi32>) -> tensor<4x8x9x3xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputWidth(%arg0: tensor<16x4x4x3xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires output width (dimension 2) to be less than or equal to input width (dimension 2) * block_size, but got output width 9, input width 4, and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x4x4x3xf32>, tensor<*xi32>) -> tensor<4x4x9x3xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputWidthCrops(%arg0: tensor<16x8x8x3xf32>) { + %crops = "tf.Const"() {value = dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> + // expected-error @+1 {{'tf.BatchToSpace' op requires output width (dimension 2) to be equal to input width (dimension 2) * block_size - crop_left - crop_right, but got output width 8, input width 8, crop_left 3, crop_right 4, and block_size 2}} + %0 = "tf.BatchToSpace"(%arg0, %crops) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<2x2xi32>) -> tensor<4x13x8x3xf32> + return +} + +// ----- + +func @testBatchToSpaceInvalidOutputDepth(%arg0: tensor<16x8x8x3xf32>, %arg1: tensor<*xi32>) { + // expected-error @+1 {{'tf.BatchToSpace' op requires output depth (dimension 3) to be equal to input depth (dimension 3), but got output depth 8 and input depth 3}} + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<*xi32>) -> tensor<4x8x8x8xf32> + return +} From 68e3ce2fe773f1731f40ce69da7585ebd3257b3d Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Tue, 16 Jun 2020 23:19:28 +0700 Subject: [PATCH 0292/1390] Add GetTempFileName --- tensorflow/c/env.cc | 10 ++++++++++ tensorflow/c/env.h | 6 ++++++ tensorflow/core/platform/path.cc | 2 ++ 3 files changed, 18 insertions(+) diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 1c35ff9001d..3d490d95e66 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -146,6 +146,16 @@ TF_StringStream* TF_GetLocalTempDirectories() { return list; } +void TF_GetTempFileName(const char* extension, std::string* name, + TF_Status* status) { + *name = ::tensorflow::Env::Default()->GetTempFilename(extension); + if (*name.length() == 0) { + TF_SetStatus(status, TF_INTERNAL, "Can not create temp file name"); + } else { + TF_SetStatus(status, TF_OK, ""); + } +} + TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) { return ::tensorflow::Env::Default()->NowNanos(); } diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index 2a763730bc3..b50d0fdec03 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -152,6 +152,12 @@ TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename, // The caller is responsible for freeing the list (see TF_StringStreamDone). TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void); +// Creates a temporary file name with an extension. +// The caller is responsible for freeing the returned pointer. 
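+// Rough calling sketch (error handling elided; `status` is a TF_Status the
+// caller owns):
+//   std::string name;
+//   TF_GetTempFileName(".txt", &name, status);
+//   if (TF_GetCode(status) == TF_OK) { /* use `name` */ }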
+TF_CAPI_EXPORT extern void TF_GetTempFileName(const char* extension, + std::string* name, + TF_Status* status); + // Returns the number of nanoseconds since the Unix epoch. TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void); diff --git a/tensorflow/core/platform/path.cc b/tensorflow/core/platform/path.cc index 1e88328aace..f9442ccba0f 100644 --- a/tensorflow/core/platform/path.cc +++ b/tensorflow/core/platform/path.cc @@ -327,6 +327,8 @@ string GetTempFilename(const string& extension) { } LOG(FATAL) << "No temp directory found."; #endif + // Return an empty string to indicate that we can not create temp file name. + return ""; } bool GetTestUndeclaredOutputsDir(string* dir) { From 12ec80d2395da75c18db5b8371c68f07889dba11 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 16 Jun 2020 09:18:02 -0700 Subject: [PATCH 0293/1390] Move the module test wrt to Keras private API to keras test. PiperOrigin-RevId: 316695477 Change-Id: I4201f62e97bb1acb768b980a51d26bf0c76853ab --- tensorflow/python/keras/layers/BUILD | 12 ++++++ tensorflow/python/keras/layers/layers_test.py | 40 +++++++++++++++++++ tensorflow/tools/api/tests/module_test.py | 12 ------ 3 files changed, 52 insertions(+), 12 deletions(-) create mode 100644 tensorflow/python/keras/layers/layers_test.py diff --git a/tensorflow/python/keras/layers/BUILD b/tensorflow/python/keras/layers/BUILD index 91bace936d3..f67838d591a 100644 --- a/tensorflow/python/keras/layers/BUILD +++ b/tensorflow/python/keras/layers/BUILD @@ -886,3 +886,15 @@ tf_py_test( "@absl_py//absl/testing:parameterized", ], ) + +tf_py_test( + name = "layers_test", + size = "small", + srcs = ["layers_test.py"], + python_version = "PY3", + deps = [ + ":layers", + "//tensorflow/python:client_testlib", + "//tensorflow/python:tf2", + ], +) diff --git a/tensorflow/python/keras/layers/layers_test.py b/tensorflow/python/keras/layers/layers_test.py new file mode 100644 index 00000000000..35ba029dd83 --- /dev/null +++ b/tensorflow/python/keras/layers/layers_test.py @@ -0,0 +1,40 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# pylint: disable=g-classes-have-attributes +"""Tests for layers.__init__.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python import tf2 +from tensorflow.python.keras import layers +from tensorflow.python.platform import test + + +class LayersTest(test.TestCase): + + def test_keras_private_symbol(self): + normalization_parent = layers.Normalization.__module__.split('.')[-1] + if tf2.enabled(): + self.assertEqual('normalization', normalization_parent) + self.assertTrue(layers.BatchNormalization._USE_V2_BEHAVIOR) + else: + self.assertEqual('normalization_v1', normalization_parent) + self.assertFalse(layers.BatchNormalization._USE_V2_BEHAVIOR) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/tools/api/tests/module_test.py b/tensorflow/tools/api/tests/module_test.py index aa8a224d00b..5397278f5f3 100644 --- a/tensorflow/tools/api/tests/module_test.py +++ b/tensorflow/tools/api/tests/module_test.py @@ -79,18 +79,6 @@ class ModuleTest(test.TestCase): tf.compat.v1.summary.FileWriter # pylint: enable=pointless-statement - def testInternalKerasImport(self): - # pylint: disable=g-import-not-at-top - from tensorflow.python.keras import layers - normalization_parent = layers.Normalization.__module__.split('.')[-1] - if tf._major_api_version == 2: - self.assertEqual('normalization', normalization_parent) - self.assertTrue(layers.BatchNormalization._USE_V2_BEHAVIOR) - else: - self.assertEqual('normalization_v1', normalization_parent) - self.assertFalse(layers.BatchNormalization._USE_V2_BEHAVIOR) - # pylint: enable=g-import-not-at-top - if __name__ == '__main__': test.main() From cf599cade160e40d9f7051afebe89672f60e7137 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 16 Jun 2020 09:30:51 -0700 Subject: [PATCH 0294/1390] Move the serialization_test to keras/tests PiperOrigin-RevId: 316697481 Change-Id: I918b29a976b166662acf9045f87512aef485441b --- tensorflow/python/keras/tests/BUILD | 15 +++++ .../keras/tests/serialization_util_test.py | 67 +++++++++++++++++++ tensorflow/python/util/serialization_test.py | 36 ---------- 3 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 tensorflow/python/keras/tests/serialization_util_test.py diff --git a/tensorflow/python/keras/tests/BUILD b/tensorflow/python/keras/tests/BUILD index 36af32184e6..b01fc3ca903 100644 --- a/tensorflow/python/keras/tests/BUILD +++ b/tensorflow/python/keras/tests/BUILD @@ -359,6 +359,21 @@ cuda_py_test( ], ) +tf_py_test( + name = "serialization_util_test", + size = "small", + srcs = ["serialization_util_test.py"], + python_version = "PY3", + deps = [ + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:util", + "//tensorflow/python/keras/engine", + "//tensorflow/python/keras/layers:core", + ], +) + tf_py_test( name = "temporal_sample_weights_correctness_test", srcs = ["temporal_sample_weights_correctness_test.py"], diff --git a/tensorflow/python/keras/tests/serialization_util_test.py b/tensorflow/python/keras/tests/serialization_util_test.py new file mode 100644 index 00000000000..058bdaec56c --- /dev/null +++ b/tensorflow/python/keras/tests/serialization_util_test.py @@ -0,0 +1,67 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for serialization functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import test_util +from tensorflow.python.keras.engine import input_layer +from tensorflow.python.keras.engine import sequential +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core +from tensorflow.python.platform import test +from tensorflow.python.util import serialization + + +class SerializationTests(test.TestCase): + + def test_serialize_dense(self): + dense = core.Dense(3) + dense(constant_op.constant([[4.]])) + round_trip = json.loads(json.dumps( + dense, default=serialization.get_json_type)) + self.assertEqual(3, round_trip["config"]["units"]) + + @test_util.run_in_graph_and_eager_modes + def test_serialize_sequential(self): + model = sequential.Sequential() + model.add(core.Dense(4)) + model.add(core.Dense(5)) + model(constant_op.constant([[1.]])) + sequential_round_trip = json.loads( + json.dumps(model, default=serialization.get_json_type)) + self.assertEqual( + # Note that `config['layers'][0]` will be an InputLayer in V2 + # (but not in V1) + 5, sequential_round_trip["config"]["layers"][-1]["config"]["units"]) + + @test_util.run_in_graph_and_eager_modes + def test_serialize_model(self): + x = input_layer.Input(shape=[3]) + y = core.Dense(10)(x) + model = training.Model(x, y) + model(constant_op.constant([[1., 1., 1.]])) + model_round_trip = json.loads( + json.dumps(model, default=serialization.get_json_type)) + self.assertEqual( + 10, model_round_trip["config"]["layers"][1]["config"]["units"]) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/util/serialization_test.py b/tensorflow/python/util/serialization_test.py index a66dd11ba99..fea2e0feb6e 100644 --- a/tensorflow/python/util/serialization_test.py +++ b/tensorflow/python/util/serialization_test.py @@ -20,26 +20,13 @@ from __future__ import print_function import json -from tensorflow.python.framework import constant_op from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util -from tensorflow.python.keras.engine import input_layer -from tensorflow.python.keras.engine import sequential -from tensorflow.python.keras.engine import training -from tensorflow.python.keras.layers import core from tensorflow.python.platform import test from tensorflow.python.util import serialization class SerializationTests(test.TestCase): - def test_serialize_dense(self): - dense = core.Dense(3) - dense(constant_op.constant([[4.]])) - round_trip = json.loads(json.dumps( - dense, default=serialization.get_json_type)) - self.assertEqual(3, round_trip["config"]["units"]) - def test_serialize_shape(self): round_trip = json.loads(json.dumps( 
tensor_shape.TensorShape([None, 2, 3]), @@ -47,29 +34,6 @@ class SerializationTests(test.TestCase): self.assertIs(round_trip[0], None) self.assertEqual(round_trip[1], 2) - @test_util.run_in_graph_and_eager_modes - def test_serialize_sequential(self): - model = sequential.Sequential() - model.add(core.Dense(4)) - model.add(core.Dense(5)) - model(constant_op.constant([[1.]])) - sequential_round_trip = json.loads( - json.dumps(model, default=serialization.get_json_type)) - self.assertEqual( - # Note that `config['layers'][0]` will be an InputLayer in V2 - # (but not in V1) - 5, sequential_round_trip["config"]["layers"][-1]["config"]["units"]) - - @test_util.run_in_graph_and_eager_modes - def test_serialize_model(self): - x = input_layer.Input(shape=[3]) - y = core.Dense(10)(x) - model = training.Model(x, y) - model(constant_op.constant([[1., 1., 1.]])) - model_round_trip = json.loads( - json.dumps(model, default=serialization.get_json_type)) - self.assertEqual( - 10, model_round_trip["config"]["layers"][1]["config"]["units"]) if __name__ == "__main__": test.main() From e178928ea82c020c234cabf0ea672ada29a04292 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jun 2020 09:33:22 -0700 Subject: [PATCH 0295/1390] Rollback of breaking aa99cf218c8bf13aeb15e64ec4c62ea14ecb5753 PiperOrigin-RevId: 316697856 Change-Id: I1ea9a700a1b6d6d4bf824b3d810e4a61127da912 --- tensorflow/lite/delegates/flex/delegate.cc | 7 --- tensorflow/lite/interpreter_builder.cc | 17 ------ tensorflow/lite/python/BUILD | 3 +- tensorflow/lite/python/lite_flex_test.py | 61 +++++++++------------- tensorflow/python/BUILD | 1 - 5 files changed, 27 insertions(+), 62 deletions(-) diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc index b8b0d4e6d01..4741bddc2f5 100644 --- a/tensorflow/lite/delegates/flex/delegate.cc +++ b/tensorflow/lite/delegates/flex/delegate.cc @@ -136,10 +136,3 @@ TfLiteStatus FlexDelegate::CopyFromBufferHandle( } } // namespace tflite - -// Exported C interface function which is used by AcquireFlexDelegate() at -// interpreter_build.cc. To export the function name globally, the function name -// must be matched with patterns in tf_version_script.lds -extern "C" tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() { - return tflite::AcquireFlexDelegate(); -} diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index d73b298e595..43d81ef0770 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -14,9 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/interpreter_builder.h" -#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) -#include -#endif #include #include #include @@ -117,20 +114,6 @@ const char* kEmptyTensorName = ""; // For flex delegate, see also the strong override in // lite/delegates/flex/delegate.cc. TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() { -#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) - // If _pywrap_tensorflow_internal.so is available, use - // TF_AcquireFlexDelegate() to initialize flex delegate. 
- void* lib_tf_internal = - dlopen("_pywrap_tensorflow_internal.so", RTLD_NOW | RTLD_LOCAL); - if (lib_tf_internal) { - auto TF_AcquireFlexDelegate = - reinterpret_cast( - dlsym(lib_tf_internal, "TF_AcquireFlexDelegate")); - if (TF_AcquireFlexDelegate) { - return TF_AcquireFlexDelegate(); - } - } -#endif return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); } diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index b0f605ed50d..c1f37c81b7f 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -194,7 +194,8 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - "no_mac", # TODO(b/159077703): Enable Python API Flex support on MacOS. + # TODO(b/111881877): Enable in oss after resolving op registry issues. + "no_oss", "no_windows", ], deps = [ diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py index ffc157c2128..26bee206d27 100644 --- a/tensorflow/lite/python/lite_flex_test.py +++ b/tensorflow/lite/python/lite_flex_test.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function from absl.testing import parameterized -import numpy as np from tensorflow.lite.python import lite from tensorflow.lite.python.interpreter import Interpreter @@ -42,7 +41,8 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): ('DisableMlirConverter', False)) # disable mlir def testFlexMode(self, enable_mlir): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -54,22 +54,19 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Check the model works with TensorFlow ops. + # Ensures the model contains TensorFlow ops. + # TODO(nupurgarg): Check values once there is a Python delegate interface. interpreter = Interpreter(model_content=tflite_model) - interpreter.allocate_tensors() - input_details = interpreter.get_input_details() - test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) - interpreter.set_tensor(input_details[0]['index'], test_input) - interpreter.invoke() - - output_details = interpreter.get_output_details() - expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) - output_data = interpreter.get_tensor(output_details[0]['index']) - self.assertTrue((expected_output == output_data).all()) + with self.assertRaises(RuntimeError) as error: + interpreter.allocate_tensors() + self.assertIn( + 'Regular TensorFlow ops are not supported by this interpreter.', + str(error.exception)) def testDeprecatedFlags(self): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -86,18 +83,14 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Check the model works with TensorFlow ops. + # Ensures the model contains TensorFlow ops. + # TODO(nupurgarg): Check values once there is a Python delegate interface. 
interpreter = Interpreter(model_content=tflite_model) - interpreter.allocate_tensors() - input_details = interpreter.get_input_details() - test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) - interpreter.set_tensor(input_details[0]['index'], test_input) - interpreter.invoke() - - output_details = interpreter.get_output_details() - expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) - output_data = interpreter.get_tensor(output_details[0]['index']) - self.assertTrue((expected_output == output_data).all()) + with self.assertRaises(RuntimeError) as error: + interpreter.allocate_tensors() + self.assertIn( + 'Regular TensorFlow ops are not supported by this interpreter.', + str(error.exception)) class FromConcreteFunctionTest(test_util.TensorFlowTestCase, @@ -121,18 +114,14 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase, converter.experimental_new_converter = enable_mlir tflite_model = converter.convert() - # Check the model works with TensorFlow ops. + # Ensures the model contains TensorFlow ops. + # TODO(nupurgarg): Check values once there is a Python delegate interface. interpreter = Interpreter(model_content=tflite_model) - interpreter.allocate_tensors() - input_details = interpreter.get_input_details() - test_input = np.array([4.0], dtype=np.float32) - interpreter.set_tensor(input_details[0]['index'], test_input) - interpreter.invoke() - - output_details = interpreter.get_output_details() - expected_output = np.array([24.0], dtype=np.float32) - output_data = interpreter.get_tensor(output_details[0]['index']) - self.assertTrue((expected_output == output_data).all()) + with self.assertRaises(RuntimeError) as error: + interpreter.allocate_tensors() + self.assertIn( + 'Regular TensorFlow ops are not supported by this interpreter.', + str(error.exception)) if __name__ == '__main__': diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 343a95b85e9..87048ba9d40 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -6046,7 +6046,6 @@ pywrap_tensorflow_macro( "//tensorflow/core/profiler/internal:print_model_analysis", "//tensorflow/core/profiler/internal/cpu:python_tracer", "//tensorflow/tools/graph_transforms:transform_graph_lib", - "//tensorflow/lite/delegates/flex:delegate", "//tensorflow/lite/toco/python:toco_python_api", "//tensorflow/python/eager:pywrap_tfe_lib", "//tensorflow/core/util/tensor_bundle", From 9ee6864c2c1acc356fcd50c56d1d1a6ae38392a7 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 16 Jun 2020 09:56:08 -0700 Subject: [PATCH 0296/1390] Minor doc update to use TfLiteIntArray better PiperOrigin-RevId: 316701561 Change-Id: If14f772f0053eb4ecb0bc225cc11e93e0f07adb0 --- tensorflow/lite/g3doc/performance/delegates.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/delegates.md b/tensorflow/lite/g3doc/performance/delegates.md index 1d11a8eb391..760e7273fc4 100644 --- a/tensorflow/lite/g3doc/performance/delegates.md +++ b/tensorflow/lite/g3doc/performance/delegates.md @@ -85,6 +85,8 @@ To see it in code, let's define a delegate and call it `MyDelegate`, which can execute Conv2D and Mean operations faster. ```c++ +#include "tensorflow/lite/util.h" + // This is where the execution of the operations or whole graph happens. // The class below has an empty implementation just as a guideline // on the structure. 
@@ -156,8 +158,7 @@ TfLiteRegistration GetMyDelegateNodeRegistration() { TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { // Claim all nodes that can be evaluated by the delegate and ask the // framework to update the graph with delegate kernel instead. - // Reserve 1 element, since we need first element to be size. - std::vector supported_nodes(1); + std::vector supported_nodes; TfLiteIntArray* plan; TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan)); TfLiteNode* node; @@ -169,17 +170,19 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { supported_nodes.push_back(node_index); } } - // Set first element to the number of nodes to replace. - supported_nodes[0] = supported_nodes.size() - 1; TfLiteRegistration my_delegate_kernel_registration = GetMyDelegateNodeRegistration(); // This call split the graphs into subgraphs, for subgraphs that can be // handled by the delegate, it will replace it with a // 'my_delegate_kernel_registration' - return context->ReplaceNodeSubsetsWithDelegateKernels( + TfLiteIntArray* supported_nodes_int_array = + ::tflite::ConvertVectorToTfLiteIntArray(supported_nodes); + auto status = context->ReplaceNodeSubsetsWithDelegateKernels( context, my_delegate_kernel_registration, - reinterpret_cast(supported_nodes.data()), delegate); + supported_nodes_int_array, delegate); + TfLiteIntArrayFree(supported_nodes_int_array); + return status } void FreeBufferHandle(TfLiteContext* context, TfLiteDelegate* delegate, From f6b8e93a5f614c962db73d4409f2b89026fbf4da Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 16 Jun 2020 10:09:39 -0700 Subject: [PATCH 0297/1390] A refactor to add helper functions {Split, Concat}, which do Tensor dtype deduction automatically. PiperOrigin-RevId: 316703807 Change-Id: I234bea9fce3cf3b2cb352be12246ee9f4e8c405a --- tensorflow/core/kernels/batch_kernels.cc | 73 ++++++++++++++---------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index e94aef641e3..6449a399573 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -100,9 +100,9 @@ typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL // Concatenates 'inputs' into a single tensor along the zeroth dimension. -// Requires that all elements of 'inputs' have element type T. Writes to the -// op's output at position 'output_index', using 'context' for the allocation to -// ensure proper device placement. +// Requires that all elements of 'inputs' have element type T. Writes to +// 'output' using 'context' for the allocation to ensure proper device +// placement. template Status Concat(OpKernelContext* context, const gtl::ArraySlice inputs, Tensor* output) { @@ -157,6 +157,25 @@ Status Concat(OpKernelContext* context, const gtl::ArraySlice inputs, return Status::OK(); } +// Same as 'Concat' above, but handles Tensor dtype deduction automatically. 
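+// For example, the per-dtype switch at the BatchResource call site below
+// collapses into:
+//   Tensor concatenated_tensor;
+//   TF_RETURN_IF_ERROR(Concat(context, to_concatenate, &concatenated_tensor));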
+Status Concat(OpKernelContext* context, const gtl::ArraySlice inputs, + Tensor* output) { + const DataType type = inputs[0].dtype(); + Status concat_status; + switch (type) { +#define CASE(type) \ + case DataTypeToEnum::value: \ + concat_status = Concat(context, inputs, output); \ + break; + TF_CALL_ALL_TYPES(CASE); +#undef CASE + default: + concat_status = errors::InvalidArgument("Unsupported data type: ", type); + break; + } + return concat_status; +} + // The Split*() functions split 'input' with element type T into 'sizes.size()' // tensors along the zeroth dimension, with the ith split having zeroth- // dimension size 'sizes[i]'. They allocate the output tensors using 'context', @@ -268,6 +287,25 @@ Status Split(OpKernelContext* context, const Tensor& input, return SplitCPU(context, input, sizes, outputs); } +// Same as 'Split' above, but handles Tensor dtype automatically. +Status Split(OpKernelContext* context, const Tensor& input, + const gtl::ArraySlice sizes, std::vector* outputs) { + const DataType type = input.dtype(); + Status split_status; + switch (type) { +#define CASE(type) \ + case DataTypeToEnum::value: \ + split_status = Split(context, input, sizes, outputs); \ + break; + TF_CALL_ALL_TYPES(CASE); +#undef CASE + default: + split_status = errors::InvalidArgument("Unsupported data type: ", type); + break; + } + return split_status; +} + // A class encapsulating the state and logic for batching tensors. class BatchResource : public ResourceBase { public: @@ -449,22 +487,9 @@ class BatchResource : public ResourceBase { } } - const DataType type = to_concatenate[0].dtype(); - Status concat_status; Tensor concatenated_tensor; - switch (type) { -#define CASE(type) \ - case DataTypeToEnum::value: \ - concat_status = \ - Concat(context, to_concatenate, &concatenated_tensor); \ - break; - TF_CALL_ALL_TYPES(CASE); -#undef CASE - default: - concat_status = - errors::InvalidArgument("Unsupported data type: ", type); - break; - } + Status concat_status = + Concat(context, to_concatenate, &concatenated_tensor); TF_RETURN_IF_ERROR(concat_status); concatenated_tensors->push_back(concatenated_tensor); } @@ -1001,17 +1026,7 @@ class UnbatchResource : public ResourceBase { batch_keys.push_back(batch_indices(i, 0)); } - const DataType type = data_t.dtype(); - switch (type) { -#define CASE(type) \ - case DataTypeToEnum::value: \ - TF_RETURN_IF_ERROR(Split(context, data_t, sizes, &split_inputs)); \ - break; - TF_CALL_ALL_TYPES(CASE); -#undef CASE - default: - return errors::InvalidArgument("Unsupported data type: ", type); - } + TF_RETURN_IF_ERROR(Split(context, data_t, sizes, &split_inputs)); } // Critical section. From 57c533a79af76f65af04f4ea5d0b30e777a31b66 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 10:24:58 -0700 Subject: [PATCH 0298/1390] Winograd back conversion updated to new style. 
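Roughly, "new style" here means the Arguments-based kernel interface: tensors
and scalar parameters are registered on the shared `args_` object and bound by
name, rather than through manual `kernel_.SetMemoryAuto`/`SetBytesAuto` calls.
A minimal sketch of the pattern, with names taken from the diff below (not a
complete kernel):

    // Code generation: declare the objects and scalars the kernel uses.
    args->AddObjectRef("src_tensor", AccessType::READ, /*descriptor=*/...);
    args->AddInt("tiles_x");
    // Dispatch: bind the actual tensors/values and hand them to the kernel.
    RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
    RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x));
    return args_.Bind(kernel_.kernel());
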
PiperOrigin-RevId: 316707136 Change-Id: I710af1fd5748f2d8ca371e870e150d79a39af0ce --- .../lite/delegates/gpu/cl/kernels/winograd.cc | 131 +++++++----------- .../lite/delegates/gpu/cl/kernels/winograd.h | 3 - 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index d71513e4de4..2dcb72637ec 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -213,20 +213,8 @@ std::string GetWinograd4x4To36Code( return c; } -std::string GetWinograd36To4x4Code( - const OperationDef& op_def, const LinearStorage& at_arr, - const LinearStorage& biases, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); - - const std::string batch_id = op_def.IsBatchSupported() ? "batch_id" : ""; +std::string GetWinograd36To4x4Code(const OperationDef& op_def, + Arguments* args) { std::string c = GetCommonDefines(op_def.precision); switch (op_def.precision) { @@ -243,6 +231,15 @@ std::string GetWinograd36To4x4Code( ? DataType::FLOAT16 : DataType::FLOAT32; + std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float"; + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetStateVar("ACCUM_FLT", cl_type); + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + args->AddInt("tiles_x"); + auto at_mat = AtMatrixForWinograd4x4To6x6(); c += "constant ACCUM_FLT At[24] = {\n"; for (int y = 0; y < 4; ++y) { @@ -255,30 +252,21 @@ std::string GetWinograd36To4x4Code( c += "};\n"; c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += at_arr.GetDeclaration() + ",\n"; - c += biases.GetDeclaration(); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int tiles_x \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int tile_id = get_global_id(0);\n"; c += " int DST_Y = get_global_id(1);\n"; c += " int DST_Z = get_global_id(2);\n"; - c += " int tile_x = (tile_id % tiles_x) * 4;\n"; - c += " int tile_y = (tile_id / tiles_x) * 4 + DST_Y;\n"; - c += " if (tile_x >= dst_size.x || tile_y >= dst_size.y || DST_Z >= " - "dst_size.z) {\n"; + c += " int tile_x = (tile_id % args.tiles_x) * 4;\n"; + c += " int tile_y = (tile_id / args.tiles_x) * 4 + DST_Y;\n"; + + c += " if (tile_x >= args.dst_tensor.Width() || tile_y >= " + "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Slices()) {\n"; c += " return; \n"; c += " }\n"; c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n"; c += " ACCUM_FLT at_ar[6];\n"; - c += " ACCUM_FLT4 t00 = TO_ACCUM_TYPE(" + - at_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n"; - c += " ACCUM_FLT4 t01 = TO_ACCUM_TYPE(" + - at_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n"; + c += " ACCUM_FLT4 t00 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 0));\n"; + c += " ACCUM_FLT4 t01 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 1));\n"; c += " at_ar[0] = t00.x;\n"; c += " at_ar[1] = t00.y;\n"; c += " at_ar[2] = t00.z;\n"; @@ -290,10 +278,8 @@ std::string GetWinograd36To4x4Code( for 
(int x = 0; x < 6; ++x) { const std::string yc = std::to_string(x); const std::string src = "src" + std::to_string(x); - c += " ACCUM_FLT4 " + src + " = " + - src_tensor.ReadAsTypeWHSB(accum_type, "tile_id", yc, "DST_Z", - batch_id) + - ";\n"; + c += " ACCUM_FLT4 " + src + + " = args.src_tensor.Read(tile_id, " + yc + ", DST_Z);\n"; c += " I" + std::to_string(x) + " = at * " + src + ";\n"; } c += " }\n"; @@ -303,46 +289,35 @@ std::string GetWinograd36To4x4Code( for (int x = 0; x < 6; ++x) { const std::string yc = std::to_string(y * 6 + x); const std::string src = "src" + std::to_string(x); - c += " ACCUM_FLT4 " + src + " = " + - src_tensor.ReadAsTypeWHSB(accum_type, "tile_id", yc, "DST_Z", - batch_id) + - ";\n"; + c += " ACCUM_FLT4 " + src + + " = args.src_tensor.Read(tile_id, " + yc + ", DST_Z);\n"; c += " I" + std::to_string(x) + " += at * " + src + ";\n"; } c += " }\n"; } c += " ACCUM_FLT4 t0 = I1 + I2;\n"; c += " ACCUM_FLT4 t1 = I3 + I4;\n"; - c += " FLT4 bias_val = " + biases.ReadLinearFLT4("DST_Z") + ";\n"; + c += " FLT4 bias_val = args.biases.Read(DST_Z);\n"; c += " {\n"; - const LinkingContext context{"r0", "tile_x", "tile_y", "DST_Z"}; c += " FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n"; - c += PostProcess(linked_operations, context); - c += " " + - dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); + c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n"; c += " tile_x++;\n"; c += " }\n"; c += " ACCUM_FLT4 t2 = I1 - I2;\n"; c += " ACCUM_FLT4 t3 = I3 - I4;\n"; - c += " if (tile_x < dst_size.x) {\n"; + c += " if (tile_x < args.dst_tensor.Width()) {\n"; c += " FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n"; - c += PostProcess(linked_operations, context); - c += " " + - dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); + c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n"; c += " tile_x++;\n"; c += " }\n"; - c += " if (tile_x < dst_size.x) {\n"; + c += " if (tile_x < args.dst_tensor.Width()) {\n"; c += " FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n"; - c += PostProcess(linked_operations, context); - c += " " + - dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); + c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n"; c += " tile_x++;\n"; c += " }\n"; - c += " if (tile_x < dst_size.x) {\n"; + c += " if (tile_x < args.dst_tensor.Width()) {\n"; c += " FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n"; - c += PostProcess(linked_operations, context); - c += " " + - dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); + c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n"; c += " tile_x++;\n"; c += " }\n"; c += "}\n"; @@ -406,7 +381,6 @@ absl::Status Winograd4x4To36::UploadBt(CLContext* context) { LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition_.GetDataType(); - create_info.name = "bt_arr"; LinearStorage lt; RETURN_IF_ERROR(CreateLinearStorage(create_info, bt_aligned, context, <)); @@ -473,15 +447,11 @@ absl::Status CreateWinograd4x4To36(const CreationContext& creation_context, Winograd36To4x4::Winograd36To4x4(Winograd36To4x4&& operation) : GPUOperation(std::move(operation)), - at_(std::move(operation.at_)), - biases_(std::move(operation.biases_)), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_) {} Winograd36To4x4& Winograd36To4x4::operator=(Winograd36To4x4&& operation) { if (this != &operation) { - at_ = std::move(operation.at_); - biases_ = 
std::move(operation.biases_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); GPUOperation::operator=(std::move(operation)); @@ -495,8 +465,13 @@ absl::Status Winograd36To4x4::Compile(const CreationContext& creation_context) { creation_context.device->IsPowerVR()) { options.push_back(CompilerOptions::POWERVR_FP16); } - const auto code = - GetWinograd36To4x4Code(definition_, at_, biases_, linked_operations_); + std::string code = GetWinograd36To4x4Code(definition_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code, "main_function", options, *creation_context.context, *creation_context.device, &kernel_)); @@ -520,8 +495,11 @@ absl::Status Winograd36To4x4::UploadAt(CLContext* context) { LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition_.GetDataType(); - create_info.name = "at_arr"; - return CreateLinearStorage(create_info, at_aligned, context, &at_); + LinearStorage lt; + RETURN_IF_ERROR(CreateLinearStorage(create_info, at_aligned, context, <)); + args_.AddObject("at", AccessType::READ, + absl::make_unique(std::move(lt))); + return absl::OkStatus(); } int3 Winograd36To4x4::SelectBestWorkGroup() { @@ -532,17 +510,12 @@ int3 Winograd36To4x4::SelectBestWorkGroup() { } absl::Status Winograd36To4x4::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(at_.GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4); - RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x)); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x)); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Winograd36To4x4::GetGridSize() const { @@ -580,9 +553,11 @@ absl::Status CreateWinograd36To4x4( LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition.GetDataType(); - create_info.name = "biases"; - RETURN_IF_ERROR(CreateLinearStorage( - create_info, biases, creation_context.context, &result->biases_)); + LinearStorage lt; + RETURN_IF_ERROR( + CreateLinearStorage(create_info, biases, creation_context.context, <)); + result->args_.AddObject("biases", AccessType::READ, + absl::make_unique(std::move(lt))); return result->UploadAt(creation_context.context); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h index ec8fe22ea11..84ebd87042d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h @@ -99,9 +99,6 @@ class Winograd36To4x4 : public GPUOperation { absl::Status BindArguments(); int3 
GetGridSize() const; - LinearStorage at_; - LinearStorage biases_; - CLKernel kernel_; int3 work_group_size_ = int3(128, 1, 1); }; From 7fb05db5b0d4caad58ecb536b1ed80393dabc4a8 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Tue, 16 Jun 2020 10:27:26 -0700 Subject: [PATCH 0299/1390] Add op metadata to bitcasts inserted by ReductionLayoutNormalizer. PiperOrigin-RevId: 316707766 Change-Id: I11aa5e9c32e86d63981398605c50a73ac9149303 --- .../compiler/xla/service/gpu/reduction_layout_normalizer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc index 295ccebd442..64a1403a3f9 100644 --- a/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc +++ b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc @@ -101,6 +101,7 @@ class EnforceMinorToMajorReduceOpVisitor : public DfsHloRewriteVisitor { new_reduce_shape_layout); HloInstruction *canonical_reduce_input = reduce->parent()->AddInstruction( HloInstruction::CreateBitcast(new_operand_shape, operand)); + canonical_reduce_input->set_metadata(reduce->metadata()); VLOG(5) << "Reduction input: " << canonical_reduce_input->ToString(); std::unique_ptr new_reduce = HloInstruction::CreateReduce( From c42314ef70f9e093e53cb85d0dac19535bd677ae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 10:29:33 -0700 Subject: [PATCH 0300/1390] fix flaky test multi_process_runner_test on tensorflow msan TAP. allow more margin of errors. PiperOrigin-RevId: 316708284 Change-Id: I3ab912bd7a546bbedf5ed7825fa4b345e92880d8 --- tensorflow/python/distribute/multi_process_runner_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index f7158d1fdb4..aeba43b6b7c 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -314,8 +314,12 @@ class MultiProcessRunnerTest(test.TestCase): timeout=5) list_to_assert = cm.exception.mpr_result.stdout + # We should see 5 iterations from worker and ps, however sometime on TAP + # due to CPU throttling and slugginess of msan/asan build, this became + # flaky. Therefore we allow more margin of errors to only check the first + # 3 iterations. 
for job in ['worker', 'ps']: - for iteration in range(0, 5): + for iteration in range(0, 3): self.assertTrue( any('(logging) {}-0, i: {}'.format(job, iteration) in line for line in list_to_assert)) From 08e445e37f27c2228c499ed261e343aae15551cd Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 16 Jun 2020 10:51:13 -0700 Subject: [PATCH 0301/1390] [TF/XLA] [NFC] Simplify XlaComputationLaunchContext::PopulateInputs Try to make the logic more transparent PiperOrigin-RevId: 316713452 Change-Id: I41e8d691e6cab9c7a6b5bc40d50d660fcbe05906 --- tensorflow/compiler/jit/xla_launch_util.cc | 43 +++++++++------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 209220938ed..ec5a372875c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -198,50 +198,41 @@ void XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, const std::map& variables, int missing_ctx_input_prefix) { - se::Stream* stream = - ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Build ShapedBuffers that point directly to the Tensor buffers. arg_ptrs_ = std::vector(kernel->xla_input_shapes.size()); - // Pass remaining parameters. - const Tensor* t; + xla::TransferManager* transfer_manager = + client_->backend().transfer_manager(); for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) { int arg_num = kernel->input_mapping[i]; - DCHECK_GE(arg_num, missing_ctx_input_prefix); + CHECK_GE(arg_num, missing_ctx_input_prefix); const xla::Shape& shape = kernel->xla_input_shapes[i]; - if (variables.count(arg_num)) { - t = &(variables.at(arg_num).value); - CHECK(t); - } else { - t = &(ctx->input(arg_num - missing_ctx_input_prefix)); - } + const Tensor* t = variables.count(arg_num) + ? 
&(variables.at(arg_num).value) + : &(ctx->input(arg_num - missing_ctx_input_prefix)); + CHECK(t); if (use_multiple_streams_) { - CHECK(stream) << "Must have a stream available when using XLA tensors!"; + CHECK(ctx->op_device_context() && ctx->op_device_context()->stream()) + << "Must have a stream available when using XLA tensors!"; XlaTensor* xla_tensor = XlaTensor::FromTensor(t); CHECK(xla_tensor); - xla_tensor->WaitForDefinitionEventOnStream(stream); + xla_tensor->WaitForDefinitionEventOnStream( + ctx->op_device_context()->stream()); } - const xla::Shape on_device_shape = - client_->backend().transfer_manager()->HostShapeToDeviceShape(shape); - if (on_device_shape.IsTuple()) { - const XlaTensor* xla_tensor = XlaTensor::FromTensor(t); - CHECK(xla_tensor && xla_tensor->has_shaped_buffer()); - arg_ptrs_[i] = const_cast(&xla_tensor->shaped_buffer()); - } else { - CHECK(xla::Shape::Equal().MinorToMajorOnlyInLayout()(shape, - on_device_shape)) - << "On-device shape " - << xla::ShapeUtil::HumanStringWithLayout(on_device_shape) - << " not the same as on-host shape " - << xla::ShapeUtil::HumanStringWithLayout(shape); + if (xla::Shape::Equal().MinorToMajorOnlyInLayout()( + shape, transfer_manager->HostShapeToDeviceShape(shape))) { se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); arg_buffers_.emplace_back( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); arg_buffers_.back().set_buffer(dmem, /*index=*/{}); arg_ptrs_[i] = &arg_buffers_.back(); + } else { + const XlaTensor* xla_tensor = XlaTensor::FromTensor(t); + CHECK(xla_tensor && xla_tensor->has_shaped_buffer()); + arg_ptrs_[i] = const_cast(&xla_tensor->shaped_buffer()); } } } From e6e8d48f8b4c17a19e968ccff3c88b084d7053e3 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Tue, 16 Jun 2020 10:55:37 -0700 Subject: [PATCH 0302/1390] Improve numerics of Log1p for XLA:CPU. PiperOrigin-RevId: 316714497 Change-Id: I20bf0148a850451077e435190034418e7eb9b38c --- .../compiler/tests/special_math_test.py | 75 +++++++++++++++++++ tensorflow/compiler/tests/unary_ops_test.py | 14 ++-- .../xla/service/elemental_ir_emitter.cc | 47 +++++++++++- .../xla/service/elemental_ir_emitter.h | 4 + 4 files changed, 132 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tests/special_math_test.py b/tensorflow/compiler/tests/special_math_test.py index 3efaa6434be..246ab2a1641 100644 --- a/tensorflow/compiler/tests/special_math_test.py +++ b/tensorflow/compiler/tests/special_math_test.py @@ -61,6 +61,81 @@ def implicit_reparameterization_grad(a, x): return -gen_math_ops.igamma_grad_a(a, x) / prob +@def_function.function(experimental_compile=True) +def _log1p(x): + return math_ops.log1p(x) + + +class Log1pTest(xla_test.XLATestCase, parameterized.TestCase): + + def setUp(self): + if flags.FLAGS.vary_seed: + entropy = os.urandom(64) + if six.PY2: + answer = int(entropy.encode('hex'), 16) + else: + answer = int.from_bytes(entropy, 'big') + np.random.seed(answer % (2**32 - 1)) + super(Log1pTest, self).setUp() + + def adjust_tolerance_for_tpu(self, dtype, rtol, atol): + if self.device not in ['TPU']: + return rtol, atol + + if dtype == np.float32: + return 4e-4, 0. + return 1e-10, 0. + + def _test_range(self, low, high, dtype, rtol, atol, is_negative=False): + # Test values near zero. 
+ rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + x = np.exp(np.random.uniform( + low=low, high=high, size=[NUM_SAMPLES])).astype(dtype) + if is_negative: + x = -x + expected_values = np.log1p(x) + with self.session() as sess: + with self.test_scope(): + actual = _log1p(x) + actual = sess.run(actual) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-7, 0.), + (np.float64, 1e-15, 0.)) + def testSmallX(self, dtype, rtol, atol): + self._test_range(-40., -20., dtype, rtol, atol, is_negative=False) + self._test_range(-40., -20., dtype, rtol, atol, is_negative=True) + + @parameterized.parameters((np.float32, 1e-7, 0.), + (np.float64, 1e-15, 0.)) + def testGreaterThanNegativeTwentyExponent(self, dtype, rtol, atol): + self._test_range(-20., -10., dtype, rtol, atol, is_negative=False) + self._test_range(-20., -10., dtype, rtol, atol, is_negative=True) + + @parameterized.parameters((np.float32, 1e-7, 0.), + (np.float64, 1e-15, 0.)) + def testGreaterThanNegativeTenExponent(self, dtype, rtol, atol): + self._test_range(-10., -5., dtype, rtol, atol, is_negative=False) + self._test_range(-10., -5., dtype, rtol, atol, is_negative=True) + + @parameterized.parameters((np.float32, 2e-7, 0.), + (np.float64, 1e-15, 0.)) + def testGreaterThanNegativeFiveExponent(self, dtype, rtol, atol): + self._test_range(-5., -1., dtype, rtol, atol, is_negative=False) + self._test_range(-5., -1., dtype, rtol, atol, is_negative=True) + + @parameterized.parameters((np.float32, 4e-7, 0.), + (np.float64, 3e-14, 0.)) + def testXGreaterThanOneTenth(self, dtype, rtol, atol): + self._test_range(-1., 0., dtype, rtol, atol, is_negative=False) + self._test_range(-1., 0., dtype, rtol, atol, is_negative=True) + + @parameterized.parameters((np.float32, 2e-7, 0.), + (np.float64, 2e-15, 0.)) + def testXGreaterThanOne(self, dtype, rtol, atol): + self._test_range(0., 3., dtype, rtol, atol, is_negative=False) + + class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): def setUp(self): diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 5b3f9465513..162693a9eb1 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -292,13 +292,17 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([[1, 2]], dtype=dtype), expected=np.array([[0.540297, -0.41614]], dtype=dtype)) + # Confirm that log1p will remain precise across a range of small values. 
self._assertOpOutputMatchesExpected( math_ops.log1p, - np.array([[1e-14, 1e-15, 0.6]], dtype=dtype), - expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], - dtype=dtype)).astype(dtype), - rtol=1e-4, - atol=1e-6) + np.array([[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype), + expected=np.log1p( + np.array( + [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype)).astype(dtype), + rtol=1e-15 if dtype == np.float64 else 1e-4, + atol=1e-15 if dtype == np.float64 else 1e-4) self._assertOpOutputMatchesExpected( math_ops.rint, diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 8cb660de46c..e4097b0c06f 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -1336,9 +1336,40 @@ StatusOr ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, // When x is large, the naive evaluation of ln(x + 1) is more // accurate than the Taylor series. TF_ASSIGN_OR_RETURN(auto for_large_x, EmitLog(prim_type, FAdd(x, one))); - // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + …. - auto for_small_x = FMul(FAdd(FMul(negative_half, x), one), x); - const auto kAntilogarithmIsSmallThreshold = 1e-4; + // When x is small, (defined to be less than sqrt(2) / 2), use a rational + // approximation. The approximation below is based on one from the Cephes + // Mathematical Library. + // + // sqrt(2) - 1. + const auto kAntilogarithmIsSmallThreshold = 0.41421356237309504880; + + static const std::array kDenominatorCoeffs{ + 1., + 1.5062909083469192043167E1, + 8.3047565967967209469434E1, + 2.2176239823732856465394E2, + 3.0909872225312059774938E2, + 2.1642788614495947685003E2, + 6.0118660497603843919306E1, + }; + + static const std::array kNumeratorCoeffs{ + 4.5270000862445199635215E-5, 4.9854102823193375972212E-1, + 6.5787325942061044846969E0, 2.9911919328553073277375E1, + 6.0949667980987787057556E1, 5.7112963590585538103336E1, + 2.0039553499201281259648E1, + }; + + auto x_squared = FMul(x, x); + TF_ASSIGN_OR_RETURN(auto denominator, + EvaluatePolynomial(type, x, kDenominatorCoeffs)); + TF_ASSIGN_OR_RETURN(auto numerator, + EvaluatePolynomial(type, x, kNumeratorCoeffs)); + auto for_small_x = FDiv(numerator, denominator); + for_small_x = FMul(FMul(x, x_squared), for_small_x); + for_small_x = FAdd(FMul(negative_half, x_squared), for_small_x); + for_small_x = FAdd(x, for_small_x); + auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); auto x_is_small = FCmpOLT( @@ -2699,4 +2730,14 @@ StatusOr ElementalIrEmitter::EmitElementalReduce( } } +// Evaluate polynomial using Horner's method. 
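+// Coefficients are given from the highest degree to the lowest, so the loop
+// computes ((c[0] * x + c[1]) * x + c[2]) * x + ... for c = coefficients.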
+StatusOr ElementalIrEmitter::EvaluatePolynomial( + llvm::Type* type, llvm::Value* x, absl::Span coefficients) { + llvm::Value* poly = llvm::ConstantFP::get(type, 0.0); + for (const double c : coefficients) { + poly = FAdd(FMul(poly, x), llvm::ConstantFP::get(type, c)); + } + return poly; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 06a9d7b194c..e39d2dd99ec 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -258,6 +258,10 @@ class ElementalIrEmitter : public IrBuilderMixin { StatusOr EmitComplexPower(const HloInstruction* op, llvm::Value* a, llvm::Value* b, llvm::Value* c, llvm::Value* d); + + // Evaluates a polynomial using Horner's method. + StatusOr EvaluatePolynomial( + llvm::Type* type, llvm::Value* x, absl::Span coefficients); }; } // namespace xla From 70075e5acc70783336c220bc484909ceadfdd9ac Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 10:56:01 -0700 Subject: [PATCH 0303/1390] StridedSlice converted to new style. PiperOrigin-RevId: 316714604 Change-Id: I5bc49b5d49704b46b5b4f3169fd679b91f60d235 --- .../delegates/gpu/cl/kernels/strided_slice.cc | 102 +++++++++--------- 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc index 19f1b185d3c..2cf65f24447 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc @@ -25,72 +25,67 @@ namespace gpu { namespace cl { namespace { -std::string GetStridedSliceCode( - const OperationDef& op_def, bool alignedx4, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); +std::string GetStridedSliceCode(const OperationDef& op_def, bool alignedx4, + Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + args->AddInt("offset_x"); + args->AddInt("offset_y"); + args->AddInt("offset_z"); + args->AddInt("offset_b"); + args->AddInt("stride_x"); + args->AddInt("stride_y"); + args->AddInt("stride_z"); + args->AddInt("stride_b"); const std::string dst_batch = op_def.IsBatchSupported() ? 
"B" : ""; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 offset, \n"; - c += " int4 stride, \n"; - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " args.dst_tensor.SetBatchRef(B);\n"; } else { c += " int X = get_global_id(0);\n"; } c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) { \n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; c += " return; \n"; c += " } \n"; - c += " int s_x = X * stride.x + offset.x;\n"; - c += " int s_y = Y * stride.y + offset.y;\n"; + c += " int s_x = X * args.stride_x + args.offset_x;\n"; + c += " int s_y = Y * args.stride_y + args.offset_y;\n"; if (op_def.IsBatchSupported()) { - c += " int s_b = B * stride.w + offset.w;\n"; + c += " int s_b = B * args.stride_b + args.offset_b;\n"; + c += " args.src_tensor.SetBatchRef(s_b);\n"; } const std::string src_batch = op_def.IsBatchSupported() ? "s_b" : ""; if (alignedx4) { - c += " int s_z = Z + offset.z;\n"; - c += " FLT4 result = " + - src_tensor.ReadWHSB("s_x", "s_y", "s_z", src_batch) + ";\n"; + c += " int s_z = Z + args.offset_z;\n"; + c += " FLT4 result = args.src_tensor.Read(s_x, s_y, s_z);\n"; } else { c += " FLT4 result;\n"; const std::string postfixes[] = {"x", "y", "z", "w"}; for (int i = 0; i < 4; ++i) { c += " {\n"; const std::string channel = "(Z * 4 + " + std::to_string(i) + ")"; - c += " int s_ch = " + channel + " * stride.z + offset.z;\n"; - c += " int s_z = min(s_ch >> 2, src_size.z - 1);\n"; + c += " int s_ch = " + channel + " * args.stride_z + args.offset_z;\n"; + c += " int s_z = min(s_ch >> 2, args.src_tensor.Slices() - 1);\n"; c += " int s_z_rem = s_ch & 3;\n"; - c += " FLT4 t = " + - src_tensor.ReadWHSB("s_x", "s_y", "s_z", src_batch) + ";\n"; + c += " FLT4 t = args.src_tensor.Read(s_x, s_y, s_z);\n"; c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; c += " result." + postfixes[i] + " = t_ar[s_z_rem];\n"; c += " }\n"; } } - std::string x_3dcoord = - op_def.IsBatchSupported() ? 
"X * dst_size.w + B" : "X"; - const LinkingContext context{"result", x_3dcoord, "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", dst_batch); + c += " args.dst_tensor.Write(result, X, Y, Z);\n"; c += "}\n"; return c; } @@ -167,27 +162,34 @@ StridedSlice& StridedSlice::operator=(StridedSlice&& operation) { } absl::Status StridedSlice::Compile(const CreationContext& creation_context) { - const auto code = GetStridedSliceCode(definition_, Is4Aligned(attributes_), - linked_operations_); + std::string code = + GetStridedSliceCode(definition_, Is4Aligned(attributes_), &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status StridedSlice::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); int4 offset = GetOffset(attributes_, src_[0]->Width(), src_[0]->Height(), src_[0]->Channels(), src_[0]->Batch()); - RETURN_IF_ERROR(kernel_.SetBytesAuto(offset)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int4(attributes_.strides.w, attributes_.strides.h, - attributes_.strides.c, attributes_.strides.b))); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetInt("offset_x", offset.x)); + RETURN_IF_ERROR(args_.SetInt("offset_y", offset.y)); + RETURN_IF_ERROR(args_.SetInt("offset_z", offset.z)); + RETURN_IF_ERROR(args_.SetInt("offset_b", offset.w)); + RETURN_IF_ERROR(args_.SetInt("stride_x", attributes_.strides.w)); + RETURN_IF_ERROR(args_.SetInt("stride_y", attributes_.strides.h)); + RETURN_IF_ERROR(args_.SetInt("stride_z", attributes_.strides.c)); + RETURN_IF_ERROR(args_.SetInt("stride_b", attributes_.strides.b)); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 StridedSlice::GetGridSize() const { From 9a0838f66ad078e7ae089eaf87447cc00e5d65c5 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Tue, 16 Jun 2020 10:59:42 -0700 Subject: [PATCH 0304/1390] Add canonicalization for tf.BatchToSpace to more generalized tf.BatchToSpaceND. This canonicalization is restricted to ranked inputs and crops, so in the case where an invalid shaped input or crops is passed to tf.BatchToSpace, a similar error will occur instead of potentially succeeding with the more generalized tf.BatchToSpaceND. 
PiperOrigin-RevId: 316715564 Change-Id: I41b8cedf5ac770772e4c943fee8049af9e34418b --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ .../compiler/mlir/tensorflow/ir/tf_ops.cc | 5 ++++ .../mlir/tensorflow/tests/canonicalize.mlir | 24 +++++++++++++++++++ .../tensorflow/transforms/canonicalize.td | 17 +++++++++++++ 4 files changed, 48 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 7db3539fcef..d403462e6a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -824,6 +824,8 @@ followed by cropping along the `height` and `width` dimensions. let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; } def TF_BatchToSpaceNDOp : TF_Op<"BatchToSpaceND", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 8410929a19f..6d8c5af297d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -838,6 +838,11 @@ static LogicalResult Verify(BatchToSpaceOp op) { return success(); } +void BatchToSpaceOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // BiasAddOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index a127168157f..542c5b3c166 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -586,3 +586,27 @@ func @sub(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Sub"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } + +// CHECK-LABEL: testBatchToSpaceToBatchToSpaceND +// CHECK-SAME: ([[INPUT:%.*]]: tensor, [[CROPS:%.*]]: tensor) +func @testBatchToSpaceToBatchToSpaceND(%arg0: tensor, %arg1: tensor) -> tensor<*xf32> { + // CHECK: [[BLOCK_SHAPE:%.*]] = "tf.Const"() {value = dense<8> : tensor<2xi64>} + // CHECK: [[BATCH_TO_SHAPE_ND:%.*]] = "tf.BatchToSpaceND"([[INPUT]], [[BLOCK_SHAPE]], [[CROPS]]) + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64} : (tensor, tensor) -> tensor<*xf32> + // CHECK: return [[BATCH_TO_SHAPE_ND]] + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: testBatchToSpaceDynamicInput +func @testBatchToSpaceDynamicInput(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { + // CHECK-NOT: "tf.BatchToSpaceND" + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64} : (tensor<*xf32>, tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: testBatchToSpaceDynamicCrops +func @testBatchToSpaceDynamicCrops(%arg0: tensor, %arg1: tensor<*xi32>) -> tensor<*xf32> { + // CHECK-NOT: "tf.BatchToSpaceND" + %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64} : (tensor, tensor<*xi32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 041d1c6cdaf..edd36a7b4c7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ 
-27,6 +27,8 @@ def SingleResultAndOperandHaveSameType : Constraint< def IsRank2Tensor : Type, "Rank 2 tensor">; +def IsRank4Tensor : Type, "Rank 4 tensor">; + // Checks if all the users is ReadVariableOp. def HasOnlyReadVariableOpUsers : Constraint< CPred<"llvm::all_of($0.getUsers(), [](mlir::OpOperand op) { " @@ -65,6 +67,21 @@ def BatchMatMulV2ToMatMul : Pat<(TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y), (TF_MatMulOp $x, $y, $adj_x, $adj_y), [(IsRank2Tensor $x), (IsRank2Tensor $y)]>; +//===----------------------------------------------------------------------===// +// BatchToSpace op patterns. +//===----------------------------------------------------------------------===// + +def BatchToSpaceBlockSizeToBlockShape : NativeCodeCall< + "DenseElementsAttr::get(RankedTensorType::get({2}, $_builder.getI64Type()), " + "ArrayRef{$0.getValue(), $0.getValue()})">; + +def BatchToSpaceToBatchToSpaceND : + Pat<(TF_BatchToSpaceOp $input, $crops, $block_size), + (TF_BatchToSpaceNDOp $input, + (TF_ConstOp (BatchToSpaceBlockSizeToBlockShape $block_size)), + $crops), + [(IsRank4Tensor $input), (IsRank2Tensor $crops)]>; + //===----------------------------------------------------------------------===// // BiasAddV1 op patterns. //===----------------------------------------------------------------------===// From 30357d1d9b5599b08a43eba918d0349ce4a29acc Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 16 Jun 2020 11:08:54 -0700 Subject: [PATCH 0305/1390] Improve floating-point NMS tests to use smaller error thresholds PiperOrigin-RevId: 316717898 Change-Id: Iab097dcf4ac3feca17c6d54ad84a2437341d0bb3 --- .../kernels/detection_postprocess_test.cc | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/lite/kernels/detection_postprocess_test.cc b/tensorflow/lite/kernels/detection_postprocess_test.cc index cf0d3ba2f3d..b9c42e75f21 100644 --- a/tensorflow/lite/kernels/detection_postprocess_test.cc +++ b/tensorflow/lite/kernels/detection_postprocess_test.cc @@ -174,17 +174,17 @@ TEST(DetectionPostprocessOpTest, FloatTest) { std::vector output_shape2 = m.GetOutputShape2(); EXPECT_THAT(output_shape2, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput2(), - ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-4))); // detection_scores std::vector output_shape3 = m.GetOutputShape3(); EXPECT_THAT(output_shape3, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput3(), - ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-4))); // num_detections std::vector output_shape4 = m.GetOutputShape4(); EXPECT_THAT(output_shape4, ElementsAre(1)); EXPECT_THAT(m.GetOutput4(), - ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({3.0}, 1e-4))); } TEST(DetectionPostprocessOpTest, QuantizedTest) { @@ -385,17 +385,17 @@ TEST(DetectionPostprocessOpTest, FloatTestFastNMS) { std::vector output_shape2 = m.GetOutputShape2(); EXPECT_THAT(output_shape2, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput2(), - ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-4))); // detection_scores std::vector output_shape3 = m.GetOutputShape3(); EXPECT_THAT(output_shape3, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput3(), - ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-4))); // num_detections std::vector output_shape4 = m.GetOutputShape4(); EXPECT_THAT(output_shape4, 
ElementsAre(1)); EXPECT_THAT(m.GetOutput4(), - ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({3.0}, 1e-4))); } TEST(DetectionPostprocessOpTest, QuantizedTestFastNMS) { @@ -492,22 +492,22 @@ TEST(DetectionPostprocessOpTest, FloatTestRegularNMS) { EXPECT_THAT(m.GetOutput1(), ElementsAreArray(ArrayFloatNear({0.0, 10.0, 1.0, 11.0, 0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 0.0, 0.0}, - 3e-1))); + 3e-4))); // detection_classes std::vector output_shape2 = m.GetOutputShape2(); EXPECT_THAT(output_shape2, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput2(), - ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-4))); // detection_scores std::vector output_shape3 = m.GetOutputShape3(); EXPECT_THAT(output_shape3, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput3(), - ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({0.95, 0.93, 0.0}, 1e-4))); // num_detections std::vector output_shape4 = m.GetOutputShape4(); EXPECT_THAT(output_shape4, ElementsAre(1)); EXPECT_THAT(m.GetOutput4(), - ElementsAreArray(ArrayFloatNear({2.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({2.0}, 1e-4))); } TEST(DetectionPostprocessOpTest, QuantizedTestRegularNMS) { @@ -666,17 +666,17 @@ TEST(DetectionPostprocessOpTest, FloatTestwithBackgroundClassAndKeypoints) { std::vector output_shape2 = m.GetOutputShape2(); EXPECT_THAT(output_shape2, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput2(), - ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-4))); // detection_scores std::vector output_shape3 = m.GetOutputShape3(); EXPECT_THAT(output_shape3, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput3(), - ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-4))); // num_detections std::vector output_shape4 = m.GetOutputShape4(); EXPECT_THAT(output_shape4, ElementsAre(1)); EXPECT_THAT(m.GetOutput4(), - ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({3.0}, 1e-4))); } TEST(DetectionPostprocessOpTest, @@ -780,17 +780,17 @@ TEST(DetectionPostprocessOpTest, FloatTestwithNoBackgroundClassAndKeypoints) { std::vector output_shape2 = m.GetOutputShape2(); EXPECT_THAT(output_shape2, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput2(), - ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-4))); // detection_scores std::vector output_shape3 = m.GetOutputShape3(); EXPECT_THAT(output_shape3, ElementsAre(1, 3)); EXPECT_THAT(m.GetOutput3(), - ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-4))); // num_detections std::vector output_shape4 = m.GetOutputShape4(); EXPECT_THAT(output_shape4, ElementsAre(1)); EXPECT_THAT(m.GetOutput4(), - ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); + ElementsAreArray(ArrayFloatNear({3.0}, 1e-4))); } } // namespace } // namespace custom From 421e64c0c6d54a877059f3743c0cbcaf625c51ee Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 11:10:26 -0700 Subject: [PATCH 0306/1390] Build fixes for CLion. This change allows targets that have TF dependencies to build in the IDE without extraneous errors in the output. 
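
As an illustration only (not from this patch): the kind of explicit dependency the change adds so that IDE indexing resolves the Python headers. The //third_party/python_runtime:headers label appears in the diff below; the target name and source file here are hypothetical.

cc_library(
    name = "example_python_binding",  # hypothetical target, for illustration
    srcs = ["example_python_binding.cc"],
    deps = [
        "//third_party/python_runtime:headers",  # build_cleaner: keep
    ],
)
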
PiperOrigin-RevId: 316718211 Change-Id: Ia2c8b87dbb7b755b65f8c1d390f13eb70c8cf4ee --- tensorflow/lite/toco/python/BUILD | 1 + tensorflow/python/BUILD | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD index 7dfa714d1d6..bada1016d26 100644 --- a/tensorflow/lite/toco/python/BUILD +++ b/tensorflow/lite/toco/python/BUILD @@ -35,6 +35,7 @@ cc_library( ], deps = [ "@com_google_protobuf//:protobuf_headers", + "//third_party/python_runtime:headers", # build_cleaner: keep; DNR: b/35864863 "//tensorflow/core:lib", "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 87048ba9d40..d141b719aef 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1058,11 +1058,12 @@ cc_library( ":safe_ptr", "//tensorflow/c:tensor_interface", "//tensorflow/c:tf_tensor_internal", + "//tensorflow/c/eager:c_api_internal", "//tensorflow/c/eager:tfe_context_internal", "//tensorflow/c/eager:tfe_tensorhandle_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//third_party/python_runtime:headers", + "//third_party/python_runtime:headers", # build_cleaner: keep; DNR: b/35864863 ], ) From 2d50164bdb08f5c02ffe9dba2c597fcded1f07a1 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 16 Jun 2020 11:19:06 -0700 Subject: [PATCH 0307/1390] [TF/XLA] [NFC] Simplify XlaComputationLaunchContext::PopulateOutputs Reduce the nesting level, extract a function for gathering VariableInfo. PiperOrigin-RevId: 316720004 Change-Id: I49982058d9f7efbc2dcbb2b180c1fc95193cfa39 --- tensorflow/compiler/jit/xla_launch_util.cc | 234 +++++++++++---------- tensorflow/compiler/jit/xla_launch_util.h | 9 +- 2 files changed, 133 insertions(+), 110 deletions(-) diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index ec5a372875c..25eed134e35 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -195,18 +195,20 @@ XlaComputationLaunchContext::XlaComputationLaunchContext( } void XlaComputationLaunchContext::PopulateInputs( - OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, const std::map& variables, int missing_ctx_input_prefix) { // Build ShapedBuffers that point directly to the Tensor buffers. - arg_ptrs_ = std::vector(kernel->xla_input_shapes.size()); + arg_ptrs_ = + std::vector(compilation_result->xla_input_shapes.size()); xla::TransferManager* transfer_manager = client_->backend().transfer_manager(); - for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) { - int arg_num = kernel->input_mapping[i]; + for (int i = 0; i < compilation_result->xla_input_shapes.size(); ++i) { + int arg_num = compilation_result->input_mapping[i]; CHECK_GE(arg_num, missing_ctx_input_prefix); - const xla::Shape& shape = kernel->xla_input_shapes[i]; + const xla::Shape& shape = compilation_result->xla_input_shapes[i]; const Tensor* t = variables.count(arg_num) ? &(variables.at(arg_num).value) : &(ctx->input(arg_num - missing_ctx_input_prefix)); @@ -361,13 +363,94 @@ static Status SetBufferForResourceVarTensorUnderAllocateXlaTensors( return Status::OK(); } +// Sets output `output_num` for `ctx` provided it is known at a compile time. 
+static Status SetOutputForConstant( + OpKernelContext* ctx, se::Stream* stream, + const XlaCompiler::CompilationResult* compilation_result, int output_num) { + CHECK(compilation_result->outputs[output_num].is_constant); + // Output is a constant. + const Tensor& const_tensor = + compilation_result->outputs[output_num].constant_value; + Tensor* output_tensor; + const size_t total_bytes = const_tensor.TotalBytes(); + if (stream && total_bytes > 0) { + // Copy host -> device. (Empty tensors don't have backing buffers.) + // Manually allocate memory using an XlaTensorBuffer so we can allocate + // as much memory as the device requires (as given by + // GetByteSizeRequirement). This avoids XlaTransferManager having to + // reallocate the device buffer later. + VLOG(1) << "Constant output tensor on device"; + + TF_RETURN_IF_ERROR( + ctx->allocate_output(output_num, const_tensor.shape(), &output_tensor)); + Device* device = dynamic_cast(ctx->device()); + if (device == nullptr) { + return errors::Internal("DeviceBase was not a Device."); + } + ctx->op_device_context()->CopyCPUTensorToDevice( + &const_tensor, device, output_tensor, + [&](Status status) { TF_CHECK_OK(status); }); + + if (device->device_type() == DEVICE_GPU) { + // The GPUDeviceContext enqueues the host->device transfer in a + // separate stream from the main compute stream. We must ensure the + // compute stream is synchronized with the host->device transfer + // stream now otherwise we will create a race condition. + auto* gpu_device_context = + static_cast(ctx->op_device_context()); + gpu_device_context->stream()->ThenWaitFor( + gpu_device_context->host_to_device_stream()); + } + } else { + // No copy required. + ctx->set_output(output_num, const_tensor); + output_tensor = ctx->mutable_output(output_num); + } + if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) { + xla_tensor->set_host_tensor(const_tensor); + } + return Status::OK(); +} + +// Creates a list of updates resource variables. +static xla::StatusOr> GatherVariableInfo( + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, + int missing_ctx_input_prefix) { + std::vector variable_infos; + variable_infos.reserve(compilation_result->resource_updates.size()); + + for (int i = 0; i < compilation_result->resource_updates.size(); ++i) { + const XlaCompiler::ResourceUpdate& write = + compilation_result->resource_updates[i]; + int actual_input_index = write.input_index - missing_ctx_input_prefix; + if (actual_input_index < 0 || actual_input_index >= ctx->num_inputs()) { + return errors::Internal("Invalid input index for variable write."); + } + + // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, + // not a Tensor. + Var* variable = nullptr; + TF_RETURN_IF_ERROR(LookupOrCreateResource( + ctx, HandleFromInput(ctx, actual_input_index), &variable, + [&write](Var** ptr) { + *ptr = new Var(write.type); + return Status::OK(); + })); + variable_infos.emplace_back(actual_input_index, variable); + } + return variable_infos; +} + Status XlaComputationLaunchContext::PopulateOutputs( - OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, ScopedShapedBuffer output, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, const std::map& resource_var_snapshots) { se::Stream* stream = ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; + Allocator* allocator = ctx->device()->GetAllocator({}); // Computation output should always be a tuple. if (VLOG_IS_ON(2)) { @@ -375,7 +458,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( VLOG(2) << "Result tuple shape (on device): " << output.on_device_shape().DebugString(); } - CHECK_EQ(ctx->num_outputs(), kernel->outputs.size()); + CHECK_EQ(ctx->num_outputs(), compilation_result->outputs.size()); // If the on-host-shape isn't a tuple, create a new single-element tuple // buffer with a nullptr root index table. This allows the code below to treat @@ -404,82 +487,41 @@ Status XlaComputationLaunchContext::PopulateOutputs( // Copy XLA results to the OpOutputList. int output_num = 0; for (int i = 0; i < ctx->num_outputs(); ++i) { - Allocator* allocator = ctx->device()->GetAllocator({}); - if (kernel->outputs[i].is_constant) { - // Output is a constant. - const Tensor& const_tensor = kernel->outputs[i].constant_value; - Tensor* output_tensor; - const size_t total_bytes = const_tensor.TotalBytes(); - if (stream && total_bytes > 0) { - // Copy host -> device. (Empty tensors don't have backing buffers.) - // Manually allocate memory using an XlaTensorBuffer so we can allocate - // as much memory as the device requires (as given by - // GetByteSizeRequirement). This avoids XlaTransferManager having to - // reallocate the device buffer later. - VLOG(1) << "Constant output tensor on device"; + const TensorShape& shape = compilation_result->outputs[i].shape; + const DataType& type = compilation_result->outputs[i].type; + VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " + << DataTypeString(type); + if (type == DT_VARIANT) { + return errors::Unimplemented( + "Support for TensorList crossing the XLA/TF boundary " + "is not implemented"); + } - TF_RETURN_IF_ERROR( - ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); - - Device* device = dynamic_cast(ctx->device()); - if (device == nullptr) { - return errors::Internal("DeviceBase was not a Device."); - } - ctx->op_device_context()->CopyCPUTensorToDevice( - &const_tensor, device, output_tensor, - [&](Status status) { TF_CHECK_OK(status); }); - - if (device->device_type() == DEVICE_GPU) { - // The GPUDeviceContext enqueues the host->device transfer in a - // separate stream from the main compute stream. We must ensure the - // compute stream is synchronized with the host->device transfer - // stream now otherwise we will create a race condition. - auto* gpu_device_context = - static_cast(ctx->op_device_context()); - gpu_device_context->stream()->ThenWaitFor( - gpu_device_context->host_to_device_stream()); - } - } else { - // No copy required. 
- ctx->set_output(i, const_tensor); - output_tensor = ctx->mutable_output(i); - } - if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) { - xla_tensor->set_host_tensor(const_tensor); - } + if (compilation_result->outputs[i].is_constant) { + TF_RETURN_IF_ERROR( + SetOutputForConstant(ctx, stream, compilation_result, i)); + } else if (type == DT_RESOURCE) { + int input_index = + compilation_result->outputs[i].input_index - missing_ctx_input_prefix; + TF_RET_CHECK(input_index >= 0 && input_index < ctx->num_inputs()) + << "Invalid input for outputs " << i << ": " << input_index; + ctx->set_output(i, ctx->input(input_index)); } else { - const TensorShape& shape = kernel->outputs[i].shape; - const DataType& type = kernel->outputs[i].type; - VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " - << DataTypeString(type); - if (type == DT_RESOURCE) { - int input_index = - kernel->outputs[i].input_index - missing_ctx_input_prefix; - TF_RET_CHECK(input_index >= 0 && input_index < ctx->num_inputs()) - << "Invalid input for outputs " << i << ": " << input_index; - ctx->set_output(i, ctx->input(input_index)); - } else { - if (allocate_xla_tensors_) { - TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors( - input_output_alias, output_num, ctx, i, shape, &output, - definition_event, stream, use_multiple_streams_)); - } else { - if (type == DT_VARIANT) { - return errors::Unimplemented( - "Support for TensorList crossing the XLA/TF boundary " - "is not implemented"); - } + if (allocate_xla_tensors_) { + TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors( + input_output_alias, output_num, ctx, i, shape, &output, + definition_event, stream, use_multiple_streams_)); - se::DeviceMemoryBase buffer = output.buffer({output_num}); - Tensor output_tensor = GetOrCreateTensorForOutput( - output_num, ctx, missing_ctx_input_prefix, input_output_alias, - kernel->input_mapping, resource_var_snapshots, - ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(se::OwningDeviceMemory(), {output_num}); - ctx->set_output(i, output_tensor); - } - ++output_num; + } else { + se::DeviceMemoryBase buffer = output.buffer({output_num}); + Tensor output_tensor = GetOrCreateTensorForOutput( + output_num, ctx, missing_ctx_input_prefix, input_output_alias, + compilation_result->input_mapping, resource_var_snapshots, + ctx->expected_output_dtype(i), shape, buffer, allocator); + output.set_buffer(se::OwningDeviceMemory(), {output_num}); + ctx->set_output(i, output_tensor); } + ++output_num; } if (VLOG_IS_ON(3)) { @@ -489,34 +531,14 @@ Status XlaComputationLaunchContext::PopulateOutputs( // Apply variable updates, if any. VLOG(2) << "Applying variable updates"; - std::vector variable_infos; - variable_infos.reserve(kernel->resource_updates.size()); - - for (int i = 0; i < kernel->resource_updates.size(); ++i) { - const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i]; - int actual_input_index = write.input_index - missing_ctx_input_prefix; - if (actual_input_index < 0 || actual_input_index >= ctx->num_inputs()) { - return errors::Internal("Invalid input index for variable write."); - } - - // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, - // not a Tensor. 
- Var* variable = nullptr; - TF_RETURN_IF_ERROR(LookupOrCreateResource( - ctx, HandleFromInput(ctx, actual_input_index), &variable, - [&write](Var** ptr) { - *ptr = new Var(write.type); - return Status::OK(); - })); - variable_infos.emplace_back(actual_input_index, variable); - } - + TF_ASSIGN_OR_RETURN( + std::vector variable_infos, + GatherVariableInfo(ctx, compilation_result, missing_ctx_input_prefix)); TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); - for (int i = 0; i < kernel->resource_updates.size(); ++i) { - Allocator* allocator = ctx->device()->GetAllocator({}); - const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i]; - + for (int i = 0; i < compilation_result->resource_updates.size(); ++i) { + const XlaCompiler::ResourceUpdate& write = + compilation_result->resource_updates[i]; if (variable_infos[i].var()->tensor()->dtype() != write.type) { return errors::Internal("Mismatched type in variable write"); } @@ -530,7 +552,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( output.set_buffer(se::OwningDeviceMemory(), {output_num}); Tensor output_tensor = GetOrCreateTensorForOutput( output_num, ctx, missing_ctx_input_prefix, input_output_alias, - kernel->input_mapping, resource_var_snapshots, write.type, + compilation_result->input_mapping, resource_var_snapshots, write.type, write.shape, buffer, allocator); *variable_infos[i].var()->tensor() = output_tensor; variable_infos[i].var()->is_initialized |= write.modified; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index cf68dcb7dd6..9a7f20cb310 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -136,7 +136,7 @@ class XlaComputationLaunchContext { // input_mapping must be greater than or equal to `missing_ctx_input_prefix` // (in other words, no inputs actually required by the kernel can be missing). void PopulateInputs(OpKernelContext* ctx, - const XlaCompiler::CompilationResult* kernel, + const XlaCompiler::CompilationResult* compilation_result, const std::map& variables, int missing_ctx_input_prefix); @@ -148,10 +148,11 @@ class XlaComputationLaunchContext { // See jit/resource_operation_safety_analysis for details. // // - // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are - // missing and adjusts input indices accordingly. + // Assumes that the first `missing_ctx_input_prefix` inputs to the + // compilation_result are missing and adjusts input indices accordingly. Status PopulateOutputs( - OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, + OpKernelContext* ctx, + const XlaCompiler::CompilationResult* compilation_result, xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, const std::map& resource_var_snapshots); From ed557008d681b6bd612f0ea6b1aa056c4cfa744b Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 16 Jun 2020 11:19:41 -0700 Subject: [PATCH 0308/1390] All LSTM implementations: Rename cell_scratch to cell_gate_scratch, and cell_bias_ptr to cell_gate_bias_ptr to better reflect what those arrays are. Do note this is not the same thing as the LSTM cell "state", but a layer/gate that calculates the update. The cell state depends on the input, forget, and cell gates; these arrays are the output and the bias for the last gate. 
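
For readers skimming the rename, a minimal illustrative sketch (not part of this patch; the function and parameter names below are made up, and the input/forget/output gate activations are assumed to be already computed) of a single-cell float LSTM step, showing that the cell gate is the per-step candidate update while the cell state is the value that persists across steps:

#include <cmath>

// Illustrative only: `cell_gate_bias` biases the cell *gate* (the candidate
// update computed each step); `cell_state` is the persistent cell *state*.
float LstmCellStepSketch(float input_gate, float forget_gate,
                         float output_gate, float cell_gate_preactivation,
                         float cell_gate_bias, float* cell_state) {
  // Cell gate: candidate value derived from the current input and recurrence.
  const float cell_gate = std::tanh(cell_gate_preactivation + cell_gate_bias);
  // Cell state: forget part of the old state, then add the gated candidate.
  *cell_state = forget_gate * (*cell_state) + input_gate * cell_gate;
  // Hidden output: output gate applied to the squashed new cell state.
  return output_gate * std::tanh(*cell_state);
}
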
PiperOrigin-RevId: 316720132 Change-Id: I71c370dabd27f776987e061b9393022c775589c9 --- tensorflow/lite/kernels/lstm_eval.cc | 156 +++++++++--------- .../calibration/builtin_logging_ops/lstm.cc | 61 +++---- 2 files changed, 114 insertions(+), 103 deletions(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index b285ed1030f..b4d43414d89 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -212,13 +212,13 @@ inline void LstmStepFloat( const float* cell_layer_norm_coefficients_ptr, const float* output_layer_norm_coefficients_ptr, const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr, - const float* cell_bias_ptr, const float* output_gate_bias_ptr, + const float* cell_gate_bias_ptr, const float* output_gate_bias_ptr, const float* projection_weights_ptr, const float* projection_bias_ptr, const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch, - float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, - float* output_ptr) { + float* forget_gate_scratch, float* cell_gate_scratch, + float* output_gate_scratch, float* output_ptr) { ruy::profiler::ScopeLabel label("LstmStepFloat"); // Since we have already checked that weights are all there or none, we can // check the existence of only one to the get the condition. @@ -233,7 +233,7 @@ inline void LstmStepFloat( std::fill_n(input_gate_scratch, n_cell * n_batch, 0.0f); } std::fill_n(forget_gate_scratch, n_cell * n_batch, 0.0f); - std::fill_n(cell_scratch, n_cell * n_batch, 0.0f); + std::fill_n(cell_gate_scratch, n_cell * n_batch, 0.0f); std::fill_n(output_gate_scratch, n_cell * n_batch, 0.0f); } else { if (!use_cifg) { @@ -242,8 +242,8 @@ inline void LstmStepFloat( } tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, forget_gate_scratch); - tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + tensor_utils::VectorBatchVectorAssign(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, output_gate_scratch); } @@ -262,7 +262,7 @@ inline void LstmStepFloat( forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_to_cell_weights_ptr, n_cell, n_input, input_ptr, n_batch, - cell_scratch); + cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_to_output_weights_ptr, n_cell, n_input, input_ptr, n_batch, output_gate_scratch); @@ -283,7 +283,7 @@ inline void LstmStepFloat( n_batch, forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr, - n_batch, cell_scratch); + n_batch, cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_output_weights_ptr, n_cell, n_aux_input, aux_input_ptr, n_batch, output_gate_scratch); @@ -300,7 +300,7 @@ inline void LstmStepFloat( n_batch, forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, cell_scratch); + n_batch, cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr, n_batch, output_gate_scratch); @@ -347,24 +347,26 @@ inline void LstmStepFloat( 
tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, n_batch * n_cell, cell_state_ptr); if (use_layer_norm) { - tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, - n_batch); + tensor_utils::MeanStddevNormalization(cell_gate_scratch, cell_gate_scratch, + n_cell, n_batch); tensor_utils::VectorBatchVectorCwiseProduct( - cell_layer_norm_coefficients_ptr, n_cell, cell_scratch, n_batch, - cell_scratch); - tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + cell_layer_norm_coefficients_ptr, n_cell, cell_gate_scratch, n_batch, + cell_gate_scratch); + tensor_utils::VectorBatchVectorAdd(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); } - tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, - params->activation, cell_scratch); + tensor_utils::ApplyActivationToVector(cell_gate_scratch, n_batch * n_cell, + params->activation, cell_gate_scratch); if (use_cifg) { tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch); tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, forget_gate_scratch, n_batch * n_cell, + cell_state_ptr); } else { tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, input_gate_scratch, n_batch * n_cell, + cell_state_ptr); } if (params->cell_clip > 0.0) { tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, @@ -389,8 +391,8 @@ inline void LstmStepFloat( tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, output_gate_scratch); tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, - params->activation, cell_scratch); - tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + params->activation, cell_gate_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch, n_batch * n_cell, output_gate_scratch); const bool use_projection_weight = (projection_weights_ptr != nullptr); @@ -525,19 +527,19 @@ inline void LstmStepHybrid( const float* cell_layer_norm_coefficients_ptr, const float* output_layer_norm_coefficients_ptr, const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr, - const float* cell_bias_ptr, const float* output_gate_bias_ptr, + const float* cell_gate_bias_ptr, const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr, float projection_weights_scale, const float* projection_bias_ptr, const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, float* input_gate_scratch, - float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, - float* scaling_factors, float* scaling_factors_scratch, - float* recovered_cell_weights, int8_t* quantized_input_ptr, - int8_t* quantized_aux_input_ptr, int8_t* quantized_output_state_ptr, - int8_t* quantized_cell_state_ptr, float* output_state_ptr, - float* cell_state_ptr, int32_t* accum_scratch_ptr, float* output_ptr, - int32_t* zero_points, int32_t* row_sums, int row_sums_size, - bool* compute_row_sums, bool asymmetric_quantize_inputs, + float* forget_gate_scratch, float* cell_gate_scratch, + float* output_gate_scratch, float* scaling_factors, + float* scaling_factors_scratch, float* recovered_cell_weights, + int8_t* quantized_input_ptr, int8_t* quantized_aux_input_ptr, + int8_t* quantized_output_state_ptr, int8_t* 
quantized_cell_state_ptr, + float* output_state_ptr, float* cell_state_ptr, int32_t* accum_scratch_ptr, + float* output_ptr, int32_t* zero_points, int32_t* row_sums, + int row_sums_size, bool* compute_row_sums, bool asymmetric_quantize_inputs, CpuBackendContext* context) { ruy::profiler::ScopeLabel label("LstmStepHybrid"); // Since we have already checked that weights are all there or none, we @@ -553,7 +555,7 @@ inline void LstmStepHybrid( std::fill_n(input_gate_scratch, n_cell * n_batch, 0.0f); } std::fill_n(forget_gate_scratch, n_cell * n_batch, 0.0f); - std::fill_n(cell_scratch, n_cell * n_batch, 0.0f); + std::fill_n(cell_gate_scratch, n_cell * n_batch, 0.0f); std::fill_n(output_gate_scratch, n_cell * n_batch, 0.0f); } else { if (!use_cifg) { @@ -562,8 +564,8 @@ inline void LstmStepHybrid( } tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, forget_gate_scratch); - tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + tensor_utils::VectorBatchVectorAssign(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, output_gate_scratch); } @@ -657,7 +659,8 @@ inline void LstmStepHybrid( tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr, - input_to_cell_weights_scale, scaling_factors, n_batch, cell_scratch, + input_to_cell_weights_scale, scaling_factors, n_batch, + cell_gate_scratch, /*per_channel_scale=*/nullptr, zero_points, accum_scratch_ptr, input_to_cell_row_sums, compute_row_sums, scaling_factors_scratch, context); @@ -699,9 +702,10 @@ inline void LstmStepHybrid( tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_cell_weights_ptr, n_cell, n_aux_input, quantized_aux_input_ptr, aux_input_to_cell_weights_scale, - scaling_factors, n_batch, cell_scratch, /*per_channel_scale=*/nullptr, - zero_points, accum_scratch_ptr, aux_input_to_cell_row_sums, - compute_row_sums, scaling_factors_scratch, context); + scaling_factors, n_batch, cell_gate_scratch, + /*per_channel_scale=*/nullptr, zero_points, accum_scratch_ptr, + aux_input_to_cell_row_sums, compute_row_sums, scaling_factors_scratch, + context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_output_weights_ptr, n_cell, n_aux_input, @@ -739,9 +743,10 @@ inline void LstmStepHybrid( tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_cell_weights_ptr, n_cell, n_output, quantized_output_state_ptr, recurrent_to_cell_weights_scale, - scaling_factors, n_batch, cell_scratch, /*per_channel_scale=*/nullptr, - zero_points, accum_scratch_ptr, recurrent_to_cell_row_sums, - compute_row_sums, scaling_factors_scratch, context); + scaling_factors, n_batch, cell_gate_scratch, + /*per_channel_scale=*/nullptr, zero_points, accum_scratch_ptr, + recurrent_to_cell_row_sums, compute_row_sums, scaling_factors_scratch, + context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_output_weights_ptr, n_cell, n_output, @@ -800,24 +805,26 @@ inline void LstmStepHybrid( tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, n_batch * n_cell, cell_state_ptr); if (use_layer_norm) { - tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, - n_batch); + tensor_utils::MeanStddevNormalization(cell_gate_scratch, cell_gate_scratch, + n_cell, n_batch); tensor_utils::VectorBatchVectorCwiseProduct( - cell_layer_norm_coefficients_ptr, n_cell, cell_scratch, n_batch, - cell_scratch); - 
tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + cell_layer_norm_coefficients_ptr, n_cell, cell_gate_scratch, n_batch, + cell_gate_scratch); + tensor_utils::VectorBatchVectorAdd(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); } - tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, - params->activation, cell_scratch); + tensor_utils::ApplyActivationToVector(cell_gate_scratch, n_batch * n_cell, + params->activation, cell_gate_scratch); if (use_cifg) { tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch); tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, forget_gate_scratch, n_batch * n_cell, + cell_state_ptr); } else { tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, input_gate_scratch, n_batch * n_cell, + cell_state_ptr); } if (params->cell_clip > 0.0) { tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, @@ -845,8 +852,8 @@ inline void LstmStepHybrid( tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, output_gate_scratch); tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, - params->activation, cell_scratch); - tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + params->activation, cell_gate_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch, n_batch * n_cell, output_gate_scratch); const bool use_projection_weight = (projection_weights_ptr != nullptr); @@ -940,7 +947,7 @@ inline void LstmStepHybrid( // Gate biases of size 'n_cell': // input_bias_ptr - optional // forget_bias_ptr -// cell_bias_ptr +// cell_gate_bias_ptr // output_bias_ptr // // Layer norm coefficients of size 'n_cell', representing diagonal matrices. @@ -1028,7 +1035,7 @@ inline void LstmStepInteger( const int16_t* layer_norm_output_weight_ptr, int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b, const int32_t* input_bias_ptr, const int32_t* forget_bias_ptr, - const int32_t* cell_bias_ptr, const int32_t* output_bias_ptr, + const int32_t* cell_gate_bias_ptr, const int32_t* output_bias_ptr, int16_t quantized_cell_clip, int8_t quantized_proj_clip, int32_t cell_scale, int32_t input_variance_guard, int32_t forget_variance_guard, int32_t cell_variance_guard, int32_t output_variance_guard, @@ -1115,7 +1122,7 @@ inline void LstmStepInteger( if (use_layer_norm) { tensor_utils::ApplyLayerNorm(scratch_2_ptr, layer_norm_cell_weight_ptr, - cell_bias_ptr, layer_norm_cell_scale_a, + cell_gate_bias_ptr, layer_norm_cell_scale_a, layer_norm_cell_scale_b, cell_variance_guard, n_batch, n_cell, scratch_2_ptr); } @@ -1266,7 +1273,7 @@ inline void LstmStepInteger( // Gate biases of size 'n_cell': // input_bias_ptr - optional // forget_bias_ptr -// cell_bias_ptr +// cell_gate_bias_ptr // output_bias_ptr // // Layer norm coefficients of size 'n_cell', representing diagonal matrices. 
@@ -1355,7 +1362,7 @@ void LstmStepInteger( const int16_t* layer_norm_output_weight_ptr, int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b, const int32_t* input_bias_ptr, const int32_t* forget_bias_ptr, - const int32_t* cell_bias_ptr, const int32_t* output_bias_ptr, + const int32_t* cell_gate_bias_ptr, const int32_t* output_bias_ptr, const int32_t* proj_bias_ptr, const TfLiteLSTMParams* params, const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b, const int32_t* intermediate_zp, int32 quantized_cell_clip, @@ -1413,7 +1420,7 @@ void LstmStepInteger( // Update gate with layer norm. tensor_utils::ApplyLayerNormFloat( scratch3, layer_norm_cell_weight_ptr, layer_norm_cell_scale_a, - layer_norm_cell_scale_b, cell_bias_ptr, n_batch, n_cell, scratch3); + layer_norm_cell_scale_b, cell_gate_bias_ptr, n_batch, n_cell, scratch3); // Update gate tanh. tensor_utils::ApplyTanhFloat(scratch3, n_batch, n_cell, -12, scratch3); @@ -1538,16 +1545,16 @@ TfLiteStatus EvalFloat( // Index the scratch buffers pointers to the global scratch buffer. float* scratch_buffer_ptr = GetTensorData(scratch_buffer); float* input_gate_scratch = nullptr; - float* cell_scratch = nullptr; + float* cell_gate_scratch = nullptr; float* forget_gate_scratch = nullptr; float* output_gate_scratch = nullptr; if (use_cifg) { - cell_scratch = scratch_buffer_ptr; + cell_gate_scratch = scratch_buffer_ptr; forget_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; } else { input_gate_scratch = scratch_buffer_ptr; - cell_scratch = scratch_buffer_ptr + n_cell * n_batch; + cell_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; forget_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 3 * n_cell * n_batch; } @@ -1599,7 +1606,8 @@ TfLiteStatus EvalFloat( n_input, aux_input_size, n_output, output_batch_leading_dim, GetTensorData(activation_state), GetTensorData(cell_state), input_gate_scratch, - forget_gate_scratch, cell_scratch, output_gate_scratch, output_ptr); + forget_gate_scratch, cell_gate_scratch, output_gate_scratch, + output_ptr); } } else { for (int b = 0; b < n_batch; b++) { @@ -1628,7 +1636,7 @@ TfLiteStatus EvalFloat( float* input_gate_scratch_ptr = input_gate_scratch ? 
input_gate_scratch + b * n_cell : nullptr; float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell; - float* cell_scratch_ptr = cell_scratch + b * n_cell; + float* cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell; float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell; LstmStepFloat( @@ -1659,8 +1667,8 @@ TfLiteStatus EvalFloat( GetTensorData(projection_bias), params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, activation_state_ptr, cell_state_ptr, input_gate_scratch_ptr, - forget_gate_scratch_ptr, cell_scratch_ptr, output_gate_scratch_ptr, - output_ptr); + forget_gate_scratch_ptr, cell_gate_scratch_ptr, + output_gate_scratch_ptr, output_ptr); } } } @@ -1723,16 +1731,16 @@ TfLiteStatus EvalHybrid( float* scratch_buffer_ptr = GetTensorData(scratch_buffer); float* input_gate_scratch = nullptr; - float* cell_scratch = nullptr; + float* cell_gate_scratch = nullptr; float* forget_gate_scratch = nullptr; float* output_gate_scratch = nullptr; if (use_cifg) { - cell_scratch = scratch_buffer_ptr; + cell_gate_scratch = scratch_buffer_ptr; forget_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; } else { input_gate_scratch = scratch_buffer_ptr; - cell_scratch = scratch_buffer_ptr + n_cell * n_batch; + cell_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; forget_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 3 * n_cell * n_batch; } @@ -1805,7 +1813,7 @@ TfLiteStatus EvalHybrid( GetTensorScale(projection_weights), GetTensorData(projection_bias), params, n_batch, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, - input_gate_scratch, forget_gate_scratch, cell_scratch, + input_gate_scratch, forget_gate_scratch, cell_gate_scratch, output_gate_scratch, GetTensorData(scaling_factors), GetTensorData(prod_scaling_factors), GetTensorData(recovered_cell_weights), @@ -1845,7 +1853,7 @@ TfLiteStatus EvalHybrid( float* input_gate_scratch_ptr = input_gate_scratch ? 
input_gate_scratch + b * n_cell : nullptr; float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell; - float* cell_scratch_ptr = cell_scratch + b * n_cell; + float* cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell; float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell; LstmStepHybrid( @@ -1892,8 +1900,8 @@ TfLiteStatus EvalHybrid( GetTensorData(projection_bias), params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, input_gate_scratch_ptr, - forget_gate_scratch_ptr, cell_scratch_ptr, output_gate_scratch_ptr, - GetTensorData(scaling_factors), + forget_gate_scratch_ptr, cell_gate_scratch_ptr, + output_gate_scratch_ptr, GetTensorData(scaling_factors), GetTensorData(prod_scaling_factors), GetTensorData(recovered_cell_weights), GetTensorData(input_quantized), @@ -2119,7 +2127,7 @@ TfLiteStatus EvalInteger8x8_8( GetTensorData(output_layer_norm_coefficients); const int32_t* input_bias_ptr = GetTensorData(input_gate_bias); const int32_t* forget_bias_ptr = GetTensorData(forget_gate_bias); - const int32_t* cell_bias_ptr = GetTensorData(cell_bias); + const int32_t* cell_gate_bias_ptr = GetTensorData(cell_bias); const int32_t* output_bias_ptr = GetTensorData(output_gate_bias); const int32_t* proj_bias_ptr = GetTensorData(projection_bias); int16_t* cell_ptr = GetTensorData(cell_state); @@ -2206,7 +2214,7 @@ TfLiteStatus EvalInteger8x8_8( integer_lstm_param->layer_norm_output_scale_a, integer_lstm_param->layer_norm_output_scale_b, - input_bias_ptr, forget_bias_ptr, cell_bias_ptr, output_bias_ptr, + input_bias_ptr, forget_bias_ptr, cell_gate_bias_ptr, output_bias_ptr, proj_bias_ptr, params, integer_lstm_param->intermediate_scale_a, diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc index b58900c0bc6..0d4c614511d 100644 --- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc +++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc @@ -58,13 +58,13 @@ inline void LstmStepWithAuxInput( const float* cell_layer_norm_coefficients_ptr, const float* output_layer_norm_coefficients_ptr, const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr, - const float* cell_bias_ptr, const float* output_gate_bias_ptr, + const float* cell_gate_bias_ptr, const float* output_gate_bias_ptr, const float* projection_weights_ptr, const float* projection_bias_ptr, const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch, - float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, - float* output_ptr, Logger* logger, + float* forget_gate_scratch, float* cell_gate_scratch, + float* output_gate_scratch, float* output_ptr, Logger* logger, const std::vector& intermediate_tensor_indexes, ErrorReporter* error_reporter) { // Since we have already checked that weights are all there or none, we can @@ -80,7 +80,7 @@ inline void LstmStepWithAuxInput( std::fill_n(input_gate_scratch, n_cell * n_batch, 0.0f); } std::fill_n(forget_gate_scratch, n_cell * n_batch, 0.0f); - std::fill_n(cell_scratch, n_cell * n_batch, 0.0f); + std::fill_n(cell_gate_scratch, n_cell * n_batch, 0.0f); std::fill_n(output_gate_scratch, n_cell * n_batch, 0.0f); } else { if (!use_cifg) { @@ -89,8 +89,8 @@ inline void LstmStepWithAuxInput( } 
tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, forget_gate_scratch); - tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + tensor_utils::VectorBatchVectorAssign(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, output_gate_scratch); } @@ -107,7 +107,7 @@ inline void LstmStepWithAuxInput( forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate(input_to_cell_weights_ptr, n_cell, n_input, input_ptr, - n_batch, cell_scratch); + n_batch, cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_to_output_weights_ptr, n_cell, n_input, input_ptr, n_batch, output_gate_scratch); @@ -125,7 +125,7 @@ inline void LstmStepWithAuxInput( n_batch, forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr, - n_batch, cell_scratch); + n_batch, cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( aux_input_to_output_weights_ptr, n_cell, n_aux_input, aux_input_ptr, n_batch, output_gate_scratch); @@ -142,7 +142,7 @@ inline void LstmStepWithAuxInput( n_batch, forget_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, cell_scratch); + n_batch, cell_gate_scratch); tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr, n_batch, output_gate_scratch); @@ -193,26 +193,28 @@ inline void LstmStepWithAuxInput( tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, n_batch * n_cell, cell_state_ptr); if (use_layer_norm) { - logger->LogTensorValue(intermediate_tensor_indexes[2], cell_scratch, + logger->LogTensorValue(intermediate_tensor_indexes[2], cell_gate_scratch, n_cell * n_batch, error_reporter); - tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, - n_batch); + tensor_utils::MeanStddevNormalization(cell_gate_scratch, cell_gate_scratch, + n_cell, n_batch); tensor_utils::VectorBatchVectorCwiseProduct( - cell_layer_norm_coefficients_ptr, n_cell, cell_scratch, n_batch, - cell_scratch); - tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, - cell_scratch); + cell_layer_norm_coefficients_ptr, n_cell, cell_gate_scratch, n_batch, + cell_gate_scratch); + tensor_utils::VectorBatchVectorAdd(cell_gate_bias_ptr, n_cell, n_batch, + cell_gate_scratch); } - tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, - params->activation, cell_scratch); + tensor_utils::ApplyActivationToVector(cell_gate_scratch, n_batch * n_cell, + params->activation, cell_gate_scratch); if (use_cifg) { tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch); tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, forget_gate_scratch, n_batch * n_cell, + cell_state_ptr); } else { tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + cell_gate_scratch, input_gate_scratch, n_batch * n_cell, + cell_state_ptr); } if (params->cell_clip > 0.0) { tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, @@ -239,8 +241,8 @@ inline void LstmStepWithAuxInput( tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, output_gate_scratch); 
tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, - params->activation, cell_scratch); - tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + params->activation, cell_gate_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch, n_batch * n_cell, output_gate_scratch); logger->LogTensorValue(intermediate_tensor_indexes[4], output_gate_scratch, @@ -329,16 +331,16 @@ TfLiteStatus EvalFloat( // Index the scratch buffers pointers to the global scratch buffer. float* scratch_buffer_ptr = GetTensorData(scratch_buffer); float* input_gate_scratch = nullptr; - float* cell_scratch = nullptr; + float* cell_gate_scratch = nullptr; float* forget_gate_scratch = nullptr; float* output_gate_scratch = nullptr; if (use_cifg) { - cell_scratch = scratch_buffer_ptr; + cell_gate_scratch = scratch_buffer_ptr; forget_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; } else { input_gate_scratch = scratch_buffer_ptr; - cell_scratch = scratch_buffer_ptr + n_cell * n_batch; + cell_gate_scratch = scratch_buffer_ptr + n_cell * n_batch; forget_gate_scratch = scratch_buffer_ptr + 2 * n_cell * n_batch; output_gate_scratch = scratch_buffer_ptr + 3 * n_cell * n_batch; } @@ -390,7 +392,7 @@ TfLiteStatus EvalFloat( n_input, aux_input_size, n_output, output_batch_leading_dim, GetTensorData(activation_state), GetTensorData(cell_state), input_gate_scratch, - forget_gate_scratch, cell_scratch, output_gate_scratch, + forget_gate_scratch, cell_gate_scratch, output_gate_scratch, output_ptr_time, logger, intermediate_tensor_indexes, error_reporter); } } else { @@ -420,7 +422,7 @@ TfLiteStatus EvalFloat( float* input_gate_scratch_ptr = input_gate_scratch ? 
input_gate_scratch + b * n_cell : nullptr; float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell; - float* cell_scratch_ptr = cell_scratch + b * n_cell; + float* cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell; float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell; LstmStepWithAuxInput( @@ -451,8 +453,9 @@ TfLiteStatus EvalFloat( GetTensorData(projection_bias), params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, activation_state_ptr, cell_state_ptr, input_gate_scratch_ptr, - forget_gate_scratch_ptr, cell_scratch_ptr, output_gate_scratch_ptr, - output_ptr, logger, intermediate_tensor_indexes, error_reporter); + forget_gate_scratch_ptr, cell_gate_scratch_ptr, + output_gate_scratch_ptr, output_ptr, logger, + intermediate_tensor_indexes, error_reporter); } } } From 430b00361b76827c055c63fb6398a520b25ed770 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 16 Jun 2020 11:20:59 -0700 Subject: [PATCH 0309/1390] Audit and improve TfLiteType checks in kernels PiperOrigin-RevId: 316720436 Change-Id: I2032e799ee6afa533b932385c2a70f7621f4ac1b --- tensorflow/lite/c/common.h | 1 + tensorflow/lite/kernels/activations.cc | 16 ++-- tensorflow/lite/kernels/add.cc | 2 +- tensorflow/lite/kernels/add_n.cc | 2 +- tensorflow/lite/kernels/audio_spectrogram.cc | 4 +- tensorflow/lite/kernels/basic_rnn.cc | 9 +- tensorflow/lite/kernels/batch_matmul.cc | 2 +- .../kernels/bidirectional_sequence_lstm.cc | 54 ++++++------ .../kernels/bidirectional_sequence_rnn.cc | 2 +- tensorflow/lite/kernels/ceil.cc | 2 +- tensorflow/lite/kernels/concatenation.cc | 2 +- tensorflow/lite/kernels/conv.cc | 16 ++-- tensorflow/lite/kernels/depth_to_space.cc | 2 +- tensorflow/lite/kernels/depthwise_conv.cc | 14 ++-- tensorflow/lite/kernels/div.cc | 2 +- tensorflow/lite/kernels/elementwise.cc | 4 +- .../lite/kernels/embedding_lookup_sparse.cc | 2 +- tensorflow/lite/kernels/floor.cc | 2 +- tensorflow/lite/kernels/floor_div.cc | 2 +- tensorflow/lite/kernels/fully_connected.cc | 16 ++-- tensorflow/lite/kernels/if.cc | 2 +- tensorflow/lite/kernels/l2norm.cc | 6 +- .../lite/kernels/local_response_norm.cc | 4 +- tensorflow/lite/kernels/logical.cc | 2 +- tensorflow/lite/kernels/lstm.cc | 82 +++++++++---------- tensorflow/lite/kernels/maximum_minimum.cc | 3 +- tensorflow/lite/kernels/mfcc.cc | 6 +- tensorflow/lite/kernels/mul.cc | 2 +- tensorflow/lite/kernels/one_hot.cc | 9 +- tensorflow/lite/kernels/pack.cc | 4 +- tensorflow/lite/kernels/pad.cc | 12 +-- tensorflow/lite/kernels/pooling.cc | 12 +-- tensorflow/lite/kernels/pow.cc | 5 +- tensorflow/lite/kernels/range.cc | 4 +- tensorflow/lite/kernels/read_variable.cc | 2 +- tensorflow/lite/kernels/reduce.cc | 2 +- .../lite/kernels/resize_nearest_neighbor.cc | 8 +- tensorflow/lite/kernels/reverse.cc | 2 +- tensorflow/lite/kernels/reverse_sequence.cc | 2 +- tensorflow/lite/kernels/round.cc | 2 +- tensorflow/lite/kernels/select.cc | 4 +- tensorflow/lite/kernels/skip_gram.cc | 6 +- tensorflow/lite/kernels/space_to_batch_nd.cc | 3 +- tensorflow/lite/kernels/space_to_depth.cc | 2 +- tensorflow/lite/kernels/sparse_to_dense.cc | 14 ++-- tensorflow/lite/kernels/squared_difference.cc | 2 +- tensorflow/lite/kernels/strided_slice.cc | 14 ++-- tensorflow/lite/kernels/sub.cc | 6 +- tensorflow/lite/kernels/tile.cc | 2 +- tensorflow/lite/kernels/topk_v2.cc | 11 ++- tensorflow/lite/kernels/transpose.cc | 9 +- tensorflow/lite/kernels/transpose_conv.cc | 16 ++-- .../kernels/unidirectional_sequence_lstm.cc | 22 ++--- 
.../kernels/unidirectional_sequence_rnn.cc | 9 +- tensorflow/lite/kernels/unpack.cc | 2 +- tensorflow/lite/kernels/while.cc | 4 +- .../micro/kernels/arc_mli/fully_connected.cc | 2 +- tensorflow/lite/micro/kernels/ceil.cc | 4 +- .../lite/micro/kernels/circular_buffer.cc | 4 +- .../micro/kernels/cmsis-nn/fully_connected.cc | 2 +- tensorflow/lite/micro/kernels/cmsis-nn/mul.cc | 2 +- tensorflow/lite/micro/kernels/elementwise.cc | 4 +- tensorflow/lite/micro/kernels/floor.cc | 2 +- .../lite/micro/kernels/fully_connected.cc | 2 +- tensorflow/lite/micro/kernels/l2norm.cc | 6 +- tensorflow/lite/micro/kernels/logistic.cc | 2 +- tensorflow/lite/micro/kernels/mul.cc | 2 +- tensorflow/lite/micro/kernels/reshape.cc | 2 +- tensorflow/lite/micro/kernels/round.cc | 4 +- tensorflow/lite/micro/kernels/svdf.cc | 4 +- tensorflow/lite/micro/kernels/tanh.cc | 2 +- .../lite/micro/kernels/xtensa_hifi/floor.cc | 2 +- .../lite/micro/kernels/xtensa_hifi/svdf.cc | 4 +- .../kernels/xtensa_hifimini_legacy/svdf.cc | 2 +- .../benchmark/experimental/c/c_api_types.h | 1 + 75 files changed, 258 insertions(+), 248 deletions(-) diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index ab769fec249..15823784d12 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -205,6 +205,7 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); // the current function, while also reporting the location of the error. // `a` and `b` may be evaluated more than once, so no side effects or // extremely expensive computations should be done. +// NOTE: Use TF_LITE_ENSURE_TYPES_EQ if comparing TfLiteTypes. #define TF_LITE_ENSURE_EQ(context, a, b) \ do { \ if ((a) != (b)) { \ diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 2b2428f3f92..7ad33973b38 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -254,7 +254,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); @@ -274,7 +274,7 @@ TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8) { double real_multiplier = input->params.scale / output->params.scale; @@ -355,7 +355,7 @@ TfLiteStatus LeakyReluPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); LeakyReluOpData* data = reinterpret_cast(node->user_data); @@ -384,7 +384,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, 
input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (kernel_type == kFixedPointOptimized) { if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { @@ -469,7 +469,7 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (kernel_type == kFixedPointOptimized) { if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { @@ -569,7 +569,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { input->type == kTfLiteUInt8 || input->type == kTfLiteInt16); } else { - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); } TF_LITE_ENSURE(context, NumDimensions(input) >= 1); @@ -632,7 +632,7 @@ TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256); @@ -671,7 +671,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* alpha = GetInput(context, node, 1); PreluOpData* data = reinterpret_cast(node->user_data); - TF_LITE_ENSURE_EQ(context, input->type, alpha->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, alpha->type); output->type = input->type; diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc index 279f6aa12ce..d6e5db90a97 100644 --- a/tensorflow/lite/kernels/add.cc +++ b/tensorflow/lite/kernels/add.cc @@ -90,7 +90,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); output->type = input2->type; const bool requires_broadcast = !HaveSameShapes(input1, input2); diff --git a/tensorflow/lite/kernels/add_n.cc b/tensorflow/lite/kernels/add_n.cc index 7b4d52c5272..e933c5bbd66 100644 --- a/tensorflow/lite/kernels/add_n.cc +++ b/tensorflow/lite/kernels/add_n.cc @@ -41,7 +41,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = kInputTensor1 + 1; i < num_inputs; ++i) { const TfLiteTensor* input = GetInput(context, node, i); TF_LITE_ENSURE(context, HaveSameShapes(input1, input)); - TF_LITE_ENSURE_EQ(context, input1->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input->type); } // Use the first input node's dimension to be the dimension of the output diff --git a/tensorflow/lite/kernels/audio_spectrogram.cc b/tensorflow/lite/kernels/audio_spectrogram.cc index 29c9eeef3d0..8132130f4ab 100644 --- a/tensorflow/lite/kernels/audio_spectrogram.cc +++ b/tensorflow/lite/kernels/audio_spectrogram.cc @@ -81,8 +81,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2); - TF_LITE_ENSURE_EQ(context, output->type, 
kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size, params->stride)); diff --git a/tensorflow/lite/kernels/basic_rnn.cc b/tensorflow/lite/kernels/basic_rnn.cc index 920e8cd223a..c2e503d6462 100644 --- a/tensorflow/lite/kernels/basic_rnn.cc +++ b/tensorflow/lite/kernels/basic_rnn.cc @@ -79,8 +79,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { bias->dims->data[0]); TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[1], bias->dims->data[0]); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_weights->type, + recurrent_weights->type); TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2); TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[0], batch_size); TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[1], num_units); @@ -288,8 +289,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { accum_scratch, row_sums, &op_data->compute_row_sums); } default: - context->ReportError(context, "Type %d not currently supported.", - input_weights->type); + TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", + TfLiteTypeGetName(input_weights->type)); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc index d2115f96e1c..8bc23c9c94a 100644 --- a/tensorflow/lite/kernels/batch_matmul.cc +++ b/tensorflow/lite/kernels/batch_matmul.cc @@ -282,7 +282,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* rhs_data = GetInput(context, node, kInputRHSTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, lhs_data->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, lhs_data->type, kTfLiteFloat32); TF_LITE_ENSURE(context, rhs_data->type == kTfLiteFloat32 || rhs_data->type == kTfLiteInt8); // Support dimensions between 2 and 4, inclusive. 
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc index a984ff5124f..439fc94afad 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc @@ -203,8 +203,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input); - TF_LITE_ENSURE_EQ(context, input_to_input_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_to_input_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* input_to_cell_weights = @@ -212,16 +212,16 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input); - TF_LITE_ENSURE_EQ(context, input_to_cell_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_to_cell_weights->type, + input_to_forget_weights->type); const TfLiteTensor* input_to_output_weights = GetInput(context, node, input_to_output_weights_tensor); TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input); - TF_LITE_ENSURE_EQ(context, input_to_output_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_to_output_weights->type, + input_to_forget_weights->type); const TfLiteTensor* recurrent_to_input_weights = GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor); @@ -231,8 +231,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_input_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* recurrent_to_forget_weights = @@ -242,8 +242,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_forget_weights->type, + input_to_forget_weights->type); const TfLiteTensor* recurrent_to_cell_weights = GetInput(context, node, recurrent_to_cell_weights_tensor); @@ -251,8 +251,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_cell_weights->type, + input_to_forget_weights->type); // We make sure the input-gate's parameters are either both present (regular // LSTM) or not at all (CIFG-LSTM). 
@@ -268,8 +268,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( if (cell_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, cell_to_input_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, cell_to_input_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* cell_to_forget_weights = @@ -277,8 +277,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( if (cell_to_forget_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, cell_to_forget_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* cell_to_output_weights = @@ -286,8 +286,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( if (cell_to_output_weights != nullptr) { TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, cell_to_output_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, cell_to_output_weights->type, + input_to_forget_weights->type); } // Making sure the peephole weights are there all or none. @@ -309,14 +309,14 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( } else { TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, input_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_gate_bias->type, kTfLiteFloat32); } const TfLiteTensor* forget_gate_bias = GetInput(context, node, forget_gate_bias_tensor); TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, forget_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteFloat32); const TfLiteTensor* cell_bias = GetInput(context, node, cell_gate_bias_tensor); @@ -328,7 +328,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( GetInput(context, node, output_gate_bias_tensor); TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, output_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output_gate_bias->type, kTfLiteFloat32); const TfLiteTensor* projection_weights = GetOptionalInputTensor(context, node, projection_weights_tensor); @@ -336,8 +336,8 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell); - TF_LITE_ENSURE_EQ(context, projection_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, projection_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* projection_bias = @@ -345,7 +345,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes( if (projection_bias != nullptr) { TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); - TF_LITE_ENSURE_EQ(context, 
projection_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, projection_bias->type, kTfLiteFloat32); } // Making sure the projection tensors are consistent: @@ -410,7 +410,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Inferring batch size, number of outputs and sequence length and // number of cells from the input tensors. const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, input->dims->size, 3); const bool time_major = params->time_major; const int max_time = time_major ? input->dims->data[0] : input->dims->data[1]; @@ -1140,8 +1140,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } default: - context->ReportError(context, "Type %d is not currently supported.", - fw_input_to_output_weights->type); + TF_LITE_KERNEL_LOG(context, "Type %s is not currently supported.", + TfLiteTypeGetName(fw_input_to_output_weights->type)); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc index abaf6df9fa8..bc88740b6ed 100644 --- a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc +++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc @@ -129,7 +129,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check all the parameters of tensor match within themselves and match the // input configuration. - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, input->dims->size, 3); const bool time_major = params->time_major; diff --git a/tensorflow/lite/kernels/ceil.cc b/tensorflow/lite/kernels/ceil.cc index 9914dbe09ce..d8c6eaad7a4 100644 --- a/tensorflow/lite/kernels/ceil.cc +++ b/tensorflow/lite/kernels/ceil.cc @@ -32,7 +32,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); output->type = input->type; TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims); return context->ResizeTensor(context, output, output_size); diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc index 61748e5ce58..5d5f06ba013 100644 --- a/tensorflow/lite/kernels/concatenation.cc +++ b/tensorflow/lite/kernels/concatenation.cc @@ -81,7 +81,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, output->type, input_type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type); if (input_type == kTfLiteInt8) { // Make sure there is no re-scaling needed for Int8 quantized kernel. 
This diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index fa6caff5baa..88765b2f9c4 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -320,7 +320,7 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || input_type == kTfLiteInt8 || input_type == kTfLiteInt16); - TF_LITE_ENSURE_EQ(context, output->type, input_type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type); const TfLiteTensor* bias = nullptr; @@ -331,15 +331,15 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, if (has_bias) { bias = GetInput(context, node, 2); if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } else if (input_type == kTfLiteInt16) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt64); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, bias->type, input_type); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type); } TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); } @@ -984,8 +984,8 @@ TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { context, node, params, data, input, filter, bias, output, im2col); break; default: - context->ReportError(context, "Type %s currently not supported.", - TfLiteTypeGetName(input->type)); + TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } return kTfLiteOk; @@ -1005,8 +1005,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt16: return EvalImpl(context, node); default: - context->ReportError(context, "Type %d not currently supported.", - input->type); + TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } } diff --git a/tensorflow/lite/kernels/depth_to_space.cc b/tensorflow/lite/kernels/depth_to_space.cc index 8a81ea932bf..1637ad4350f 100644 --- a/tensorflow/lite/kernels/depth_to_space.cc +++ b/tensorflow/lite/kernels/depth_to_space.cc @@ -55,7 +55,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 || data_type == kTfLiteInt8 || data_type == kTfLiteInt32 || data_type == kTfLiteInt64); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); const int block_size = params->block_size; const int input_height = input->dims->data[1]; diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index 1897d14a065..961a987cf02 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -122,7 +122,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 || data_type == kTfLiteInt8 || data_type == kTfLiteInt16); - TF_LITE_ENSURE_EQ(context, output->type, data_type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, data_type); if (!is_hybrid) { TF_LITE_ENSURE(context, 
filter->type == data_type || data_type == kTfLiteInt16); @@ -134,15 +134,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (hasBias) { bias = GetInput(context, node, kBiasTensor); if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } else if (data_type == kTfLiteInt16) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt64); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, bias->type, data_type); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, data_type); } TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1); TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3), @@ -520,9 +520,9 @@ TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { return EvalHybridPerChannel(context, node, params, data, input, filter, bias, output); } else { - context->ReportError( - context, "Type %d with filter type %d not currently supported.", - input->type, filter->type); + TF_LITE_KERNEL_LOG( + context, "Type %s with filter type %s not currently supported.", + TfLiteTypeGetName(input->type), TfLiteTypeGetName(filter->type)); return kTfLiteError; } break; diff --git a/tensorflow/lite/kernels/div.cc b/tensorflow/lite/kernels/div.cc index cdd02277ec9..c9eb1db531a 100644 --- a/tensorflow/lite/kernels/div.cc +++ b/tensorflow/lite/kernels/div.cc @@ -78,7 +78,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); output->type = input2->type; data->requires_broadcast = !HaveSameShapes(input1, input2); diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc index 95b791be3f2..1b91244af33 100644 --- a/tensorflow/lite/kernels/elementwise.cc +++ b/tensorflow/lite/kernels/elementwise.cc @@ -45,7 +45,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (!IsSupportedType(input->type)) { context->ReportError(context, "Current data type %d is not supported.", input->type); @@ -60,7 +60,7 @@ inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, T func(T), TfLiteType expected_type) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, expected_type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type); const int64_t num_elements = NumElements(input); const T* in_data = GetTensorData(input); T* out_data = GetTensorData(output); diff --git a/tensorflow/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/lite/kernels/embedding_lookup_sparse.cc index 92574817e3b..745b5090094 100644 --- a/tensorflow/lite/kernels/embedding_lookup_sparse.cc +++ 
b/tensorflow/lite/kernels/embedding_lookup_sparse.cc @@ -109,7 +109,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Mark the output as a dynamic tensor. TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); output->allocation_type = kTfLiteDynamic; return kTfLiteOk; diff --git a/tensorflow/lite/kernels/floor.cc b/tensorflow/lite/kernels/floor.cc index 2e341218700..d629b48d1a6 100644 --- a/tensorflow/lite/kernels/floor.cc +++ b/tensorflow/lite/kernels/floor.cc @@ -39,7 +39,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); output->type = input->type; TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims); return context->ResizeTensor(context, output, output_size); diff --git a/tensorflow/lite/kernels/floor_div.cc b/tensorflow/lite/kernels/floor_div.cc index 5677dc4d9b7..24682fdebe1 100644 --- a/tensorflow/lite/kernels/floor_div.cc +++ b/tensorflow/lite/kernels/floor_div.cc @@ -68,7 +68,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); const TfLiteType type = input1->type; switch (type) { diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index a1893878232..8b7a7832dbb 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -101,13 +101,13 @@ inline TfLiteStatus CheckTypes(TfLiteContext* context, if (is_quantized) { if (is_shuffled) { - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteUInt8); - TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteUInt8); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt16); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteUInt8); + TF_LITE_ENSURE_TYPES_EQ(context, filter->type, kTfLiteUInt8); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt16); TF_LITE_ENSURE_EQ(context, is_optional_bias_int, true); } else if (is_hybrid) { - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, is_optional_bias_float, true); } else { TF_LITE_ENSURE(context, input->type == kTfLiteUInt8 || @@ -120,9 +120,9 @@ inline TfLiteStatus CheckTypes(TfLiteContext* context, } } else { // Only float32 is supported currently - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, filter->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, is_optional_bias_float, true); } diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc index 
d3f92a92b08..4c39a07bf8b 100644 --- a/tensorflow/lite/kernels/if.cc +++ b/tensorflow/lite/kernels/if.cc @@ -88,7 +88,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { input->dims->data + input->dims->size); subgraph->ResizeInputTensor(i, dims); TfLiteTensor* subgraph_input = subgraph->tensor(subgraph->inputs()[i]); - TF_LITE_ENSURE_EQ(context, input->type, subgraph_input->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, subgraph_input->type); } // Note: The `Prepare` function is responsible to run `AllocateTensors` on // both subgraphs. It's intentionally not to break out of the loop when diff --git a/tensorflow/lite/kernels/l2norm.cc b/tensorflow/lite/kernels/l2norm.cc index a7fb35ed594..857ef62a155 100644 --- a/tensorflow/lite/kernels/l2norm.cc +++ b/tensorflow/lite/kernels/l2norm.cc @@ -52,7 +52,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8 || output->type == kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.)); @@ -133,8 +133,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { depth, GetTensorData(input), GetTensorData(output)); } else { - context->ReportError(context, "Output type is %d, requires float.", - output->type); + TF_LITE_KERNEL_LOG(context, "Output type is %s, requires float.", + TfLiteTypeGetName(output->type)); return kTfLiteError; } diff --git a/tensorflow/lite/kernels/local_response_norm.cc b/tensorflow/lite/kernels/local_response_norm.cc index f4b996c45a1..ed964365920 100644 --- a/tensorflow/lite/kernels/local_response_norm.cc +++ b/tensorflow/lite/kernels/local_response_norm.cc @@ -44,8 +44,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); output_size->data[0] = input->dims->data[0]; diff --git a/tensorflow/lite/kernels/logical.cc b/tensorflow/lite/kernels/logical.cc index ec650dd4210..a703f3f5358 100644 --- a/tensorflow/lite/kernels/logical.cc +++ b/tensorflow/lite/kernels/logical.cc @@ -58,7 +58,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); const TfLiteType type = input1->type; if (type != kTfLiteBool) { diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc index e022bfb85ba..74caafbd0c7 100644 --- a/tensorflow/lite/kernels/lstm.cc +++ b/tensorflow/lite/kernels/lstm.cc @@ -762,8 +762,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input); - TF_LITE_ENSURE_EQ(context, input_to_input_weights->type, - 
input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_to_input_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* input_to_cell_weights = @@ -771,8 +771,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input); - TF_LITE_ENSURE_EQ(context, input_to_cell_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_to_cell_weights->type, + input_to_forget_weights->type); const TfLiteTensor* recurrent_to_input_weights = GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); @@ -782,8 +782,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_input_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* recurrent_to_forget_weights = @@ -793,8 +793,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_forget_weights->type, + input_to_forget_weights->type); const TfLiteTensor* recurrent_to_cell_weights = GetInput(context, node, kRecurrentToCellWeightsTensor); @@ -802,8 +802,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1], n_output); - TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, recurrent_to_cell_weights->type, + input_to_forget_weights->type); // We make sure the input-gate's parameters are either both present (regular // LSTM) or not at all (CIFG-LSTM). @@ -819,7 +819,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, if (cell_to_input_weights) { TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ( + TF_LITE_ENSURE_TYPES_EQ( context, cell_to_input_weights->type, is_integer ? kTfLiteInt16 : input_to_forget_weights->type); } @@ -829,7 +829,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, if (cell_to_forget_weights) { TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ( + TF_LITE_ENSURE_TYPES_EQ( context, cell_to_forget_weights->type, is_integer ? kTfLiteInt16 : input_to_forget_weights->type); } @@ -839,7 +839,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, if (cell_to_output_weights) { TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ( + TF_LITE_ENSURE_TYPES_EQ( context, cell_to_output_weights->type, is_integer ? 
kTfLiteInt16 : input_to_forget_weights->type); } @@ -863,9 +863,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, input_gate_bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, input_gate_bias->type, kTfLiteInt32); } else { - TF_LITE_ENSURE_EQ(context, input_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_gate_bias->type, kTfLiteFloat32); } } @@ -874,18 +874,18 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, forget_gate_bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteInt32); } else { - TF_LITE_ENSURE_EQ(context, forget_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteFloat32); } const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, cell_bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, cell_bias->type, kTfLiteInt32); } else { - TF_LITE_ENSURE_EQ(context, cell_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, cell_bias->type, kTfLiteFloat32); } const TfLiteTensor* output_gate_bias = @@ -893,9 +893,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, output_gate_bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, output_gate_bias->type, kTfLiteInt32); } else { - TF_LITE_ENSURE_EQ(context, output_gate_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output_gate_bias->type, kTfLiteFloat32); } const TfLiteTensor* projection_weights = @@ -904,8 +904,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell); - TF_LITE_ENSURE_EQ(context, projection_weights->type, - input_to_forget_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, projection_weights->type, + input_to_forget_weights->type); } const TfLiteTensor* projection_bias = @@ -914,9 +914,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); if (is_integer) { - TF_LITE_ENSURE_EQ(context, projection_bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, projection_bias->type, kTfLiteInt32); } else { - TF_LITE_ENSURE_EQ(context, projection_bias->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, projection_bias->type, kTfLiteFloat32); } } @@ -940,11 +940,11 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->type, - kTfLiteInt16); + TF_LITE_ENSURE_TYPES_EQ(context, 
input_layer_norm_coefficients->type, + kTfLiteInt16); } else { - TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_layer_norm_coefficients->type, + kTfLiteFloat32); } } @@ -955,11 +955,11 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->type, - kTfLiteInt16); + TF_LITE_ENSURE_TYPES_EQ(context, forget_layer_norm_coefficients->type, + kTfLiteInt16); } else { - TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, forget_layer_norm_coefficients->type, + kTfLiteFloat32); } const TfLiteTensor* cell_layer_norm_coefficients = @@ -969,11 +969,11 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->type, - kTfLiteInt16); + TF_LITE_ENSURE_TYPES_EQ(context, cell_layer_norm_coefficients->type, + kTfLiteInt16); } else { - TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, cell_layer_norm_coefficients->type, + kTfLiteFloat32); } const TfLiteTensor* output_layer_norm_coefficients = GetOptionalInputTensor( @@ -983,11 +983,11 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0], n_cell); if (is_integer) { - TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->type, - kTfLiteInt16); + TF_LITE_ENSURE_TYPES_EQ(context, output_layer_norm_coefficients->type, + kTfLiteInt16); } else { - TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output_layer_norm_coefficients->type, + kTfLiteFloat32); } } diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc index ae1920e53db..777e51442f1 100644 --- a/tensorflow/lite/kernels/maximum_minimum.cc +++ b/tensorflow/lite/kernels/maximum_minimum.cc @@ -57,7 +57,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); OpContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.input1->type, + op_context.input2->type); op_context.output->type = op_context.input1->type; bool requires_broadcast = diff --git a/tensorflow/lite/kernels/mfcc.cc b/tensorflow/lite/kernels/mfcc.cc index 5fe5b948a87..a3bf5baafaa 100644 --- a/tensorflow/lite/kernels/mfcc.cc +++ b/tensorflow/lite/kernels/mfcc.cc @@ -80,9 +80,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input_wav), 3); TF_LITE_ENSURE_EQ(context, NumElements(input_rate), 1); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, input_wav->type, output->type); - TF_LITE_ENSURE_EQ(context, input_rate->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_wav->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_rate->type, kTfLiteInt32); TfLiteIntArray* output_size = TfLiteIntArrayCreate(3); output_size->data[0] = input_wav->dims->data[0]; diff --git 
a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc index 0ab378e278d..5c34c9c7199 100644 --- a/tensorflow/lite/kernels/mul.cc +++ b/tensorflow/lite/kernels/mul.cc @@ -79,7 +79,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); const bool requires_broadcast = !HaveSameShapes(input1, input2); diff --git a/tensorflow/lite/kernels/one_hot.cc b/tensorflow/lite/kernels/one_hot.cc index 76d53c6396f..f7b4e8e7e19 100644 --- a/tensorflow/lite/kernels/one_hot.cc +++ b/tensorflow/lite/kernels/one_hot.cc @@ -136,8 +136,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { op_context.output->type = op_context.dtype; break; default: - context->ReportError(context, "Unknown output data type: %d", - op_context.dtype); + TF_LITE_KERNEL_LOG(context, "Unknown output data type: %s", + TfLiteTypeGetName(op_context.dtype)); return kTfLiteError; } @@ -148,8 +148,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumElements(op_context.depth), 1); TF_LITE_ENSURE_EQ(context, NumElements(op_context.on_value), 1); TF_LITE_ENSURE_EQ(context, NumElements(op_context.off_value), 1); - TF_LITE_ENSURE_EQ(context, op_context.on_value->type, op_context.dtype); - TF_LITE_ENSURE_EQ(context, op_context.off_value->type, op_context.dtype); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.on_value->type, op_context.dtype); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.off_value->type, + op_context.dtype); if (!IsConstantTensor(op_context.depth)) { SetTensorToDynamic(op_context.output); diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc index fc7a87692c4..90a87b0c8c7 100644 --- a/tensorflow/lite/kernels/pack.cc +++ b/tensorflow/lite/kernels/pack.cc @@ -57,7 +57,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 1; i < data->values_count; ++i) { const TfLiteTensor* input = GetInput(context, node, i); TF_LITE_ENSURE(context, HaveSameShapes(input0, input)); - TF_LITE_ENSURE_EQ(context, input0->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, input0->type, input->type); } // Resize output. rank R will become rank R + 1 @@ -73,7 +73,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, output->type, input0->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input0->type); // Guarantee input/output quantization params match as we do not support // packing quantized tensors. 
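[Editor's note] The other recurring change in this patch, visible in the one_hot.cc hunk above and repeated below for pad.cc, pooling.cc, sparse_to_dense.cc and others, replaces context->ReportError with the TF_LITE_KERNEL_LOG macro and formats the tensor type with TfLiteTypeGetName, so unsupported-type errors name the type rather than printing the enum value with %d. A hedged sketch of the resulting dispatch pattern follows; the kernel, the `input` tensor, and the supported cases are placeholders, not code from this patch.

  // Sketch of the updated default case in a kernel's type switch.
  switch (input->type) {
    case kTfLiteFloat32:
      // ... float32 implementation ...
      break;
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s is not currently supported.",
                         TfLiteTypeGetName(input->type));
      return kTfLiteError;
  }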
diff --git a/tensorflow/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc index 2239511b60a..4d9df6b89ab 100644 --- a/tensorflow/lite/kernels/pad.cc +++ b/tensorflow/lite/kernels/pad.cc @@ -111,10 +111,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); PadContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.input->type, + op_context.output->type); if (op_context.constant_values != nullptr) { - TF_LITE_ENSURE_EQ(context, op_context.input->type, - op_context.constant_values->type); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.input->type, + op_context.constant_values->type); } // TODO(nupurgarg): Current implementations rely on the inputs being <= 4D. @@ -268,9 +269,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } break; default: - context->ReportError(context, - "Type %d is currently not supported by Pad.", - op_context.input->type); + TF_LITE_KERNEL_LOG(context, "Type %s is currently not supported by Pad.", + TfLiteTypeGetName(op_context.input->type)); return kTfLiteError; } #undef TF_LITE_PAD diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc index 1dc5cbb6199..a1380080a1e 100644 --- a/tensorflow/lite/kernels/pooling.cc +++ b/tensorflow/lite/kernels/pooling.cc @@ -74,7 +74,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); const TfLiteTensor* input = GetInput(context, node, 0); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); int batches = input->dims->data[0]; int height = input->dims->data[1]; @@ -98,7 +98,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { } if (pool_type == kL2) { // We currently don't have a quantized implementation of L2Pool - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); } } @@ -387,8 +387,8 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { output); break; default: - context->ReportError(context, "Type %d not currently supported.", - input->type); + TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } return kTfLiteOk; @@ -418,8 +418,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { output); break; default: - context->ReportError(context, "Type %d not currently supported.", - input->type); + TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/pow.cc b/tensorflow/lite/kernels/pow.cc index a76c77a3f9f..7f995929ec7 100644 --- a/tensorflow/lite/kernels/pow.cc +++ b/tensorflow/lite/kernels/pow.cc @@ -58,11 +58,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); const TfLiteType type = input1->type; if (type != kTfLiteInt32 && type != kTfLiteFloat32) { - context->ReportError(context, "Unsupported data type %d.", type); + 
TF_LITE_KERNEL_LOG(context, "Unsupported data type %s.", + TfLiteTypeGetName(type)); return kTfLiteError; } output->type = type; diff --git a/tensorflow/lite/kernels/range.cc b/tensorflow/lite/kernels/range.cc index 55cc543d745..fe67d055ded 100644 --- a/tensorflow/lite/kernels/range.cc +++ b/tensorflow/lite/kernels/range.cc @@ -100,8 +100,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } - TF_LITE_ENSURE_EQ(context, limit->type, dtype); - TF_LITE_ENSURE_EQ(context, delta->type, dtype); + TF_LITE_ENSURE_TYPES_EQ(context, limit->type, dtype); + TF_LITE_ENSURE_TYPES_EQ(context, delta->type, dtype); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); output->type = dtype; diff --git a/tensorflow/lite/kernels/read_variable.cc b/tensorflow/lite/kernels/read_variable.cc index ad6e8d43858..78b6a136be4 100644 --- a/tensorflow/lite/kernels/read_variable.cc +++ b/tensorflow/lite/kernels/read_variable.cc @@ -58,7 +58,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* variable_tensor = variable->GetTensor(); TfLiteTensor* output = GetOutput(context, node, kOutputValue); - TF_LITE_ENSURE_EQ(context, variable_tensor->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, variable_tensor->type, output->type); TF_LITE_ENSURE_OK( context, context->ResizeTensor( context, output, TfLiteIntArrayCopy(variable_tensor->dims))); diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc index af42b2a369c..6107b01cd46 100644 --- a/tensorflow/lite/kernels/reduce.cc +++ b/tensorflow/lite/kernels/reduce.cc @@ -235,7 +235,7 @@ TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus PrepareAny(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); const TfLiteTensor* input = GetInput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteBool); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteBool); return PrepareSimple(context, node); } diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc index fff45ac13cc..13c54c4f906 100644 --- a/tensorflow/lite/kernels/resize_nearest_neighbor.cc +++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc @@ -68,7 +68,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // and the size being 1D tensor with exactly 2 elements. 
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1); - TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, size->type, kTfLiteInt32); TF_LITE_ENSURE_EQ(context, size->dims->data[0], 2); output->type = input->type; @@ -122,9 +122,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { GetTensorShape(size), GetTensorData(size), GetTensorShape(output), GetTensorData(output)); } else { - context->ReportError(context, - "Output type is %d, requires float, uint8 or int8.", - output->type); + TF_LITE_KERNEL_LOG(context, + "Output type is %s, requires float, uint8 or int8.", + TfLiteTypeGetName(output->type)); return kTfLiteError; } diff --git a/tensorflow/lite/kernels/reverse.cc b/tensorflow/lite/kernels/reverse.cc index 9ce845b4b7b..34cc92da5d8 100644 --- a/tensorflow/lite/kernels/reverse.cc +++ b/tensorflow/lite/kernels/reverse.cc @@ -61,7 +61,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims); - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); return context->ResizeTensor(context, output, output_shape); } diff --git a/tensorflow/lite/kernels/reverse_sequence.cc b/tensorflow/lite/kernels/reverse_sequence.cc index 7390876d39b..b36b1f803ca 100644 --- a/tensorflow/lite/kernels/reverse_sequence.cc +++ b/tensorflow/lite/kernels/reverse_sequence.cc @@ -58,7 +58,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims); - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); return context->ResizeTensor(context, output, output_shape); } diff --git a/tensorflow/lite/kernels/round.cc b/tensorflow/lite/kernels/round.cc index 341d2880705..72c793c1152 100644 --- a/tensorflow/lite/kernels/round.cc +++ b/tensorflow/lite/kernels/round.cc @@ -34,7 +34,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); output->type = input->type; TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims); return context->ResizeTensor(context, output, output_size); diff --git a/tensorflow/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc index 281425253c5..62c8ddbaa97 100644 --- a/tensorflow/lite/kernels/select.cc +++ b/tensorflow/lite/kernels/select.cc @@ -66,8 +66,8 @@ TfLiteStatus SelectPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); // Input must be bool. 
- TF_LITE_ENSURE(context, input_condition->type == kTfLiteBool); - TF_LITE_ENSURE_EQ(context, input_x->type, input_y->type); + TF_LITE_ENSURE_TYPES_EQ(context, input_condition->type, kTfLiteBool); + TF_LITE_ENSURE_TYPES_EQ(context, input_x->type, input_y->type); output->type = input_x->type; bool same_shape = HaveSameShapes(input_condition, input_x) && diff --git a/tensorflow/lite/kernels/skip_gram.cc b/tensorflow/lite/kernels/skip_gram.cc index 8348a25bba7..f81d152bb70 100644 --- a/tensorflow/lite/kernels/skip_gram.cc +++ b/tensorflow/lite/kernels/skip_gram.cc @@ -48,8 +48,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, GetInput(context, node, 0)->type, kTfLiteString); - TF_LITE_ENSURE_EQ(context, GetOutput(context, node, 0)->type, kTfLiteString); + TF_LITE_ENSURE_TYPES_EQ(context, GetInput(context, node, 0)->type, + kTfLiteString); + TF_LITE_ENSURE_TYPES_EQ(context, GetOutput(context, node, 0)->type, + kTfLiteString); return kTfLiteOk; } diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc index 7fc58e7ee6b..0d537e2d189 100644 --- a/tensorflow/lite/kernels/space_to_batch_nd.cc +++ b/tensorflow/lite/kernels/space_to_batch_nd.cc @@ -100,7 +100,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { NumDimensions(op_context.input) >= kInputMinDimensionNum); TF_LITE_ENSURE(context, NumDimensions(op_context.input) <= kInputMaxDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.input->type, + op_context.output->type); if (!IsConstantTensor(op_context.block_shape) || !IsConstantTensor(op_context.paddings)) { diff --git a/tensorflow/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc index e4c7efaaf99..ac001d903a4 100644 --- a/tensorflow/lite/kernels/space_to_depth.cc +++ b/tensorflow/lite/kernels/space_to_depth.cc @@ -55,7 +55,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 || data_type == kTfLiteInt8 || data_type == kTfLiteInt32 || data_type == kTfLiteInt64); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); const int block_size = params->block_size; const int input_height = input->dims->data[1]; diff --git a/tensorflow/lite/kernels/sparse_to_dense.cc b/tensorflow/lite/kernels/sparse_to_dense.cc index bdf0f4e703a..4aea0f491bc 100644 --- a/tensorflow/lite/kernels/sparse_to_dense.cc +++ b/tensorflow/lite/kernels/sparse_to_dense.cc @@ -172,7 +172,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { values->type == kTfLiteInt8 || values->type == kTfLiteUInt8 || values->type == kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, values->type, default_value->type); + TF_LITE_ENSURE_TYPES_EQ(context, values->type, default_value->type); // Ensure dimensions match. 
TF_LITE_ENSURE_OK( @@ -229,10 +229,10 @@ TfLiteStatus EvalForIndexType(TfLiteContext* context, TfLiteNode* node, return SparseToDenseImpl(context, node); } default: - context->ReportError( + TF_LITE_KERNEL_LOG( context, - "Indice type %d is currently not supported by sparse to dense.", - indices->type); + "Indice type %s is currently not supported by sparse to dense.", + TfLiteTypeGetName(indices->type)); return kTfLiteError; } } @@ -253,10 +253,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteUInt8: return EvalForIndexType(context, node, indices); default: - context->ReportError( + TF_LITE_KERNEL_LOG( context, - "Value type %d is currently not supported by sparse to dense.", - values->type); + "Value type %s is currently not supported by sparse to dense.", + TfLiteTypeGetName(values->type)); return kTfLiteError; } } diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc index e17ff8e3191..ff09995845e 100644 --- a/tensorflow/lite/kernels/squared_difference.cc +++ b/tensorflow/lite/kernels/squared_difference.cc @@ -64,7 +64,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); output->type = input2->type; data->requires_broadcast = !HaveSameShapes(input1, input2); diff --git a/tensorflow/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc index 50c2255e526..83221cd4a3d 100644 --- a/tensorflow/lite/kernels/strided_slice.cc +++ b/tensorflow/lite/kernels/strided_slice.cc @@ -145,9 +145,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); // Only INT32 begin/end/strides are supported // TODO(soroosh) add support for INT64 - TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.begin->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.end->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.strides->type, kTfLiteInt32); TF_LITE_ENSURE_MSG(context, op_context.dims <= 5, "StridedSlice op only supports 1D-5D input arrays."); @@ -223,10 +223,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; default: - context->ReportError(context, - "Type %d is currently not supported " - "by StridedSlice.", - op_context.input->type); + TF_LITE_KERNEL_LOG(context, + "Type %s is currently not supported " + "by StridedSlice.", + TfLiteTypeGetName(op_context.input->type)); return kTfLiteError; } #undef TF_LITE_STRIDED_SLICE diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc index aa628fa5408..83b2714135d 100644 --- a/tensorflow/lite/kernels/sub.cc +++ b/tensorflow/lite/kernels/sub.cc @@ -206,7 +206,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); output->type = input2->type; data->requires_broadcast = 
!HaveSameShapes(input1, input2); @@ -287,8 +287,8 @@ void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params, input2, requires_broadcast, output); break; default: - TF_LITE_KERNEL_LOG(context, "output type %d is not supported.", - output->type); + TF_LITE_KERNEL_LOG(context, "output type %s is not supported.", + TfLiteTypeGetName(output->type)); } } diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc index 884456fcbf2..08d246203ae 100644 --- a/tensorflow/lite/kernels/tile.cc +++ b/tensorflow/lite/kernels/tile.cc @@ -211,7 +211,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers); // Only int32 and int64 multipliers type is supported. diff --git a/tensorflow/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc index 6a5bd392086..3fb241356e1 100644 --- a/tensorflow/lite/kernels/topk_v2.cc +++ b/tensorflow/lite/kernels/topk_v2.cc @@ -37,7 +37,7 @@ namespace { TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* top_k = GetInput(context, node, kInputTopK); // INT32 number of top results is supported. - TF_LITE_ENSURE_EQ(context, top_k->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, top_k->type, kTfLiteInt32); // Check that the tensor contains only one value. TF_LITE_ENSURE_EQ(context, NumElements(top_k), 1); const int32 k = *GetTensorData(top_k); @@ -197,10 +197,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output_values = GetOutput(context, node, kOutputValues); - TF_LITE_ENSURE_EQ(context, input->type, output_values->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output_values->type); const TfLiteTensor* top_k = GetInput(context, node, kInputTopK); - TF_LITE_ENSURE_EQ(context, top_k->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, top_k->type, kTfLiteInt32); // Set output dynamic if the input is not const. if (IsConstantTensor(top_k)) { @@ -252,9 +252,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { output_values->data.i64); break; default: - context->ReportError(context, - "Type %d is currently not supported by TopK.", - output_values->type); + TF_LITE_KERNEL_LOG(context, "Type %s is currently not supported by TopK.", + TfLiteTypeGetName(output_values->type)); return kTfLiteError; } diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc index 27f5cf6f065..3a6d1b1f1ed 100644 --- a/tensorflow/lite/kernels/transpose.cc +++ b/tensorflow/lite/kernels/transpose.cc @@ -77,7 +77,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Ensure validity of input tensor. 
TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 5, "Transpose op only supports 1D-5D input arrays."); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + TF_LITE_ENSURE_TYPES_EQ(context, op_context.input->type, + op_context.output->type); if (!IsConstantTensor(op_context.perm)) { SetTensorToDynamic(op_context.output); @@ -144,9 +145,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; default: - context->ReportError(context, - "Type %d is currently not supported by Transpose.", - op_context.input->type); + TF_LITE_KERNEL_LOG(context, + "Type %s is currently not supported by Transpose.", + TfLiteTypeGetName(op_context.input->type)); return kTfLiteError; } #undef TF_LITE_TRANSPOSE diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc index 33e122ba037..07dc4bbac53 100644 --- a/tensorflow/lite/kernels/transpose_conv.cc +++ b/tensorflow/lite/kernels/transpose_conv.cc @@ -111,8 +111,8 @@ TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor_to_resize) { // Currently only support int32 for output shape. if (shape_tensor->type != kTfLiteInt32) { - context->ReportError(context, "Output shape is %d, not int32.", - shape_tensor->type); + TF_LITE_KERNEL_LOG(context, "Output shape is %s, not int32.", + TfLiteTypeGetName(shape_tensor->type)); return kTfLiteError; } @@ -176,8 +176,8 @@ TfLiteStatus ResizeCol2ImTensor(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* col2im) { if (output_shape->type != kTfLiteInt32) { - context->ReportError(context, "col2im shape is %d, not int32.", - output_shape->type); + TF_LITE_KERNEL_LOG(context, "col2im shape is %s, not int32.", + TfLiteTypeGetName(output_shape->type)); return kTfLiteError; } TF_LITE_ENSURE_EQ(context, NumElements(output_shape), 4); @@ -274,7 +274,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { bias = GetOptionalInputTensor(context, node, kBiasTensor); if (bias) { if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32); if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } @@ -282,7 +282,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, bias->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input->type); } TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0)); @@ -294,9 +294,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, weights->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, weights->type, input->type); } - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); // Ensure that weights and inputs have the same channel dimension. // Note: TOCO will reorder weights in the following format: OHWI. 
TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3), diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc index b8b9396f436..95864196f18 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc @@ -223,8 +223,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_layer_norm_coefficients->type, + kTfLiteFloat32); } const TfLiteTensor* forget_layer_norm_coefficients = @@ -233,8 +233,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, forget_layer_norm_coefficients->type, + kTfLiteFloat32); const TfLiteTensor* cell_layer_norm_coefficients = GetInput(context, node, lstm::full::kCellLayerNormCoefficientsTensor); @@ -242,8 +242,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, cell_layer_norm_coefficients->type, + kTfLiteFloat32); const TfLiteTensor* output_layer_norm_coefficients = GetInput(context, node, lstm::full::kOutputLayerNormCoefficientsTensor); @@ -251,8 +251,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->size, 1); TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0], n_cell); - TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->type, - kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output_layer_norm_coefficients->type, + kTfLiteFloat32); } return kTfLiteOk; @@ -290,7 +290,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Inferring batch size, number of outputs and sequence length and // number of cells from the input tensors. 
const TfLiteTensor* input = GetInput(context, node, lstm::full::kInputTensor); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE(context, input->dims->size > 1); const auto* params = reinterpret_cast( @@ -659,8 +659,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { CpuBackendContext::GetFromContext(context)); } default: - context->ReportError(context, "Type %d is not currently supported.", - input_to_output_weights->type); + TF_LITE_KERNEL_LOG(context, "Type %s is not currently supported.", + TfLiteTypeGetName(input_to_output_weights->type)); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc index 7ed67c1614d..350ca293cbf 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc @@ -85,8 +85,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { bias->dims->data[0]); TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[1], bias->dims->data[0]); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input_weights->type, + recurrent_weights->type); TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2); TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[0], batch_size); TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[1], num_units); @@ -364,8 +365,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { accum_scratch, row_sums, &op_data->compute_row_sums); } default: - context->ReportError(context, "Type %d not currently supported.", - input_weights->type); + TF_LITE_KERNEL_LOG(context, "Type %d not currently supported.", + TfLiteTypeGetName(input_weights->type)); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc index 8d307acb268..a41556ed079 100644 --- a/tensorflow/lite/kernels/unpack.cc +++ b/tensorflow/lite/kernels/unpack.cc @@ -68,7 +68,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < data->num; ++i) { TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape); TfLiteTensor* output = GetOutput(context, node, i); - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); // Guarantee input/output quantization params match as we do not support // rescaling of unpacked quantized tensors. TF_LITE_ENSURE_EQ(context, input->params.zero_point, diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc index 99d6d2cc1c8..b50cdff9974 100644 --- a/tensorflow/lite/kernels/while.cc +++ b/tensorflow/lite/kernels/while.cc @@ -90,7 +90,7 @@ TfLiteStatus CopyTensorsData(TfLiteContext* context, Subgraph* src_subgraph, TfLiteStatus CheckCondOutput(TfLiteContext* context, const TfLiteTensor* cond_output) { // The condition output must be a single boolean value. - TF_LITE_ENSURE_EQ(context, cond_output->type, kTfLiteBool); + TF_LITE_ENSURE_TYPES_EQ(context, cond_output->type, kTfLiteBool); if (cond_output->dims->size == 0) { // It's okay if it's a 0D scalar. 
return kTfLiteOk; @@ -179,7 +179,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { body_subgraph->tensor(body_subgraph->inputs()[i]); TfLiteTensor* body_output = body_subgraph->tensor(body_subgraph->outputs()[i]); - TF_LITE_ENSURE_EQ(context, body_input->type, body_output->type); + TF_LITE_ENSURE_TYPES_EQ(context, body_input->type, body_output->type); // TODO(ycling): Support dynamic sized body subgraph. TF_LITE_ENSURE(context, !IsDynamicTensor(body_output)); diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index fe077c99fac..8247875a7ac 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -111,7 +111,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE(context, data != nullptr); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); diff --git a/tensorflow/lite/micro/kernels/ceil.cc b/tensorflow/lite/micro/kernels/ceil.cc index 89831a767fe..ace038aaac5 100644 --- a/tensorflow/lite/micro/kernels/ceil.cc +++ b/tensorflow/lite/micro/kernels/ceil.cc @@ -32,8 +32,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes); TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size); for (int i = 0; i < output->dims->size; ++i) { diff --git a/tensorflow/lite/micro/kernels/circular_buffer.cc b/tensorflow/lite/micro/kernels/circular_buffer.cc index 590bdbe00f8..f588d64dcd5 100644 --- a/tensorflow/lite/micro/kernels/circular_buffer.cc +++ b/tensorflow/lite/micro/kernels/circular_buffer.cc @@ -89,10 +89,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, 1, input->dims->data[2]); TF_LITE_ENSURE_EQ(context, output->dims->data[3], input->dims->data[3]); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); // The circular buffer custom operator currently only supports int8. - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8); // TODO(b/132070898): Use statically slotted OpData structures until a // scratch memory API is ready. 
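The hunks above all make one or both of the same two substitutions: TF_LITE_ENSURE_EQ becomes TF_LITE_ENSURE_TYPES_EQ when the two operands are TfLiteType values, and context->ReportError with a "%d" format becomes TF_LITE_KERNEL_LOG with "%s" plus TfLiteTypeGetName, so that failures print a readable type name instead of a raw enum integer. The following is a minimal sketch of the resulting pattern in a self-contained kernel; it is not part of the patch, and the op name example_op, its single input/output layout, and the float32-only Eval branch are illustrative assumptions.

  #include "tensorflow/lite/c/common.h"
  #include "tensorflow/lite/kernels/kernel_util.h"

  namespace tflite {
  namespace ops {
  namespace custom {
  namespace example_op {  // Hypothetical op, for illustration only.

  TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
    const TfLiteTensor* input = GetInput(context, node, /*index=*/0);
    TfLiteTensor* output = GetOutput(context, node, /*index=*/0);
    // Compare TfLiteType values with the dedicated macro so a mismatch is
    // reported with type names rather than raw enum integers.
    TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
    return context->ResizeTensor(context, output,
                                 TfLiteIntArrayCopy(input->dims));
  }

  TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
    const TfLiteTensor* input = GetInput(context, node, /*index=*/0);
    switch (input->type) {
      case kTfLiteFloat32:
        // ... type-specific kernel would go here ...
        return kTfLiteOk;
      default:
        // Log the human-readable type name rather than the enum value.
        TF_LITE_KERNEL_LOG(context, "Type %s is currently not supported.",
                           TfLiteTypeGetName(input->type));
        return kTfLiteError;
    }
  }

  }  // namespace example_op
  }  // namespace custom
  }  // namespace ops
  }  // namespace tflite

On a type mismatch, TF_LITE_ENSURE_TYPES_EQ logs both sides through TfLiteTypeGetName, which is also why the NOTE added to the TF_LITE_ENSURE_EQ documentation in c_api_types.h below directs TfLiteType comparisons to the dedicated macro.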
diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc index 6a9eb882fdf..d8827b36d06 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc @@ -81,7 +81,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); #if defined(__ARM_FEATURE_DSP) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc b/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc index 814daa526d2..a0e7af6d2d5 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/mul.cc @@ -48,7 +48,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( diff --git a/tensorflow/lite/micro/kernels/elementwise.cc b/tensorflow/lite/micro/kernels/elementwise.cc index b69d260a826..aa97907df24 100644 --- a/tensorflow/lite/micro/kernels/elementwise.cc +++ b/tensorflow/lite/micro/kernels/elementwise.cc @@ -40,7 +40,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (!IsSupportedType(input->type)) { TF_LITE_KERNEL_LOG(context, "Input data type %s (%d) is not supported.", TfLiteTypeGetName(input->type), input->type); @@ -54,7 +54,7 @@ inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, T func(T), TfLiteType expected_type) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, expected_type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type); const int64_t num_elements = NumElements(input); const T* in_data = GetTensorData(input); T* out_data = GetTensorData(output); diff --git a/tensorflow/lite/micro/kernels/floor.cc b/tensorflow/lite/micro/kernels/floor.cc index 435934fe39f..d8134e96cd6 100644 --- a/tensorflow/lite/micro/kernels/floor.cc +++ b/tensorflow/lite/micro/kernels/floor.cc @@ -29,7 +29,7 @@ constexpr int kOutputTensor = 0; TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); reference_ops::Floor(GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc 
b/tensorflow/lite/micro/kernels/fully_connected.cc index bd949e6f552..8478b13d90e 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -93,7 +93,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); diff --git a/tensorflow/lite/micro/kernels/l2norm.cc b/tensorflow/lite/micro/kernels/l2norm.cc index 4dd71fe1c4b..050f9d1e184 100644 --- a/tensorflow/lite/micro/kernels/l2norm.cc +++ b/tensorflow/lite/micro/kernels/l2norm.cc @@ -48,7 +48,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8 || output->type == kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.)); @@ -118,8 +118,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { depth, GetTensorData(input), GetTensorData(output)); } else { - TF_LITE_KERNEL_LOG(context, "Output type is %d, requires float.", - output->type); + TF_LITE_KERNEL_LOG(context, "Output type is %s, requires float.", + TfLiteTypeGetName(output->type)); return kTfLiteError; } diff --git a/tensorflow/lite/micro/kernels/logistic.cc b/tensorflow/lite/micro/kernels/logistic.cc index cc360c58bd9..cb1140e0839 100644 --- a/tensorflow/lite/micro/kernels/logistic.cc +++ b/tensorflow/lite/micro/kernels/logistic.cc @@ -44,7 +44,7 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, std::numeric_limits::min()); diff --git a/tensorflow/lite/micro/kernels/mul.cc b/tensorflow/lite/micro/kernels/mul.cc index fb47728a1a4..82b01b11baf 100644 --- a/tensorflow/lite/micro/kernels/mul.cc +++ b/tensorflow/lite/micro/kernels/mul.cc @@ -48,7 +48,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( diff --git a/tensorflow/lite/micro/kernels/reshape.cc b/tensorflow/lite/micro/kernels/reshape.cc index 407682a6ff4..36601b1a43d 100644 --- a/tensorflow/lite/micro/kernels/reshape.cc +++ b/tensorflow/lite/micro/kernels/reshape.cc @@ -61,7 +61,7 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) { num_output_elements *= output_shape->data[stretch_dim]; } - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); 
TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements); return kTfLiteOk; } diff --git a/tensorflow/lite/micro/kernels/round.cc b/tensorflow/lite/micro/kernels/round.cc index b88c9fe0581..dc93817729b 100644 --- a/tensorflow/lite/micro/kernels/round.cc +++ b/tensorflow/lite/micro/kernels/round.cc @@ -32,8 +32,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes); TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size); for (int i = 0; i < output->dims->size; ++i) { diff --git a/tensorflow/lite/micro/kernels/svdf.cc b/tensorflow/lite/micro/kernels/svdf.cc index ba7cb05da57..717301e2261 100644 --- a/tensorflow/lite/micro/kernels/svdf.cc +++ b/tensorflow/lite/micro/kernels/svdf.cc @@ -419,7 +419,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); } - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); const auto* input_params = reinterpret_cast(input->quantization.params); @@ -467,7 +467,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (bias != nullptr) { TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32); } - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); diff --git a/tensorflow/lite/micro/kernels/tanh.cc b/tensorflow/lite/micro/kernels/tanh.cc index 9ee5b74bde4..d978c7a1308 100644 --- a/tensorflow/lite/micro/kernels/tanh.cc +++ b/tensorflow/lite/micro/kernels/tanh.cc @@ -44,7 +44,7 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/floor.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/floor.cc index 7ea2c2c906e..e1507b7ab27 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifi/floor.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifi/floor.cc @@ -51,7 +51,7 @@ constexpr int kOutputTensor = 0; TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); int err; const float* inp_data_ptr; diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc index 92e5a476197..26a5a4e87d3 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc @@ -432,7 +432,7 @@ TfLiteStatus Prepare(TfLiteContext* 
context, TfLiteNode* node) { // EvalIntegerSVDF(). // Validate output tensor: - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); } else { TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); @@ -457,7 +457,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // TODO(b/132070898): Use input tensor as variable until scratch tensor // allocation has been implemented. // TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1); - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); } return kTfLiteOk; diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_legacy/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_legacy/svdf.cc index 02bb72976dd..760e1290e73 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini_legacy/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini_legacy/svdf.cc @@ -348,7 +348,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16); // Validate output tensor: - TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); // Calculate effective scales. auto* input_params = diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index ab769fec249..15823784d12 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -205,6 +205,7 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); // the current function, while also reporting the location of the error. // `a` and `b` may be evaluated more than once, so no side effects or // extremely expensive computations should be done. +// NOTE: Use TF_LITE_ENSURE_TYPES_EQ if comparing TfLiteTypes. #define TF_LITE_ENSURE_EQ(context, a, b) \ do { \ if ((a) != (b)) { \ From 2bba7c464ecd2a1baa234cd72bac5f6ddc436ff5 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Tue, 16 Jun 2020 11:31:59 -0700 Subject: [PATCH 0310/1390] [MLIR] Add more LHLO ops. 
PiperOrigin-RevId: 316722878 Change-Id: I996646ef895cdb5ce2f68c6cb128ac55c443adb1 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 10 +- .../compiler/mlir/xla/ir/hlo_ops_base.td | 9 ++ tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 57 +++++++++ .../compiler/mlir/xla/tests/lhlo_ops.mlir | 121 ++++++++++++++++++ 4 files changed, 188 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index f4947f2aadb..b1745c73fbf 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -1112,21 +1112,13 @@ def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", [NoSideEffect]> { let hasCustomHLOConverter = 1; } -def ScatterDimensionNumbers : StructAttr<"ScatterDimensionNumbers", HLO_Dialect, - [StructFieldAttr<"update_window_dims", I64ElementsAttr>, - StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, - StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, - StructFieldAttr<"index_vector_dim", I64Attr>]> { - let description = "Structure of dimension information for scatter"; -} - def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, BASE_HLO_ScatterOp { let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$scatter_indices, HLO_Tensor:$updates, - ScatterDimensionNumbers:$scatter_dimension_numbers, + ScatterDimensionNumbers:$scatter_dimension_numbers, DefaultValuedAttr:$indices_are_sorted, DefaultValuedAttr:$unique_indices ); diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index 64d374692a8..b0975d9ab03 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -1098,6 +1098,15 @@ class BASE_HLO_ReshapeOp { }]; } +class ScatterDimensionNumbers : StructAttr< + "ScatterDimensionNumbers", dialect, [ + StructFieldAttr<"update_window_dims", I64ElementsAttr>, + StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, + StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, + StructFieldAttr<"index_vector_dim", I64Attr>]> { + let description = "Structure of dimension information for scatter"; +} + class BASE_HLO_ScatterOp { string summary = "Scatter operator"; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index b3ba4afa97a..aed7c83570e 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -471,6 +471,12 @@ def LHLO_BatchNormTrainingOp : LHLO_Op<"batch_norm_training", []>, ); } +// TODO(timshen): add a custom verifier. +def LHLO_BitcastOp: LHLO_Op<"bitcast", []> { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_BroadcastOp : LHLO_Op<"broadcast", []>, BASE_HLO_BroadcastOp { let arguments = (ins @@ -578,6 +584,19 @@ def LHLO_ReshapeOp: LHLO_Op<"reshape", []>, BASE_HLO_ReshapeOp { ); } +def LHLO_ScatterOp: LHLO_Op<"scatter", []>, BASE_HLO_ScatterOp { + let arguments = (ins + Arg:$operand, + Arg:$scatter_indices, + Arg:$updates, + Arg:$output, + ScatterDimensionNumbers:$scatter_dimension_numbers, + DefaultValuedAttr:$indices_are_sorted, + DefaultValuedAttr:$unique_indices + ); + + let regions = (region SizedRegion<1>:$update_computation); +} def LHLO_SelectOp: LHLO_Op<"select", []>, BASE_HLO_SelectOp { let arguments = (ins @@ -712,6 +731,44 @@ def LHLO_TriangularSolveOp: LHLO_Op<"triangular_solve", [SameOperandsElementType ); } +// TODO(timshen): add a custom verifier. 
+def LHLO_MapOp: LHLO_Op<"map", [SameOperandsShape]>, BASE_HLO_MapOp { + let arguments = (ins + Arg, "", [MemRead]>:$operands, + Arg:$output, + I64ElementsAttr:$dimensions + ); + let regions = (region SizedRegion<1>:$computation); +} + +def LHLO_RngGetAndUpdateStateOp: LHLO_Op<"rng_get_and_update_state", []> { + let arguments = (ins + Arg, "", [MemRead, MemWrite]>:$state, + I64Attr:$delta + ); +} + +// TODO(timshen): add a custom verifier. +def LHLO_SortOp: LHLO_Op<"sort", []>, BASE_HLO_SortOp { + let arguments = (ins + Arg, "", [MemRead]>:$operands, + LHLO_BufferOrTuple:$output, + DefaultValuedAttr:$dimension, + DefaultValuedAttr:$is_stable + ); + + let regions = (region SizedRegion<1>:$comparator); +} + +def LHLO_TupleSelectOp: LHLO_Op<"tuple_select", [SameOperandsShape]> { + let arguments = (ins + Arg:$pred, + Arg:$on_true, + Arg:$on_false, + Arg:$output + ); +} + //===----------------------------------------------------------------------===// // Late operations //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 6747b1e536e..0ed8b36466e 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -863,3 +863,124 @@ func @while_memrefs(%arg0: memref, %arg_out: memref) -> () { ) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @bitcast_memrefs +func @bitcast_memrefs(%arg0: memref<1xf64>, %arg_out: memref<2xi32>) -> () { + "xla_lhlo.bitcast"(%arg0, %arg_out) : (memref<1xf64>, memref<2xi32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @scatter_memrefs +func @scatter_memrefs(%input: memref<200x100x300xf32>, %indices: memref<10x2xi32>, + %updates: memref<10x300xf32>, %arg_out: memref<200x100x300xf32>) -> () { + "xla_lhlo.scatter" (%input, %indices, %updates, %arg_out) ({ + ^bb0(%lhs: tensor, %rhs: tensor): // no predecessors + %add = xla_hlo.add %lhs, %rhs : tensor + "xla_hlo.return"(%add) : (tensor) -> () + }) { + scatter_dimension_numbers = { + update_window_dims = dense<[1]> : tensor<1xi64>, + inserted_window_dims = dense<[0, 1]> : tensor<2xi64>, + scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64>, + index_vector_dim = 1 : i64 + }, + indices_are_sorted = true, + unique_indices = true + } : (memref<200x100x300xf32>, memref<10x2xi32>, memref<10x300xf32>, memref<200x100x300xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @map_memrefs +func @map_memrefs(%arg0: memref<20xf32>, %arg1: memref<20xf32>, %arg_out: memref<20xf32>) -> () { + "xla_lhlo.map"(%arg0, %arg1, %arg_out) ({ + ^bb0(%a: tensor, %b: tensor): + %c = xla_hlo.add %a, %b : tensor + "xla_hlo.return"(%c) : (tensor) -> () + }) {dimensions = dense<0> : tensor<1xi64>} : (memref<20xf32>, memref<20xf32>, memref<20xf32>) -> () + return +} + +// ----- + +func @map_memrefs(%arg0: memref<20xf32>, %arg1: memref<20xf32>, %arg_out: memref<10xf32>) -> () { + // expected-error@+1{{requires the same shape for all operands}} + "xla_lhlo.map"(%arg0, %arg1, %arg_out) ({ + ^bb0(%a: tensor, %b: tensor): + %c = xla_hlo.add %a, %b : tensor + "xla_hlo.return"(%c) : (tensor) -> () + }) {dimensions = dense<0> : tensor<1xi64>} : (memref<20xf32>, memref<20xf32>, memref<10xf32>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @rng_get_and_update_state_memrefs +func @rng_get_and_update_state_memrefs(%state: memref<1xui64>) -> () { + "xla_lhlo.rng_get_and_update_state"(%state) { delta = 1 : i64 } : 
(memref<1xui64>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @sort_memrefs +func @sort_memrefs(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf16>, + %arg_out: tuple, memref<16x16xf16>>) -> () { + "xla_lhlo.sort"(%arg0, %arg1, %arg_out) ( { + ^bb0(%a: tensor, %b: tensor, %c: tensor, %d: tensor): + %7 = "xla_hlo.compare"(%a, %b) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%7) : (tensor) -> () + }) {dimension = 1 : i64, is_stable = true} : (memref<16x16xf32>, memref<16x16xf16>, tuple, memref<16x16xf16>>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @sort_memrefs +func @sort_memrefs(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf16>, + %arg_out: tuple, memref<16x16xf16>>) -> () { + "xla_lhlo.sort"(%arg0, %arg1, %arg_out) ( { + ^bb0(%a: tensor, %b: tensor, %c: tensor, %d: tensor): + %7 = "xla_hlo.compare"(%a, %b) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%7) : (tensor) -> () + }) {dimension = 1 : i64} : (memref<16x16xf32>, memref<16x16xf16>, tuple, memref<16x16xf16>>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @sort_memrefs +func @sort_memrefs(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf16>, + %arg_out: tuple, memref<16x16xf16>>) -> () { + "xla_lhlo.sort"(%arg0, %arg1, %arg_out) ( { + ^bb0(%a: tensor, %b: tensor, %c: tensor, %d: tensor): + %7 = "xla_hlo.compare"(%a, %b) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%7) : (tensor) -> () + }) : (memref<16x16xf32>, memref<16x16xf16>, tuple, memref<16x16xf16>>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @tuple_select_memrefs +func @tuple_select_memrefs(%pred: memref<20xi1>, %true_values: memref<20xf32>, + %false_values: memref<20xf32>, %arg_out: memref<20xf32>) -> () { + "xla_lhlo.tuple_select"(%pred, %true_values, %false_values, %arg_out) + : (memref<20xi1>, memref<20xf32>, memref<20xf32>, memref<20xf32>) -> () + return +} + +// ----- + +func @tuple_select_memrefs(%pred: memref<10xi1>, %true_values: memref<20xf32>, + %false_values: memref<20xf32>, %arg_out: memref<20xf32>) -> () { + // expected-error@+1{{requires the same shape for all operands}} + "xla_lhlo.tuple_select"(%pred, %true_values, %false_values, %arg_out) + : (memref<10xi1>, memref<20xf32>, memref<20xf32>, memref<20xf32>) -> () + return +} From dd3e910e21e3c22e2f4c8172d48386642527bbc4 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 11:57:20 -0700 Subject: [PATCH 0311/1390] Changed address space of default samplers. 
PiperOrigin-RevId: 316728298 Change-Id: Ib74d1bbbb5561a29aa569e2960a3776eceb9f237 --- tensorflow/lite/delegates/gpu/cl/kernels/util.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc index 73fc0e744bb..3161a73a18f 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc @@ -119,13 +119,13 @@ std::string GetCommonDefines(CalculationsPrecision precision) { } result += - "const sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | " + "__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | " "CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"; result += - "const sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | " + "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | " "CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"; result += - "const sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | " + "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | " "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"; return result; From 266387ecb45ca06a2bdf117a119cb65b9ab5dcd9 Mon Sep 17 00:00:00 2001 From: nammbash Date: Tue, 16 Jun 2020 12:02:14 -0700 Subject: [PATCH 0312/1390] create a single oneDNN string --- tensorflow/core/platform/cpu_feature_guard.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index a020d3fd70e..13116de49dc 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -138,16 +138,9 @@ void InfoAboutUnusedCPUFeatures() { CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions); #endif // __FMA__ #endif // else of if defined(_MSC_VER) && !defined(__clang__) - - string intel_library_official_name( - "Intel(R) oneAPI Deep Neural Network Library (oneDNN) "); -#ifndef INTEL_MKL - intel_library_official_name = "oneAPI Deep Neural Network Library (oneDNN) "; -#endif - - if (!missing_instructions.empty()) { + if (!missing_instructions.empty()) { LOG(INFO) << "This TensorFlow binary is optimized with " - << intel_library_official_name + << "oneAPI Deep Neural Network Library (oneDNN)" << "to use the following CPU instructions in performance-" << "critical operations: " << missing_instructions << std::endl << "To enable them in other operations, rebuild TensorFlow " From aee694363c354881ee659cd6a70fafc5a3d30507 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Tue, 16 Jun 2020 12:22:48 -0700 Subject: [PATCH 0313/1390] Support TFRT async config. 
PiperOrigin-RevId: 316733436 Change-Id: I0eef2279b9c77d6084c1300a8d38a987a2cee065 --- tensorflow/c/eager/c_api.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index e71073ec79f..fdc91675f8b 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -713,8 +713,8 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { status->status = tfrt::ListOpHandlerChains( opts->session_options.options, &op_handler_chains, &device_attributes); if (!status->status.ok()) return nullptr; - return tensorflow::wrap( - new tfrt::ContextInterface(op_handler_chains, device_attributes)); + return tensorflow::wrap(new tfrt::ContextInterface( + op_handler_chains, device_attributes, opts->async)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); return nullptr; From 1a03ea7e61d54135047d50c942af346d50b5af8b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 12:31:05 -0700 Subject: [PATCH 0314/1390] PR #39767: Wider vector for FP16 RELU Grad on GPUs Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/39767 This PR uses wider vector (8 FP16 values) for loading and storing in FP16 ReluGrad kernel to improve performance on Nvidia Ampere GPUs. For older GPUs, the performance is expected to be unchanged. fyi @nluehr Copybara import of the project: -- ab809024a4a5b0887c360b3e5542c149f4a5f14d by Kaixi Hou : Enable wider vector for reluGrad... PiperOrigin-RevId: 316735014 Change-Id: Ic4d93a211e52844f9804ed6c2c4a0346052ceb1e --- tensorflow/core/kernels/relu_op_gpu.cu.cc | 87 ++--------------------- 1 file changed, 7 insertions(+), 80 deletions(-) diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 568bed57c70..27fd5f64249 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,7 +35,6 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -static constexpr int VectorSizeElements = 8; namespace functor { // This kernel computes ReluGrad by processing one half2, two fp16, at a time. @@ -94,65 +93,6 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient, } } -__global__ void ReluGradHalfKernelVector( - const Eigen::half* __restrict__ gradient, - const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop, - int32 count) { - int32 half8_count = count / VectorSizeElements; - int32 index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index < half8_count) { - // Cast to xx_h8 for vector load and store. - float4 gradient_h8 = reinterpret_cast(gradient)[index]; - float4 feature_h8 = reinterpret_cast(feature)[index]; - float4* p_backprop_h8 = reinterpret_cast(backprop) + index; - - half2* gradient_h2 = reinterpret_cast(&gradient_h8); - half2* feature_h2 = reinterpret_cast(&feature_h8); - float4 backprop_h8; - half2* p_backprop_h2 = reinterpret_cast(&backprop_h8); - - // Fast path, when half2 primitives are available. -#if __CUDA_ARCH__ >= 530 - const half2 kZeroH2 = __float2half2_rn(0.f); -#endif - for (int i = 0; i < VectorSizeElements / 2; i++) { -#if __CUDA_ARCH__ >= 530 - // mask = (feature > 0) - half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2); - // backprop = mask * gradient - half2 backprop_h2 = __hmul2(mask_h2, gradient_h2[i]); -#else - // Fall back: convert half2 to float2 for processing. 
- float2 feature_f2 = __half22float2(feature_h2[i]); - float2 gradient_f2 = __half22float2(gradient_h2[i]); - float2 backprop_f2 = make_float2((feature_f2.x > 0) ? gradient_f2.x : 0, - (feature_f2.y > 0) ? gradient_f2.y : 0); - // Convert back to half2. - half2 backprop_h2 = __float22half2_rn(backprop_f2); -#endif - p_backprop_h2[i] = backprop_h2; - } - // Write back the result. - *p_backprop_h8 = backprop_h8; - } - - int remaining_count = (count % VectorSizeElements); - - if (index < remaining_count) { - // Use first threads to process the remaining elements. - Eigen::half grad_h = gradient[half8_count * VectorSizeElements + index]; - Eigen::half feature_h = feature[half8_count * VectorSizeElements + index]; - - float grad_f = static_cast(grad_h); - float feature_f = static_cast(feature_h); - float backprop_f = (feature_f > 0) ? grad_f : 0; - - Eigen::half backprop_h(backprop_f); - backprop[half8_count * VectorSizeElements + index] = backprop_h; - } -} - template struct ReluGrad { // Computes ReluGrad backprop. @@ -168,28 +108,15 @@ struct ReluGrad { // NOTE: When the activation is exactly zero, we do not propagate the // associated gradient value. This allows the output of the Relu to be used, // as well as its input. - auto gradient_ptr = reinterpret_cast(gradient.data()); - auto feature_ptr = reinterpret_cast(feature.data()); - auto backprop_ptr = reinterpret_cast(backprop.data()); - bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 && - backprop_ptr % 16 == 0; int32 count = gradient.size(); - constexpr int32 kThreadInBlock = 512; if (count == 0) return; - if (aligned) { - int32 half8_count = Eigen::divup(count, VectorSizeElements); - int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); - TF_CHECK_OK(GpuLaunchKernel( - ReluGradHalfKernelVector, kBlock, kThreadInBlock, 0, d.stream(), - gradient.data(), feature.data(), backprop.data(), count)); - } else { - int32 half2_count = Eigen::divup(count, 2); - GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( - half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); - TF_CHECK_OK(GpuLaunchKernel( - ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, - d.stream(), gradient.data(), feature.data(), backprop.data(), count)); - } + int32 half2_count = Eigen::divup(count, 2); + constexpr int32 kThreadInBlock = 512; + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, + d.stream(), gradient.data(), feature.data(), backprop.data(), count)); } }; From d97234dc564b2db7543b7f69517e5beefb89dc0e Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 16 Jun 2020 12:34:27 -0700 Subject: [PATCH 0315/1390] More compatibility fixes for typing.Generic: * types.new_class is required in some distributions * avoid calling `isinstance` on some function objects in python 3.6 * account for some strange zombie pointer issue on windows Required for #40132. 
PiperOrigin-RevId: 316735720 Change-Id: I1b08ef5f18c77c9343d587562f50632336b684d5 --- tensorflow/python/framework/test_util.py | 2 +- tensorflow/python/util/tf_should_use.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 2967bb3de84..a46bb7c9bda 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -736,7 +736,7 @@ def assert_no_new_tensors(f): return isinstance(obj, (ops.Tensor, variables.Variable, tensor_shape.Dimension, tensor_shape.TensorShape)) - except ReferenceError: + except (ReferenceError, AttributeError): # If the object no longer exists, we don't care about it. return False diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 1671b078fa3..41c3220f5ca 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -21,15 +21,12 @@ import copy import sys import textwrap import traceback - -import six # pylint: disable=unused-import - +import types from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator -# pylint: enable=g-bad-import-order,g-import-not-at-top class _TFShouldUseHelper(object): @@ -154,7 +151,18 @@ def _get_wrapper(x, tf_should_use_helper): tx = copy.deepcopy(type_x) # Prefer using __orig_bases__, which preserve generic type arguments. bases = getattr(tx, '__orig_bases__', tx.__bases__) - copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) + + # Use types.new_class when available, which is preferred over plain type in + # some distributions. + if sys.version_info >= (3, 5): + def set_body(ns): + ns.update(tx.__dict__) + return ns + + copy_tx = types.new_class(tx.__name__, bases, exec_body=set_body) + else: + copy_tx = type(tx.__name__, bases, dict(tx.__dict__)) + copy_tx.__init__ = _new__init__ copy_tx.__getattribute__ = _new__getattribute__ copy_tx.mark_used = _new_mark_used From e0266dbf39deac09315b764524835299b513926c Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jun 2020 12:48:20 -0700 Subject: [PATCH 0316/1390] Use `static_cast` instead of C-style casts. PiperOrigin-RevId: 316738458 Change-Id: I54f2f2f43d31606246475df0eae8d20e673aee6b --- tensorflow/core/platform/types.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h index 5f26dabda55..b2fefcaa960 100644 --- a/tensorflow/core/platform/types.h +++ b/tensorflow/core/platform/types.h @@ -37,18 +37,18 @@ namespace tensorflow { // Alias tensorflow::string to std::string. 
using std::string; -static const uint8 kuint8max = ((uint8)0xFF); -static const uint16 kuint16max = ((uint16)0xFFFF); -static const uint32 kuint32max = ((uint32)0xFFFFFFFF); -static const uint64 kuint64max = ((uint64)0xFFFFFFFFFFFFFFFFull); -static const int8 kint8min = ((int8)~0x7F); -static const int8 kint8max = ((int8)0x7F); -static const int16 kint16min = ((int16)~0x7FFF); -static const int16 kint16max = ((int16)0x7FFF); -static const int32 kint32min = ((int32)~0x7FFFFFFF); -static const int32 kint32max = ((int32)0x7FFFFFFF); -static const int64 kint64min = ((int64)~0x7FFFFFFFFFFFFFFFll); -static const int64 kint64max = ((int64)0x7FFFFFFFFFFFFFFFll); +static const uint8 kuint8max = static_cast(0xFF); +static const uint16 kuint16max = static_cast(0xFFFF); +static const uint32 kuint32max = static_cast(0xFFFFFFFF); +static const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFull); +static const int8 kint8min = static_cast(~0x7F); +static const int8 kint8max = static_cast(0x7F); +static const int16 kint16min = static_cast(~0x7FFF); +static const int16 kint16max = static_cast(0x7FFF); +static const int32 kint32min = static_cast(~0x7FFFFFFF); +static const int32 kint32max = static_cast(0x7FFFFFFF); +static const int64 kint64min = static_cast(~0x7FFFFFFFFFFFFFFFll); +static const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFll); // A typedef for a uint64 used as a short fingerprint. typedef uint64 Fprint; From ed1d7d09aec54c8c277da957ed18d17ed6885711 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Tue, 16 Jun 2020 12:54:54 -0700 Subject: [PATCH 0317/1390] Implement Mul(Convert(Pred), operand) => select(pred, operand, 0) optimization. PiperOrigin-RevId: 316739811 Change-Id: Ica5e50c6639a9792ae1dd47eefd713021fb97533 --- .../xla/service/algebraic_simplifier.cc | 19 +++++++++++++++++++ .../xla/service/hlo_creation_utils.cc | 9 +++++++++ .../compiler/xla/service/hlo_creation_utils.h | 5 +++++ 3 files changed, 33 insertions(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 98e3229b062..ce2a801fccd 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -2455,6 +2455,25 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } + { + HloInstruction *convert_operand, *operand; + // Mul(Convert(Pred), operand) => select(pred, operand, 0) + if (Match(multiply, + m::MultiplyAnyOrder( + m::Op(&operand), + m::Convert( + m::Op(&convert_operand) + .WithShape(m::Shape().WithElementType(PRED)))))) { + HloInstruction* zero_like_multiply = + BroadcastZeros(computation_, multiply->shape().element_type(), + multiply->shape().dimensions()); + return ReplaceWithNewInstruction( + multiply, HloInstruction::CreateTernary( + multiply->shape(), HloOpcode::kSelect, convert_operand, + operand, zero_like_multiply)); + } + } + VLOG(10) << "trying transform [(A * C1) * C2 => A * (C1 * C2)]"; HloInstruction *a, *c1, *c2; if (Match(multiply, diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index dd174772c62..0f5267e9fbc 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -539,6 +539,15 @@ HloInstruction* BroadcastZeros(HloComputation* computation, /*result_shape_bounds=*/broadcast_dimensions); } +HloInstruction* BroadcastOnes(HloComputation* computation, + 
PrimitiveType element_type, + absl::Span broadcast_dimensions) { + HloInstruction* one = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::One(element_type))); + return MakeBroadcastHlo(one, /*broadcast_dimensions=*/{}, + /*result_shape_bounds=*/broadcast_dimensions); +} + // Recursively creates a dummy op given a shape. Leaf nodes are broadcasted zero // while internal nodes are tuples. HloInstruction* CreateDummyOp(HloComputation::Builder* b, const Shape& shape) { diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index 3f2e3aa25a1..2ba753d3cdb 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -276,6 +276,11 @@ HloInstruction* BroadcastZeros(HloComputation* computation, PrimitiveType element_type, absl::Span broadcast_dimensions); +// Same as above, but fill the tensor with ones. +HloInstruction* BroadcastOnes(HloComputation* computation, + PrimitiveType element_type, + absl::Span broadcast_dimensions); + // Creates a HLO computation that takes arguments of type `domain` and produces // a value of type `range`. StatusOr> CreateComputationWithSignature( From 5f06c799857622c0a16d47ff985a6b8f1b5559bf Mon Sep 17 00:00:00 2001 From: Sung Jin Hwang Date: Tue, 16 Jun 2020 12:59:30 -0700 Subject: [PATCH 0318/1390] Limit reserve size by 2**16 in dataset batch op when drop_remainder is false. Dataset batch is sometimes used to stack all the elements in the dataset. A common pattern is to pass a very large batch size with drop_remainder == false. The batch size should be larger than the dataset cardinality, but at the same time, it should not be too large otherwise vector reserve in the batch op ends up OOM. This change limits the reserve size by 2**16 when drop_remainder is false. Then the users may pass a large enough number like INT32_MAX or INT64_MAX to stack all elements. PiperOrigin-RevId: 316740746 Change-Id: I445ff66eff088363c802ec31c359e81f188d8047 --- tensorflow/core/kernels/data/batch_dataset_op.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc index c915f80c2c6..cfeb63a4242 100644 --- a/tensorflow/core/kernels/data/batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/batch_dataset_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/batch_dataset_op.h" +#include #include #include "tensorflow/core/framework/op_kernel.h" @@ -49,6 +50,12 @@ class BatchDatasetOp::Dataset : public DatasetBase { bool parallel_copy, const DatasetBase* input, int op_version) : DatasetBase(DatasetContext(ctx)), batch_size_(batch_size), + // Dataset batch is sometimes used to stack all elements in the + // dataset. In such cases, a very large batch size (e.g., INT32_MAX) + // is passed with drop_remainder set to false. Avoid OOM in such case + // by limiting `reserve()` size by 2**16. + reserve_size_(drop_remainder ? 
batch_size + : std::min(batch_size, 1 << 16)), drop_remainder_(drop_remainder), parallel_copy_(parallel_copy), input_(input), @@ -153,7 +160,7 @@ class BatchDatasetOp::Dataset : public DatasetBase { *end_of_sequence = true; return Status::OK(); } - batch_elements.reserve(dataset()->batch_size_); + batch_elements.reserve(dataset()->reserve_size_); *end_of_sequence = false; for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence; ++i) { std::vector batch_element_tuple; @@ -289,6 +296,7 @@ class BatchDatasetOp::Dataset : public DatasetBase { }; const int64 batch_size_; + const int64 reserve_size_; const bool drop_remainder_; const bool parallel_copy_; const DatasetBase* const input_; From 93a441910f93117238a6b5bc74a765d05081dfd3 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 16 Jun 2020 13:01:22 -0700 Subject: [PATCH 0319/1390] Add a link to GitHub README for libtensorflow nightly binaries. PiperOrigin-RevId: 316741133 Change-Id: Id38352fc0e52f3f1cbeef4256e4a380b8e0ed07e --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a76b1bfd0b7..54c9470b04b 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,6 @@ commands. *Nightly binaries are available for testing using the [tf-nightly](https://pypi.python.org/pypi/tf-nightly) and [tf-nightly-cpu](https://pypi.python.org/pypi/tf-nightly-cpu) packages on PyPi.* - #### *Try your first TensorFlow program* ```shell @@ -114,6 +113,12 @@ Build Type | Status **Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) **Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) **Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +**Libtensorflow MacOS CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) +**Libtensorflow Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) +**Libtensorflow Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) +**Libtensorflow Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.html) | 
[GCS](https://storage.googleapis.com/libtensorflow-nightly) +**Libtensorflow Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) + ### Community Supported Builds From 2a0ad4792645fde8563baa42f9ac476125e60fad Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Tue, 16 Jun 2020 13:07:46 -0700 Subject: [PATCH 0320/1390] Update tpu_embedding_v2.py to use the new API for prefetching data to host memory. PiperOrigin-RevId: 316742491 Change-Id: I6803c798256578a284d9ef190d79bf2e35f9ce6a --- tensorflow/python/tpu/tpu_embedding_v2.py | 54 +++++++- .../python/tpu/tpu_embedding_v2_test.py | 126 +++++++++++++++--- 2 files changed, 152 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/tpu/tpu_embedding_v2.py b/tensorflow/python/tpu/tpu_embedding_v2.py index 90b43c1ebf4..f7a383c440c 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2.py +++ b/tensorflow/python/tpu/tpu_embedding_v2.py @@ -31,6 +31,7 @@ from tensorflow.python.distribute import sharded_variable from tensorflow.python.distribute import tpu_strategy from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -139,6 +140,18 @@ class TPUEmbedding(tracking.AutoTrackable): optimizer=tf.tpu.experimental.embedding.SGD(0.1)) ``` + When creating a distributed dataset that is to be passed to the enqueue + operation a special input option must be specified: + + ```python + distributed_dataset = ( + strategy.experimental_distribute_datasets_from_function( + dataset_fn=..., + options=tf.distribute.InputOptions( + experimental_prefetch_to_device=False)) + dataset_iterator = iter(distributed_dataset) + ``` + To use this API on TPU you should use a custom training loop. Below is an example of a training and evaluation step: @@ -309,10 +322,6 @@ class TPUEmbedding(tracking.AutoTrackable): # We need to list of host devices for the load/retrieve operations. self._hosts = get_list_of_hosts(self._strategy) - # TODO(bfontain) Remove this once we have an official way of splitting - # prefetch between host and device. - self._strategy.extended._set_prefetch_on_host(True) # pylint: disable=protected-access - # We generally use the per core batch size, but will have the user pass # in a global batch size. self._batch_size = batch_size // self._strategy.num_replicas_in_sync @@ -507,7 +516,11 @@ class TPUEmbedding(tracking.AutoTrackable): with strategy.scope(): embedding = tf.tpu.experimental.embedding.TPUEmbedding(...) - distributed_dataset = strategy.experimental_distribute_dataset(...) + distributed_dataset = ( + strategy.experimental_distribute_datasets_from_function( + dataset_fn=..., + options=tf.distribute.InputOptions( + experimental_prefetch_to_device=False)) dataset_iterator = iter(distributed_dataset) @tf.function @@ -594,7 +607,11 @@ class TPUEmbedding(tracking.AutoTrackable): with strategy.scope(): embedding = tf.tpu.experimental.embedding.TPUEmbedding(...) - distributed_dataset = strategy.experimental_distribute_dataset(...) 
+ distributed_dataset = ( + strategy.experimental_distribute_datasets_from_function( + dataset_fn=..., + options=tf.distribute.InputOptions( + experimental_prefetch_to_device=False)) dataset_iterator = iter(distributed_dataset) @tf.function @@ -1004,6 +1021,24 @@ class TPUEmbedding(tracking.AutoTrackable): input_tensor.op.name, input_tensor.op.type)) + def _raise_error_for_inputs_not_on_cpu(self, features): + """Checks all tensors in features to see are placed on the CPU.""" + + # expand_composites here is important, we need to check the device of each + # underlying tensor. + for path, input_tensor in nest.flatten_with_joined_string_paths( + features, expand_composites=True): + spec = tf_device.DeviceSpec.from_string(input_tensor.device) + if spec.device_type == "TPU": + raise ValueError( + "Received input tensor {} which is on a TPU input device {}. Input " + "tensors for TPU embeddings must be placed on the CPU. Please " + "ensure that your dataset is prefetching tensors to the host by " + "setting the 'experimental_prefetch_to_device' option of the " + "dataset distribution function. See the documentation of the " + "enqueue method for an example.".format( + path, input_tensor.device)) + def enqueue(self, features, weights=None, training=True, name=None): """Enqueues id tensors for embedding lookup. @@ -1021,7 +1056,11 @@ class TPUEmbedding(tracking.AutoTrackable): with strategy.scope(): embedding = tf.tpu.experimental.embedding.TPUEmbedding(...) - distributed_dataset = strategy.experimental_distribute_dataset(...) + distributed_dataset = ( + strategy.experimental_distribute_datasets_from_function( + dataset_fn=..., + options=tf.distribute.InputOptions( + experimental_prefetch_to_device=False)) dataset_iterator = iter(distributed_dataset) @tf.function @@ -1091,6 +1130,7 @@ class TPUEmbedding(tracking.AutoTrackable): flat_weights = nest.flatten(weights) flat_features = nest.flatten_with_joined_string_paths(self._feature_config) + self._raise_error_for_inputs_not_on_cpu(features) in_tpu_context = self._raise_error_for_incorrect_control_flow_context() # If we are in a tpu_context, automatically apply outside compilation. 
if in_tpu_context: diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py index 0c257010f6a..ebaf2791055 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2_test.py +++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py @@ -28,6 +28,7 @@ import numpy as np from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import tpu_strategy from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver @@ -443,7 +444,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): def test_pass_none_to_apply_gradients(self): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) - data = next(iter(strategy.experimental_distribute_dataset(dataset))) + data = next(iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)))) @def_function.function def embedding_and_set_gradients(data): @@ -527,7 +531,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') input_fn = self._create_dense_input_fn(strategy, include_weights=True) - dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist = strategy.experimental_distribute_datasets_from_function( + input_fn, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)) dist_iter = iter(dist) @def_function.function @@ -547,8 +554,14 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): sparse = self._create_sparse_dataset(strategy) ragged = self._create_ragged_dataset(strategy, include_weights=True) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) + ragged_iter = iter(strategy.experimental_distribute_dataset( + ragged, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -569,8 +582,14 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): sparse = self._create_sparse_dataset(strategy, include_weights=True) ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) + ragged_iter = iter(strategy.experimental_distribute_dataset( + ragged, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -591,8 +610,14 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): sparse = self._create_sparse_dataset(strategy) ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + 
experimental_prefetch_to_device=False))) + ragged_iter = iter(strategy.experimental_distribute_dataset( + ragged, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -613,7 +638,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') sparse = self._create_sparse_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -633,7 +661,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') sparse = self._create_sparse_dataset(strategy, include_weights=True) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -654,8 +685,14 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): sparse = self._create_sparse_dataset(strategy) ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + sparse_iter = iter(strategy.experimental_distribute_dataset( + sparse, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) + ragged_iter = iter(strategy.experimental_distribute_dataset( + ragged, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def test_fn(): @@ -678,6 +715,26 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): ragged0 = self._get_replica_numpy(ragged_activations, strategy, 0) self.assertAllClose(sparse0, ragged0) + def test_enqueue_cpu_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy) + sparse_iter = iter(strategy.experimental_distribute_datasets_from_function( + input_fn)) + + @def_function.function + def test_fn(): + def get_activations(): + return mid_level_api.dequeue() + + sparse_features = next(sparse_iter) + mid_level_api.enqueue(sparse_features, training=False) + sparse_activations = strategy.run(get_activations) + return sparse_activations + + with self.assertRaisesRegex(ValueError, 'which is on a TPU input device'): + test_fn() + @parameterized.parameters(True, False) def test_enqueue_with_weights(self, ragged): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') @@ -689,7 +746,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): dataset = self._create_sparse_dataset(strategy, include_weights=True, weight=weight) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + dataset_iter = iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def enqueue_and_get(features, weights): @@ -727,7 +787,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) - dataset_iter = 
iter(strategy.experimental_distribute_dataset(dataset)) + dataset_iter = iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def enqueue_with_outside_compilation(data): @@ -761,7 +824,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + dataset_iter = iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) # This is one way to force the enqueue in some control flow. @tf.functions # aren't inlined in the calling tf.function. An alternative would be to @@ -785,7 +851,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): def test_enqueue_with_outside_compilation_non_direct_input(self): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + dataset_iter = iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def enqueue_with_outside_compilation(): @@ -804,7 +873,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): def test_enqueue_with_outside_compilation_auto_mode(self): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + dataset_iter = iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False))) @def_function.function def enqueue_with_no_gradient_apply(data): @@ -883,7 +955,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): self._create_strategy_and_mid_level(optimizer_name)) dataset = self._create_sparse_dataset(strategy) - dist = strategy.experimental_distribute_dataset(dataset) + dist = strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)) dist_iter = iter(dist) @def_function.function @@ -1175,7 +1250,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') input_fn = self._create_dense_input_fn(strategy) - dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist = strategy.experimental_distribute_datasets_from_function( + input_fn, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)) dist_iter = iter(dist) @def_function.function @@ -1235,7 +1313,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): def input_fn(ctx): del ctx return dataset_ops.DatasetV2.from_tensors(feature).repeat() - dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist = strategy.experimental_distribute_datasets_from_function( + input_fn, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)) dist_iter = iter(dist) @def_function.function @@ -1364,7 +1445,10 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): optimizer=optimizer) dataset = self._create_sparse_dataset(strategy) - data = 
next(iter(strategy.experimental_distribute_dataset(dataset))) + data = next(iter(strategy.experimental_distribute_dataset( + dataset, + options=distribute_lib.InputOptions( + experimental_prefetch_to_device=False)))) @def_function.function def embedding_and_set_gradients(data): From 89df3ddcd5451aefe52c7deb684f220b0520a6b1 Mon Sep 17 00:00:00 2001 From: Katherine Wu Date: Tue, 16 Jun 2020 13:08:10 -0700 Subject: [PATCH 0321/1390] Raise error when calling loaded model with layers that are not fully serialized. PiperOrigin-RevId: 316742578 Change-Id: Iefc40d21374388ed99f7ff40bb09436830b46cbe --- .../python/keras/saving/saved_model/load.py | 51 ++++++++++++++++--- .../saving/saved_model/saved_model_test.py | 30 +++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 313eea4342e..a378c1b98e7 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -690,18 +690,22 @@ def _finalize_saved_model_layers(layers): layer, _get_keras_attr(layer).call_and_return_conditional_losses, return_method=True) layer._init_call_fn_args() + else: + layer.call = types.MethodType( + _unable_to_call_layer_due_to_serialization_issue, layer) for layer in layers: # 2. Set model inputs and outputs. if isinstance(layer, RevivedNetwork): _set_network_attributes_from_metadata(layer) - call_fn = _get_keras_attr(layer).call_and_return_conditional_losses - if call_fn.input_signature is None: - inputs = infer_inputs_from_restored_call_function(call_fn) - else: - inputs = call_fn.input_signature[0] - layer._set_inputs(inputs) + if hasattr(_get_keras_attr(layer), 'call_and_return_conditional_losses'): + call_fn = _get_keras_attr(layer).call_and_return_conditional_losses + if call_fn.input_signature is None: + inputs = infer_inputs_from_restored_call_function(call_fn) + else: + inputs = call_fn.input_signature[0] + layer._set_inputs(inputs) # pylint: disable=protected-access # 3. Add losses that aren't generated by the layer.call function. _restore_layer_unconditional_losses(layer) @@ -713,6 +717,41 @@ def _finalize_saved_model_layers(layers): # pylint: enable=protected-access +def _unable_to_call_layer_due_to_serialization_issue( + layer, *unused_args, **unused_kwargs): + """Replaces the `layer.call` if the layer was not fully serialized. + + Keras Model/Layer serialization is relatively relaxed because SavedModels + are not always loaded back as keras models. Thus, when there is an issue + tracing a non-signature function, a warning is logged instead of raising an + error. This results in a SavedModel where the model's call function is saved, + but the internal layer call functions are not. + + When deserialized with `tf.keras.models.load_model`, the internal layers + which do not have serialized call functions should raise an error when called. + + Args: + layer: Layer without the serialized call function. + + Raises: + ValueError + """ + + raise ValueError( + 'Cannot call {} ({}), because the call function was not serialized to ' + 'the SavedModel (due to lack information about the inputs). Please try ' + 'one of the following methods to fix the serialization:' + '\n\n(1) Implement `get_config` and `from_config` in the layer/model ' + 'class, and pass the object to the `custom_objects` argument when ' + 'loading the model. 
For more details, see: ' + 'https://www.tensorflow.org/guide/keras/save_and_serialize' + '\n\n(2) Ensure that the subclassed model or layer overwrites `call` ' + 'and not `__call__`. The input shape and dtype will be automatically ' + 'recorded when the object is called, and used when saving. To manually ' + 'specify the input shape/dtype, decorate the call function with ' + '`@tf.function(input_signature=...)`.'.format(layer.name, layer)) + + def _finalize_config_layers(layers): """Runs the final steps of loading Keras Layers from config.""" for layer in layers: diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 7eaa75b78e2..8d4d27e2357 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -809,6 +809,36 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.evaluate(variables.variables_initializer(loaded.variables)) self.assertAllClose(model.predict(f), loaded.predict(f)) + def test_load_with_partially_failed_serialization(self): + + class BadCustomLayer(keras.layers.Layer): + + def __call__(self, inputs): + return inputs + + class Model(keras.models.Model): + + def __init__(self): + super(Model, self).__init__() + self.layer = BadCustomLayer() + + @def_function.function( + input_signature=[tensor_spec.TensorSpec([None, 1])]) + def call(self, inputs): + return self.layer(inputs) + + model = Model() + inp = constant_op.constant([[1.0]]) + model(inp) + saved_model_dir = self._save_model_dir() + tf_save.save(model, saved_model_dir) + + loaded = keras_load.load(saved_model_dir) + self.assertAllEqual([[1.0]], self.evaluate(loaded(inp))) + with self.assertRaisesRegexp(ValueError, + 'call function was not serialized'): + loaded.layer(inp) + class TestLayerCallTracing(test.TestCase, parameterized.TestCase): From 1a342fb760f6a3b5ff29b0b4c47bbf07a7bb1221 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 16 Jun 2020 13:11:14 -0700 Subject: [PATCH 0322/1390] Explicitly take the set of devices in CollectiveAllReduce We used to infer the devices from the inputs, but sometimes the inputs don't have device placement. E.g. when passing into or returning from tf.function, the device placement may be lost. Instead of inferring from the inputs we should just be explicit about the collective devices. 
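As an illustration of the motivation (not part of this change), here is a minimal sketch of how device placement can disappear at a tf.function boundary; `show_device` is a hypothetical helper used only for this example:

```python
import tensorflow as tf

@tf.function
def show_device(t):
  # At trace time `t` is a placeholder argument of the traced graph, so it
  # typically carries no device annotation: t.device is the empty string.
  print("inside tf.function:", repr(t.device))
  return t + 1.0

with tf.device("/CPU:0"):
  x = tf.constant([1.0, 2.0])

# Outside the function the eager tensor has a concrete placement, e.g.
# '/job:localhost/replica:0/task:0/device:CPU:0'.
print("outside tf.function:", repr(x.device))
show_device(x)
```

Because the `.device` field of such inputs can be empty, CollectiveAllReduce takes the device list at construction time instead of inferring it from the tensors it reduces.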
PiperOrigin-RevId: 316743112 Change-Id: I2f6995f2f4cc86864723e203deb7562363cdbc38 --- tensorflow/python/distribute/BUILD | 9 +- .../collective_all_reduce_strategy.py | 45 +++++++-- .../collective_all_reduce_strategy_test.py | 3 +- .../python/distribute/cross_device_ops.py | 77 +++++++++------- .../distribute/cross_device_ops_test.py | 80 ++++++++++++---- .../python/distribute/cross_device_utils.py | 91 +++++++++---------- .../python/distribute/mirrored_strategy.py | 19 ++-- .../python/distribute/strategy_common_test.py | 47 +++++++++- 8 files changed, 252 insertions(+), 119 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 77ef98d1cb7..a3655699669 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1029,14 +1029,21 @@ cuda_py_test( ":collective_util", ":combinations", ":cross_device_ops", - ":mirrored_strategy", + ":cross_device_utils", + ":device_util", ":multi_worker_test_base", + ":multi_worker_util", + ":reduce_util", ":strategy_combinations", ":values", "//tensorflow/python:array_ops", + "//tensorflow/python:collective_ops", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", + "//tensorflow/python:kernels", "//tensorflow/python:math_ops", + "//tensorflow/python:variables", + "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", "@absl_py//absl/testing:parameterized", diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 23ed16c5cfd..68cc421c21b 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -175,7 +175,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): self._communication = communication self._initialize_strategy(self._cluster_resolver) self._cfer_fn_cache = weakref.WeakKeyDictionary() - assert isinstance(self._get_cross_device_ops(), + assert isinstance(self._cross_device_ops, cross_device_ops_lib.CollectiveAllReduce) def _initialize_strategy(self, cluster_resolver): @@ -217,12 +217,18 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): self._host_input_device = numpy_dataset.SingleDevice(self._worker_device) self._collective_keys = cross_device_utils.CollectiveKeys() - # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce. self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce( - num_workers=self._num_workers, - num_gpus_per_worker=num_gpus, + devices=local_devices, + group_size=len(local_devices), collective_keys=self._collective_keys, communication=self._communication) + # CrossDeviceOps for per host tensors. 
+ self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce( + devices=[self._worker_device], + group_size=self._num_workers, + collective_keys=self._collective_keys, + communication=cross_device_ops_lib.CollectiveCommunication.RING, + ) super(CollectiveAllReduceExtended, self)._initialize_single_worker( local_devices) @@ -324,10 +330,17 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): self._collective_keys = cross_device_utils.CollectiveKeys() self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce( - num_workers=self._num_workers, - num_gpus_per_worker=num_gpus, + devices=local_devices, + group_size=len(local_devices) * self._num_workers, collective_keys=self._collective_keys, communication=self._communication) + # CrossDeviceOps for per host tensors. + self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce( + devices=[self._worker_device], + group_size=self._num_workers, + collective_keys=self._collective_keys, + communication=cross_device_ops_lib.CollectiveCommunication.RING, + ) super(CollectiveAllReduceExtended, self)._initialize_single_worker( local_devices) host_device = device_util.get_host_for_device(self._worker_device) @@ -474,7 +487,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): num_accelerators={"GPU": self._num_gpus_per_worker}, rpc_layer=self._rpc_layer) self._initialize_multi_worker(cluster_resolver) - assert isinstance(self._get_cross_device_ops(), + assert isinstance(self._cross_device_ops, cross_device_ops_lib.CollectiveAllReduce) if session_config: @@ -518,6 +531,22 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): return updated_config + def _get_cross_device_ops(self, value): + # CollectiveAllReduce works on a predefined set of devices. In most cases + # they should be the compute devices, but certain use cases may reduce host + # tensors as well (e.g. early stopping). We infer the cross_device_ops to + # use based on the number of devices, since inputs don't always have device + # annotations. The compute devices one is preferred since we can potentially + # leverage NCCL. + if isinstance(value, values.DistributedValues): + num_devices = len(value._values) # pylint: disable=protected-access + else: + num_devices = 1 + if num_devices == len(self.worker_devices): + return self._cross_device_ops + else: + return self._host_cross_device_ops + def _reduce_to(self, reduce_op, value, destinations, experimental_hints): if (isinstance(value, values.Mirrored) and reduce_op == reduce_util.ReduceOp.MEAN): @@ -538,7 +567,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): # be 0. 
return cross_device_ops_lib.reduce_non_distributed_value( reduce_op, value, destinations, len(self.worker_devices)) - return self._get_cross_device_ops().reduce( + return self._get_cross_device_ops(value).reduce( reduce_op, value, destinations=destinations, diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy_test.py b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py index 63c0db4c3b3..a9f7bc74e9e 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy_test.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py @@ -116,7 +116,8 @@ class CollectiveAllReduceStrategyTestBase( variable_instance_key_start=10000 + CollectiveAllReduceStrategyTestBase.collective_key_base) strategy.extended._collective_keys = collective_keys - strategy.extended._cross_device_ops._collective_keys = (collective_keys) + strategy.extended._cross_device_ops._collective_keys = collective_keys + strategy.extended._host_cross_device_ops._collective_keys = collective_keys return strategy, target, session_config diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index b88357e0ea6..ed6b0558b46 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -165,7 +165,8 @@ def get_devices_from(destinations): def _devices_match(left, right): - return set(get_devices_from(left)) == set(get_devices_from(right)) + return left is right or set(get_devices_from(left)) == set( + get_devices_from(right)) def _all_devices_match(value_destination_pairs): @@ -936,20 +937,24 @@ class CollectiveAllReduce(CrossDeviceOps): """ def __init__(self, - num_workers=1, - num_gpus_per_worker=0, + devices, + group_size, collective_keys=None, communication=CollectiveCommunication.AUTO): """Initializes the object. Args: - num_workers: number of workers in the between-graph replicated training. - num_gpus_per_worker: number of GPUs per worker. + devices: a list of device strings to run collectives on. + group_size: the global group size. For between-graph replicated training + it's the total number of devices across all workers. collective_keys: an optional CollectiveKey object. communication: indicates which collective communication to use. """ - self._num_workers = num_workers - self._num_gpus_per_worker = num_gpus_per_worker + if group_size % len(devices) > 0: + raise ValueError("group_size must be divisible by the number of devices.") + + self._devices = tuple(device_util.canonicalize(d) for d in devices) + self._group_size = group_size self._collective_keys = (collective_keys or cross_device_utils.CollectiveKeys()) self._communication = communication @@ -963,15 +968,15 @@ class CollectiveAllReduce(CrossDeviceOps): # async executor operations are still executed sequentially. In graph or # function building, the executors are not used. self._executors = [] - for _ in range(self._num_gpus_per_worker or 1): - # If num_gpus_per_worker is zero, we assume there's only one device (CPU). + for _ in range(len(devices)): self._executors.append(executor.new_executor(enable_async=True)) super(CollectiveAllReduce, self).__init__() @property def _num_between_graph_workers(self): - return self._num_workers + # Currently we only support equal number of devices on each worker. 
+ return self._group_size / len(self._devices) def reduce_implementation(self, reduce_op, per_replica_value, destinations, experimental_hints): @@ -979,8 +984,7 @@ class CollectiveAllReduce(CrossDeviceOps): experimental_hints)[0] devices = get_devices_from(destinations) - if (isinstance(all_reduced, value_lib.Mirrored) and - (all_reduced._devices == devices)): # pylint: disable=protected-access + if _devices_match(per_replica_value, destinations): return all_reduced # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform @@ -1069,14 +1073,16 @@ class CollectiveAllReduce(CrossDeviceOps): if batch_size > 1: logging.info( - "Collective batch_all_reduce: %d all-reduces, num_workers = %d, " - "communication_hint = %s, num_packs = %d", batch_size, - self._num_workers, communication, len(packs)) + "Collective batch_all_reduce: %d all-reduces, num_devices = %d, " + "group_size = %d, communication_hint = %s, num_packs = %d", + batch_size, len(self._devices), self._group_size, communication, + len(packs)) else: logging.log_first_n( logging.INFO, "Collective batch_all_reduce: %d all-reduces, " - "num_workers = %d, communication_hint = %s, num_packs = %d" % - (batch_size, self._num_workers, communication, len(packs)), 10) + "num_devices = %d, group_size = %d, communication_hint = %s, " + "num_packs = %d" % (batch_size, len( + self._devices), self._group_size, communication, len(packs)), 10) reduced_values = [] for pack in packs: @@ -1094,21 +1100,25 @@ class CollectiveAllReduce(CrossDeviceOps): control_inputs = None reduced_values.append( cross_device_utils.build_collective_reduce( - per_replica.values, self._num_workers, - self._collective_keys, "Add", "Id", communication, - control_inputs, executors=self._executors)) + per_replica.values, + self._devices, + self._group_size, + self._collective_keys, + "Add", + "Id", + communication, + control_inputs, + executors=self._executors)) mirrored = [] # Reverse the order of reduced value to recover the order in the input. for value in reversed(reduced_values): if reduce_op == reduce_util.ReduceOp.MEAN: - # Assume each worker has the same number of replicas. - num_replicas = len(value) * self._num_workers for i, v in enumerate(value): with ops.device(v.device): - value[i] = v / num_replicas - mirrored.append(distribute_utils.regroup(value, - wrap_class=value_lib.Mirrored)) + value[i] = v / self._group_size + mirrored.append( + distribute_utils.regroup(value, wrap_class=value_lib.Mirrored)) return mirrored def _do_batch_all_reduce_sparse(self, reduce_op, per_replica_values): @@ -1116,8 +1126,8 @@ class CollectiveAllReduce(CrossDeviceOps): logging.log_first_n( logging.INFO, "Collective batch_all_reduce for IndexedSlices: " - "%d all-reduces, num_workers = %d" % - (len(per_replica_values), self._num_workers), 10) + "%d all-reduces, group_size = %d" % + (len(per_replica_values), self._group_size), 10) # Pass self._communication to the runtime as a communication hint. communication_hint = self._communication.value @@ -1133,25 +1143,24 @@ class CollectiveAllReduce(CrossDeviceOps): for per_replica in per_replica_values: gathered_values.append( cross_device_utils.build_collective_gather_indexed_slices( - per_replica.values, self._num_workers, self._collective_keys, - communication_hint)) + per_replica.values, self._devices, self._group_size, + self._collective_keys, communication_hint)) mirrored = [] for value in gathered_values: if reduce_op == reduce_util.ReduceOp.MEAN: # Assume each worker has the same number of replicas. 
- num_replicas = len(value) * self._num_workers for i, v in enumerate(value): with ops.device(v.device): - value[i].values = value[i].values / num_replicas - mirrored.append(distribute_utils.regroup(value, - wrap_class=value_lib.Mirrored)) + value[i].values = value[i].values / self._group_size + mirrored.append( + distribute_utils.regroup(value, wrap_class=value_lib.Mirrored)) return mirrored def __deepcopy__(self, memo): # distribute_coordinator deep-copies the strategy object, so # CollectiveAllReduce needs to support deep copy as well. - return CollectiveAllReduce(self._num_workers, self._num_gpus_per_worker, + return CollectiveAllReduce(self._devices, self._group_size, self._collective_keys, self._communication) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index 9554de41a6e..4b6943e8971 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -26,6 +26,7 @@ import time from absl.testing import parameterized import numpy as np from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.distribute import cluster_resolver from tensorflow.python.distribute import collective_all_reduce_strategy from tensorflow.python.distribute import collective_util from tensorflow.python.distribute import combinations @@ -34,10 +35,12 @@ from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribute_utils from tensorflow.python.distribute import multi_worker_test_base +from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context +from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import kernels @@ -125,7 +128,10 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self.evaluate(ops.convert_to_tensor(left)), self.evaluate(ops.convert_to_tensor(right))) - def _assert_mirrored_equal(self, left_list, right_list, sess, + def _assert_mirrored_equal(self, + left_list, + right_list, + sess=None, run_options=None): if not isinstance(left_list, list): left_list, right_list = [left_list], [right_list] @@ -142,17 +148,14 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): left, right = [left], [right] for left_value, right_value in zip(left, right): - self.assertEqual(left_value.device, right_value.device) + self.assertEqual( + device_util.resolve(left_value.device), + device_util.resolve(right_value.device)) # Densify IndexedSlices. left = [ops.convert_to_tensor(v) for v in left] right = [ops.convert_to_tensor(v) for v in right] - if context.executing_eagerly(): - # Optional args in session run are not supported when eager execution - # is enabled. 
- assert run_options is None - left, right = sess.run((left, right)) - else: + if not context.executing_eagerly(): left, right = sess.run((left, right), options=run_options) for left_value, right_value in zip(left, right): self.assertAllEqual(left_value, right_value) @@ -525,8 +528,8 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, return strategy, devices, "" else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - 1, - num_gpus, + devices=devices, + group_size=len(devices), collective_keys=collective_keys, communication=communication) return collective_all_reduce_ops, devices, "" @@ -545,26 +548,28 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, ] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - communication=communication) - strategy.configure( - cluster_spec=self._cluster_spec, + resolver = cluster_resolver.SimpleClusterResolver( + cluster_spec=multi_worker_util.normalize_cluster_spec( + self._cluster_spec), task_type=task_type, - task_id=task_id) + task_id=task_id, + num_accelerators={"GPU": num_gpus}) + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + cluster_resolver=resolver, communication=communication) strategy.extended._collective_keys = collective_keys strategy.extended._cross_device_ops._collective_keys = collective_keys return (strategy, devices, "grpc://" + self._cluster_spec[task_type][task_id]) else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - NUM_WORKERS, - num_gpus, + devices=devices, + group_size=len(devices) * NUM_WORKERS, collective_keys=collective_keys, communication=communication) return (collective_all_reduce_ops, devices, "grpc://" + self._cluster_spec[task_type][task_id]) - def _assert_mirrored_equal(self, left_list, right_list, sess): + def _assert_mirrored_equal(self, left_list, right_list, sess=None): if context.executing_eagerly(): run_options = None else: @@ -895,6 +900,45 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, self.assertAllEqual(reduced[1].values, [4.0, 4.0]) t.join() + @combinations.generate( + combinations.combine( + required_gpus=2, + mode="eager", + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ])) + def testInputsAreFunctionArgs(self, communication): + # Function inputs don't have device placement. + hints = collective_util.Hints(bytes_per_pack=1) + collective, devices, _ = self._get_test_objects( + None, + None, + num_gpus=2, + communication=communication, + use_strategy_object=False, + local_mode=True) + devices = [device_util.canonicalize(d) for d in devices] + + @def_function.function + def reduce_fn(v): + self.assertEqual(v.values[0].device, "") + self.assertEqual(v.values[1].device, "") + # We only use NCCL for batch reduce with two or more values, so we use two + # values here. 
+ reduced = collective.batch_reduce( + reduce_util.ReduceOp.SUM, [(v, v), (v, v)], experimental_hints=hints) + self.assertEqual(reduced[0].values[0].device, devices[0]) + self.assertEqual(reduced[0].values[1].device, devices[1]) + self.assertEqual(reduced[1].values[0].device, devices[0]) + self.assertEqual(reduced[1].values[1].device, devices[1]) + # Returning Mirrored only evaluates the primary value, which causes + # hanging, + return [reduced[0].values, reduced[1].values] + + v = _make_per_replica([1.0, 2.0], devices) + reduced = reduce_fn(v) + self.assertAllEqual(self.evaluate(reduced), [[3.0, 3.0], [3.0, 3.0]]) + if __name__ == "__main__": # Set default inter op thread pool size to one to ensure we don't exhaust the diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index d7be93ae2c4..9dc24b16e6a 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -305,19 +305,6 @@ class CollectiveKeys(object): self._group_key_table[key_id] = new_key return self._group_key_table[key_id] - def get_group_key_of_tensors(self, tensors): - """Returns a group key for set of tensors. - - Args: - tensors: list of `Tensor`s in a collective group. Each tensor must be on a - different device. - - Returns: - int key uniquely identifying the set of devices of these tensors. - """ - devices = [t.device for t in tensors] - return self.get_group_key(devices) - def get_op_instance_key(self): """Returns a new instance key for use in defining a collective op.""" v = self._get_thread_local_object().op_instance_key @@ -332,7 +319,8 @@ class CollectiveKeys(object): def build_collective_reduce(input_tensors, - num_workers, + devices, + group_size, collective_keys, reduction_op='Add', unary_op='Id', @@ -347,9 +335,10 @@ def build_collective_reduce(input_tensors, Args: input_tensors: tensors within a single worker graph that are to be reduced together; must be one per device. - num_workers: total number of workers with identical independent graphs that - will be doing this same reduction. The reduction will actually include - the corresponding tensors at all these workers. + devices: a list of device strings to run the collective on. + group_size: total number of devices globally that will be doing this same + reduction. The reduction will actually include the corresponding tensors + at all these workers. collective_keys: a CollectiveKeys object. reduction_op: string naming the reduction op. unary_op: string naming the unary final op. 
@@ -370,11 +359,14 @@ def build_collective_reduce(input_tensors, not all(e.is_async() for e in executors)): raise ValueError( 'collectives requires async executors for each device in eager mode') + if len(input_tensors) != len(devices): + raise ValueError('collective requires one input tensor for each device, ' + 'len(input_tensors) = %d, len(devices) = %d' % + (len(input_tensors), len(devices))) - group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - group_key = collective_keys.get_group_key_of_tensors(input_tensors) + group_key = collective_keys.get_group_key(devices) instance_key = collective_keys.get_op_instance_key() subdiv_offsets = [0] # TODO(tucker): maybe support non-default subdiv spec @@ -385,9 +377,9 @@ def build_collective_reduce(input_tensors, else: executor_scope = ops.NullContextmanager() with executor_scope, \ - ops.device(input_tensor.device), \ + ops.device(devices[idx]), \ ops.control_dependencies( - _control_input(input_tensors, control_inputs, idx)): + _control_input(devices, control_inputs, idx)): out_tensor = collective_ops.all_reduce(input_tensor, group_size, group_key, instance_key, reduction_op, unary_op, @@ -397,7 +389,8 @@ def build_collective_reduce(input_tensors, def build_collective_gather(input_tensors, - num_workers, + devices, + group_size, collective_keys, communication_hint='AUTO', control_inputs=None): @@ -408,9 +401,10 @@ def build_collective_gather(input_tensors, Args: input_tensors: tensors within a single worker graph that are to be gathered together; must be one per device. - num_workers: total number of workers with identical independent graphs that - will be doing this same reduction. The reduction will actually include - the corresponding tensors at all these workers. + devices: a list of device strings to run the collective on. + group_size: total number of devices globally that will be doing this same + gathering. The gathering will actually include the corresponding tensors + at all these workers. collective_keys: a CollectiveKeys object. communication_hint: string providing hint to runtime for choosing collective implementation. 
@@ -423,18 +417,21 @@ def build_collective_gather(input_tensors, assert not context.executing_eagerly(), ( 'build_collective_gather can only be called in graph mode or inside ' 'tf.function') + if len(input_tensors) != len(devices): + raise ValueError( + 'collective requires one input tensor for each device, %d != %d' % + (len(input_tensors), len(devices))) - group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - group_key = collective_keys.get_group_key_of_tensors(input_tensors) + group_key = collective_keys.get_group_key(devices) instance_key = collective_keys.get_op_instance_key() out_tensors = [] for idx, input_tensor in enumerate(input_tensors): - with ops.device(input_tensor.device): + with ops.device(devices[idx]): with ops.control_dependencies( - _control_input(input_tensors, control_inputs, idx)): + _control_input(devices, control_inputs, idx)): out_tensor = collective_ops.all_gather(input_tensor, group_size, group_key, instance_key, communication_hint) @@ -443,7 +440,8 @@ def build_collective_gather(input_tensors, def build_collective_gather_indexed_slices(input_slices_list, - num_workers, + devices, + group_size, collective_keys, communication_hint='AUTO', control_inputs=None): @@ -454,9 +452,10 @@ def build_collective_gather_indexed_slices(input_slices_list, Args: input_slices_list: a list of IndexedSlices within a single worker graph that are to be gathered together; must be one per device. - num_workers: total number of workers with identical independent graphs that - will be doing this same reduction. The reduction will actually include - the corresponding tensors at all these workers. + devices: a list of device strings to run the collective on. + group_size: total number of devices globally that will be doing this same + gathering. The gathering will actually include the corresponding tensors + at all these workers. collective_keys: a CollectiveKeys object. communication_hint: string providing hint to runtime for choosing collective implementation. @@ -474,12 +473,15 @@ def build_collective_gather_indexed_slices(input_slices_list, assert not context.executing_eagerly(), ( 'build_collective_gather_indexed_slices can only be called in graph mode' ' or inside tf.function') + if len(input_slices_list) != len(devices): + raise ValueError( + 'collective requires one input IndexedSlice for each device, %d != %d' % + (len(input_slices_list), len(devices))) - group_size = len(input_slices_list) * num_workers if group_size < 2: return input_slices_list - group_key = collective_keys.get_group_key_of_tensors(input_slices_list) + group_key = collective_keys.get_group_key(devices) gather_length_key = collective_keys.get_op_instance_key() gather_indices_key = collective_keys.get_op_instance_key() gather_values_key = collective_keys.get_op_instance_key() @@ -495,7 +497,7 @@ def build_collective_gather_indexed_slices(input_slices_list, out_slices_list = [] for idx, input_slices in enumerate(input_slices_list): # pylint: disable = cell-var-from-loop - with ops.device(input_slices.device): + with ops.device(devices[idx]): def all_gather(): """Use all_gather to aggregate `IndexedSlices`.""" @@ -967,14 +969,13 @@ def pack_by_size(per_replica_list, bytes_per_pack): return packs -def _control_input(inputs, control_inputs, idx): +def _control_input(devices, control_inputs, idx): """Returns the `idx`-th item in control_inputs to be used in ops.control_dependencies. - This is a helper function for building collective ops. 
The function checks - that the devices of control_inputs and inputs match. + This is a helper function for building collective ops. Args: - inputs: a list of `Tensor`s + devices: a list of device strings the collective run on. control_inputs: a list or None. idx: the index into `inputs` and `control_inputs`. @@ -984,12 +985,8 @@ def _control_input(inputs, control_inputs, idx): """ if control_inputs is None: return [] - if len(control_inputs) != len(inputs): + if len(control_inputs) != len(devices): raise ValueError( - 'control_inputs must match the length of the inputs, %s != %s' % - (len(control_inputs), len(inputs))) - if control_inputs[idx].device != inputs[idx].device: - raise ValueError( - 'control_inputs must match the device of the inputs, %s != %s' % - (control_inputs[idx].device, inputs[idx].device)) + 'control_inputs must match the length of the devices, %s != %s' % + (len(control_inputs), len(devices))) return [control_inputs[idx]] diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index ac9045d2322..36598634fac 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -579,7 +579,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): if not destinations: # TODO(josh11b): Use current logical device instead of 0 here. destinations = self._devices - return self._get_cross_device_ops().broadcast(tensor, destinations) + return self._get_cross_device_ops(tensor).broadcast(tensor, destinations) def _call_for_each_replica(self, fn, args, kwargs): return mirrored_run.call_for_each_replica( @@ -608,7 +608,8 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): updated_config.isolate_session_state = True return updated_config - def _get_cross_device_ops(self): + def _get_cross_device_ops(self, value): + del value # Unused. return self._cross_device_ops or self._inferred_cross_device_ops def _reduce_to(self, reduce_op, value, destinations, experimental_hints): @@ -623,7 +624,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): # be 0. return cross_device_ops_lib.reduce_non_distributed_value( reduce_op, value, destinations, self._num_replicas_in_sync) - return self._get_cross_device_ops().reduce( + return self._get_cross_device_ops(value).reduce( reduce_op, value, destinations=destinations, @@ -631,9 +632,15 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _batch_reduce_to(self, reduce_op, value_destination_pairs, experimental_hints): - return self._get_cross_device_ops().batch_reduce(reduce_op, - value_destination_pairs, - experimental_hints) + cross_device_ops = None + for value, _ in value_destination_pairs: + if cross_device_ops is None: + cross_device_ops = self._get_cross_device_ops(value) + elif cross_device_ops is not self._get_cross_device_ops(value): + raise ValueError("inputs to batch_reduce_to must be either all on the " + "the host or all on the compute devices") + return cross_device_ops.batch_reduce(reduce_op, value_destination_pairs, + experimental_hints) def _update(self, var, fn, args, kwargs, group): # TODO(josh11b): In eager mode, use one thread per device. 
diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py index 4ed5054af2d..ed52a4794ee 100644 --- a/tensorflow/python/distribute/strategy_common_test.py +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -65,14 +65,14 @@ class StrategyReduceTest(test.TestCase, parameterized.TestCase): self.assertEqual(fn_graph().numpy(), 1.0 * strategy.num_replicas_in_sync) +@combinations.generate( + combinations.combine( + strategy=[strategy_combinations.multi_worker_mirrored_two_workers], + mode=['eager'])) class DistributedCollectiveAllReduceStrategyTest( strategy_test_lib.DistributionTestBase, parameterized.TestCase): - @combinations.generate( - combinations.combine( - strategy=[strategy_combinations.multi_worker_mirrored_two_workers], - mode=['eager'])) def testDatasetFromFunction(self, strategy): def dataset_fn(input_context): global_batch_size = 10 @@ -95,6 +95,45 @@ class DistributedCollectiveAllReduceStrategyTest( sum_value.numpy(), expected_sum_on_workers[multi_worker_test_base.get_task_index()]) + def testReduceHostTensor(self, strategy): + reduced = strategy.reduce( + reduce_util.ReduceOp.SUM, array_ops.identity(1.), axis=None) + self.assertEqual(reduced.numpy(), 2.) + + def testReduceToHostTensor(self, strategy): + value = array_ops.identity(1.) + reduced = strategy.extended.reduce_to(reduce_util.ReduceOp.SUM, value, + value) + self.assertEqual(reduced.numpy(), 2.) + + def testBatchReduceToHostTensor(self, strategy): + value = array_ops.identity(1.) + reduced = strategy.extended.batch_reduce_to(reduce_util.ReduceOp.SUM, + [(value, value), + (value, value)]) + self.assertAllEqual(reduced, [2., 2.]) + + def testReduceDeviceTensors(self, strategy): + value = strategy.run(lambda: array_ops.identity(1.)) + reduced = strategy.reduce(reduce_util.ReduceOp.SUM, value, axis=None) + self.assertEqual(reduced.numpy(), 2.) + + def testReduceToDeviceTensors(self, strategy): + value = strategy.run(lambda: array_ops.identity(1.)) + reduced = strategy.extended.reduce_to(reduce_util.ReduceOp.SUM, value, + value) + self.assertEqual(reduced.numpy(), 2.) + + def testBatchReduceToDeviceTensors(self, strategy): + value = strategy.run(lambda: array_ops.identity(1.)) + reduced = strategy.extended.batch_reduce_to(reduce_util.ReduceOp.SUM, + [(value, value), + (value, value)]) + self.assertAllEqual(reduced, [2., 2.]) + + # TODO(crccw): add a test that mixes device and host tensors after multi + # worker strategy combinations can run on a fixed number of GPUs. + if __name__ == '__main__': combinations.main() From a71c78bcf91d404de37188a8a7a73016729dd2a0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 13:14:37 -0700 Subject: [PATCH 0323/1390] PR #40169: ensure model initialized on ANY trackable attr set Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/40169 In particular, empty tuples should not trigger this. 
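For context (illustrative only, assuming standard `tf.keras` behavior; this snippet is not part of the change), the `__setattr__` check being adjusted guards against assigning a trackable attribute on a subclassed `Model` before `super().__init__()` has run:

```python
import tensorflow as tf

class BadModel(tf.keras.Model):

  def __init__(self):
    # Setting a trackable attribute (here a Layer) before calling
    # super().__init__() trips the _base_model_initialized check in
    # Model.__setattr__ and raises a RuntimeError.
    self.dense = tf.keras.layers.Dense(1)
    super(BadModel, self).__init__()

try:
  BadModel()
except RuntimeError as err:
  print("raised as expected:", err)
```

The `any`-to-`all` change below adjusts which assigned values are routed through that check.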
Copybara import of the project: -- 17b7e169135127e0e866b50577ad8b213abc1d97 by Dominic Jack : ensure model initialized on ANY trackable attr set -- 57eccc7bc29ddb105dcaa2f6a413163461ad9987 by Dominic Jack : added test PiperOrigin-RevId: 316743715 Change-Id: I038a0261fbb3a0dac50c62a50c787bade10abb6a --- tensorflow/python/keras/engine/training.py | 8 ++++---- tensorflow/python/keras/engine/training_test.py | 12 ------------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 682c0272da5..5567e1733a7 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -324,10 +324,10 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): super(Model, self).__setattr__(name, value) return - if any( - isinstance(v, (base_layer.Layer, data_structures.TrackableDataStructure - )) or trackable_layer_utils.has_weights(v) - for v in nest.flatten(value)): + if all( + isinstance(v, (base_layer.Layer, + data_structures.TrackableDataStructure)) or + trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): try: self._base_model_initialized except AttributeError: diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 63943f4f720..5cf15926bfb 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -3383,18 +3383,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): self.assertEqual([m.name for m in outer_model.metrics], ['loss', 'acc2', 'mean', 'mean1', 'mean2']) - def test_subclassed_model_with_empty_list_attr(self): - - class ModelSubclass(training_module.Model): - - def __init__(self): - self.empty_list = [] - inputs = layers_module.Input(shape=()) - outputs = inputs + 1 - super(ModelSubclass, self).__init__(inputs, outputs) - - ModelSubclass() # empty_list attr assignment should not raise - class BareUpdateLayer(layers_module.Layer): From 4db7ec52010ef737300a00e669973fecea5c0603 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Tue, 16 Jun 2020 13:35:30 -0700 Subject: [PATCH 0324/1390] Replace const llvm::SmallVector<>& with llvm::ArrayRef and const std::string& with llvm::StringRef in TPUExtractOutsideCompilation. (NFC) PiperOrigin-RevId: 316748196 Change-Id: Icdfcaa5a808ae69e5a6286d5bd7c6a988dbbe616 --- .../tpu_extract_outside_compilation.cc | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 93e5cc22c30..54600faca4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -73,9 +73,8 @@ LogicalResult CollectAndGroupOutsideClusterOps(Block* block, } // Moves `cluster_ops` to associated `launch_op` body. -void MoveOutsideClusterOpsToLaunchOp( - tf_device::LaunchOp launch_op, - const llvm::SmallVector& cluster_ops) { +void MoveOutsideClusterOpsToLaunchOp(tf_device::LaunchOp launch_op, + llvm::ArrayRef cluster_ops) { MLIRContext* context = launch_op.getContext(); Operation* terminator = launch_op.GetBody().getTerminator(); @@ -123,7 +122,7 @@ void PropagateParallelExecuteReturnToReplicate( // Extracts all externally provided operands of `cluster_ops`. 
llvm::SmallSetVector GetExternalOperands( - const llvm::SmallVector& cluster_ops) { + llvm::ArrayRef cluster_ops) { llvm::SmallSetVector external_values; for (Operation* op : cluster_ops) { @@ -143,7 +142,7 @@ llvm::SmallSetVector GetExternalOperands( // Extracts all externally used outputs of `cluster_ops`. llvm::SmallVector GetExternalOutputs( - const llvm::SmallVector& cluster_ops) { + llvm::ArrayRef cluster_ops) { llvm::SmallSetVector external_outputs; for (Operation* op : cluster_ops) { @@ -166,7 +165,7 @@ llvm::SmallVector GetExternalOutputs( // as an operand. If there are no external_inputs, set insertion point to first // cluster_op. void SetHostComputeInsertion( - OpBuilder* builder, const llvm::SmallVector& cluster_ops, + OpBuilder* builder, llvm::ArrayRef cluster_ops, const llvm::SmallSetVector& external_inputs) { if (external_inputs.empty()) builder->setInsertionPoint(cluster_ops.front()); for (const auto& cluster_op : cluster_ops) { @@ -183,9 +182,9 @@ void SetHostComputeInsertion( // using `communication_key`. TF::_HostComputeMlirOp CreateHostCompute( OpBuilder* builder, tf_device::ClusterOp tpu_cluster, - const llvm::SmallVector& cluster_ops, + llvm::ArrayRef cluster_ops, const llvm::SmallSetVector& inputs, llvm::ArrayRef outputs, - const std::string& communication_key) { + llvm::StringRef communication_key) { llvm::SmallVector device_output_types; for (const auto& output : outputs) device_output_types.push_back(output.getType()); @@ -201,10 +200,9 @@ TF::_HostComputeMlirOp CreateHostCompute( void MoveOutsideCompiledOps( tf_device::ClusterOp tpu_cluster, llvm::StringRef outside_cluster_name, - tf_device::LaunchOp host_launch_op, - const llvm::SmallVector& cluster_ops, + tf_device::LaunchOp host_launch_op, llvm::ArrayRef cluster_ops, const llvm::SmallSetVector& external_inputs, - const llvm::SmallVector& external_outputs) { + llvm::ArrayRef external_outputs) { if (external_inputs.empty() && external_outputs.empty()) { MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); return; From 14ee01957cb6450261f6efec77a8997ca4d8c3c5 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 16 Jun 2020 13:37:06 -0700 Subject: [PATCH 0325/1390] [TF-numpy] Adds @np_doc/@np_doc_only to all public ops. PiperOrigin-RevId: 316748465 Change-Id: I40a49431d6075e47ac05f2946c745ab1c1222214 --- .../python/ops/numpy_ops/np_array_ops.py | 310 ++---------------- 1 file changed, 36 insertions(+), 274 deletions(-) diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py index e97bb61613b..906e53c556d 100644 --- a/tensorflow/python/ops/numpy_ops/np_array_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py @@ -39,51 +39,18 @@ from tensorflow.python.ops.numpy_ops import np_utils from tensorflow.python.util import nest +@np_utils.np_doc(np.empty) def empty(shape, dtype=float): # pylint: disable=redefined-outer-name - """Returns an empty array with the specified shape and dtype. - - Args: - shape: A fully defined shape. Could be - NumPy array or a python scalar, - list or tuple of integers, - TensorFlow tensor/ndarray of integer type and - rank <=1. - dtype: Optional, defaults to float. The type of the resulting ndarray. Could - be a python type, a NumPy type or a TensorFlow `DType`. - - Returns: - An ndarray. - """ return zeros(shape, dtype) +@np_utils.np_doc(np.empty_like) def empty_like(a, dtype=None): - """Returns an empty array with the shape and possibly type of the input array. - - Args: - a: array_like. 
Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - dtype: Optional, defaults to dtype of the input array. The type of the - resulting ndarray. Could be a python type, a NumPy type or a TensorFlow - `DType`. - - Returns: - An ndarray. - """ return zeros_like(a, dtype) +@np_utils.np_doc(np.zeros) def zeros(shape, dtype=float): # pylint: disable=redefined-outer-name - """Returns an ndarray with the given shape and type filled with zeros. - - Args: - shape: A fully defined shape. Could be - NumPy array or a python scalar, - list or tuple of integers, - TensorFlow tensor/ndarray of integer type and - rank <=1. - dtype: Optional, defaults to float. The type of the resulting ndarray. Could - be a python type, a NumPy type or a TensorFlow `DType`. - - Returns: - An ndarray. - """ dtype = ( np_utils.result_type(dtype) if dtype else np_dtypes.default_float_type()) if isinstance(shape, np_arrays.ndarray): @@ -91,19 +58,8 @@ def zeros(shape, dtype=float): # pylint: disable=redefined-outer-name return np_arrays.tensor_to_ndarray(array_ops.zeros(shape, dtype=dtype)) -def zeros_like(a, dtype=None): - """Returns an array of zeros with the shape and type of the input array. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - dtype: Optional, defaults to dtype of the input array. The type of the - resulting ndarray. Could be a python type, a NumPy type or a TensorFlow - `DType`. - - Returns: - An ndarray. - """ +@np_utils.np_doc(np.zeros_like) +def zeros_like(a, dtype=None): # pylint: disable=missing-docstring if isinstance(a, np_arrays.ndarray): a = a.data if dtype is None: @@ -117,19 +73,8 @@ def zeros_like(a, dtype=None): return np_arrays.tensor_to_ndarray(array_ops.zeros_like(a, dtype)) +@np_utils.np_doc(np.ones) def ones(shape, dtype=float): # pylint: disable=redefined-outer-name - """Returns an ndarray with the given shape and type filled with ones. - - Args: - shape: A fully defined shape. Could be - NumPy array or a python scalar, - list or tuple of integers, - TensorFlow tensor/ndarray of integer type and - rank <=1. - dtype: Optional, defaults to float. The type of the resulting ndarray. Could - be a python type, a NumPy type or a TensorFlow `DType`. - - Returns: - An ndarray. - """ if dtype: dtype = np_utils.result_type(dtype) if isinstance(shape, np_arrays.ndarray): @@ -137,19 +82,8 @@ def ones(shape, dtype=float): # pylint: disable=redefined-outer-name return np_arrays.tensor_to_ndarray(array_ops.ones(shape, dtype=dtype)) +@np_utils.np_doc(np.ones_like) def ones_like(a, dtype=None): - """Returns an array of ones with the shape and type of the input array. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - dtype: Optional, defaults to dtype of the input array. The type of the - resulting ndarray. Could be a python type, a NumPy type or a TensorFlow - `DType`. - - Returns: - An ndarray. - """ if isinstance(a, np_arrays.ndarray): a = a.data if dtype is None: @@ -191,38 +125,13 @@ def eye(N, M=None, k=0, dtype=float): # pylint: disable=invalid-name,missing-do array_ops.matrix_diag(diagonal=diagonal_, num_rows=N, num_cols=M, k=k)) +@np_utils.np_doc(np.identity) def identity(n, dtype=float): - """Returns a square array with ones on the main diagonal and zeros elsewhere. - - Args: - n: number of rows/cols. - dtype: Optional, defaults to float. 
The type of the resulting ndarray. Could - be a python type, a NumPy type or a TensorFlow `DType`. - - Returns: - An ndarray of shape (n, n) and requested type. - """ return eye(N=n, M=n, dtype=dtype) +@np_utils.np_doc(np.full) def full(shape, fill_value, dtype=None): # pylint: disable=redefined-outer-name - """Returns an array with given shape and dtype filled with `fill_value`. - - Args: - shape: A valid shape object. Could be a native python object or an object of - type ndarray, numpy.ndarray or tf.TensorShape. - fill_value: array_like. Could be an ndarray, a Tensor or any object that can - be converted to a Tensor using `tf.convert_to_tensor`. - dtype: Optional, defaults to dtype of the `fill_value`. The type of the - resulting ndarray. Could be a python type, a NumPy type or a TensorFlow - `DType`. - - Returns: - An ndarray. - - Raises: - ValueError: if `fill_value` can not be broadcast to shape `shape`. - """ if not isinstance(shape, np_arrays.ndarray): shape = asarray(np_arrays.convert_to_tensor(shape, dtype_hint=np.int32)) shape = atleast_1d(shape).data @@ -251,26 +160,13 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None): # # TODO(wangpeng): investigate whether we can make `copy` default to False. -# TODO(wangpeng): np_utils.np_doc can't handle np.array because np.array is a -# builtin function. Make np_utils.np_doc support builtin functions. +# pylint: disable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args +@np_utils.np_doc_only(np.array) def array(val, dtype=None, copy=True, ndmin=0): # pylint: disable=redefined-outer-name - """Creates an ndarray with the contents of val. - - Args: - val: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - dtype: Optional, defaults to dtype of the `val`. The type of the resulting - ndarray. Could be a python type, a NumPy type or a TensorFlow `DType`. - copy: Determines whether to create a copy of the backing buffer. Since - Tensors are immutable, a copy is made only if val is placed on a different - device than the current one. Even if `copy` is False, a new Tensor may - need to be built to satisfy `dtype` and `ndim`. This is used only if `val` - is an ndarray or a Tensor. - ndmin: The minimum rank of the returned array. - - Returns: - An ndarray. - """ + """Since Tensors are immutable, a copy is made only if val is placed on a + different device than the current one. Even if `copy` is False, a new Tensor + may need to be built to satisfy `dtype` and `ndim`. This is used only if `val` + is an ndarray or a Tensor.""" # pylint:disable=g-docstring-missing-newline if dtype: dtype = np_utils.result_type(dtype) if isinstance(val, np_arrays.ndarray): @@ -319,6 +215,7 @@ def array(val, dtype=None, copy=True, ndmin=0): # pylint: disable=redefined-out result_t = np_utils.cond( np_utils.greater(ndmin, ndims), true_fn, lambda: result_t) return np_arrays.tensor_to_ndarray(result_t) +# pylint: enable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args @np_utils.np_doc(np.asarray) @@ -341,6 +238,7 @@ def ascontiguousarray(a, dtype=None): # Numerical ranges. +@np_utils.np_doc(np.arange) def arange(start, stop=None, step=1, dtype=None): """Returns `step`-separated values in the range [start, stop). 
@@ -448,20 +346,8 @@ def diagonal(a, offset=0, axis1=0, axis2=1): # pylint: disable=missing-docstrin return a +@np_utils.np_doc(np.diagflat) def diagflat(v, k=0): - """Returns a 2-d array with flattened `v` as diagonal. - - Args: - v: array_like of any rank. Gets flattened when setting as diagonal. Could be - an ndarray, a Tensor or any object that can be converted to a Tensor using - `tf.convert_to_tensor`. - k: Position of the diagonal. Defaults to 0, the main diagonal. Positive - values refer to diagonals shifted right, negative values refer to - diagonals shifted left. - - Returns: - 2-d ndarray. - """ v = asarray(v) return diag(array_ops.reshape(v.data, [-1]), k) @@ -471,69 +357,22 @@ def _promote_dtype(*arrays): return [asarray(a, dtype=dtype) for a in arrays] +@np_utils.np_doc(np.all) def all(a, axis=None, keepdims=None): # pylint: disable=redefined-builtin - """Whether all array elements or those along an axis evaluate to true. - - Casts the array to bool type if it is not already and uses `tf.reduce_all` to - compute the result. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axis: Optional. Could be an int or a tuple of integers. If not specified, - the reduction is performed over all array indices. - keepdims: If true, retains reduced dimensions with length 1. - - Returns: - An ndarray. Note that unlike NumPy this does not return a scalar bool if - `axis` is None. - """ a = asarray(a, dtype=bool) return np_utils.tensor_to_ndarray( math_ops.reduce_all(input_tensor=a.data, axis=axis, keepdims=keepdims)) +@np_utils.np_doc(np.any) def any(a, axis=None, keepdims=None): # pylint: disable=redefined-builtin - """Whether any element in the entire array or in an axis evaluates to true. - - Casts the array to bool type if it is not already and uses `tf.reduce_any` to - compute the result. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axis: Optional. Could be an int or a tuple of integers. If not specified, - the reduction is performed over all array indices. - keepdims: If true, retains reduced dimensions with length 1. - - Returns: - An ndarray. Note that unlike NumPy this does not return a scalar bool if - `axis` is None. - """ a = asarray(a, dtype=bool) return np_utils.tensor_to_ndarray( math_ops.reduce_any(input_tensor=a.data, axis=axis, keepdims=keepdims)) -def compress(condition, a, axis=None): - """Compresses `a` by selecting values along `axis` with `condition` true. - - Uses `tf.boolean_mask`. - - Args: - condition: 1-d array of bools. If `condition` is shorter than the array axis - (or the flattened array if axis is None), it is padded with False. - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axis: Optional. Axis along which to select elements. If None, `condition` is - applied on flattened array. - - Returns: - An ndarray. - - Raises: - ValueError: if `condition` is not of rank 1. 
- """ +@np_utils.np_doc(np.compress) +def compress(condition, a, axis=None): # pylint: disable=redefined-outer-name,missing-function-docstring condition = asarray(condition, dtype=bool) a = asarray(a) @@ -563,8 +402,8 @@ def compress(condition, a, axis=None): array_ops.boolean_mask(tensor=a_t, mask=condition_t, axis=axis)) +@np_utils.np_doc(np.copy) def copy(a): - """Returns a copy of the array.""" return array(a, copy=True) @@ -611,18 +450,8 @@ def cumsum(a, axis=None, dtype=None): # pylint: disable=missing-docstring return np_utils.tensor_to_ndarray(math_ops.cumsum(a.data, axis)) +@np_utils.np_doc(np.imag) def imag(a): - """Returns imaginary parts of all elements in `a`. - - Uses `tf.imag`. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - - Returns: - An ndarray with the same shape as `a`. - """ a = asarray(a) # TODO(srbs): np.imag returns a scalar if a is a scalar, whereas we always # return an ndarray. @@ -760,6 +589,7 @@ def amin(a, axis=None, keepdims=None): preserve_bool=True) +@np_utils.np_doc(np.var) def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None): # pylint: disable=missing-docstring if dtype: working_dtype = np_utils.result_type(a, dtype) @@ -829,18 +659,8 @@ def ravel(a): # pylint: disable=missing-docstring setattr(np_arrays.ndarray, 'ravel', ravel) +@np_utils.np_doc(np.real) def real(val): - """Returns real parts of all elements in `a`. - - Uses `tf.real`. - - Args: - val: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - - Returns: - An ndarray with the same shape as `a`. - """ val = asarray(val) # TODO(srbs): np.real returns a scalar if val is a scalar, whereas we always # return an ndarray. @@ -897,7 +717,6 @@ def around(a, decimals=0): # pylint: disable=missing-docstring return np_utils.tensor_to_ndarray(a).astype(dtype) -round_ = around setattr(np_arrays.ndarray, '__round__', around) @@ -933,51 +752,20 @@ def _reshape_method_wrapper(a, *newshape, **kwargs): return reshape(a, newshape, order=order) +@np_utils.np_doc(np.expand_dims) def expand_dims(a, axis): - """Expand the shape of an array. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axis: int. axis on which to expand the shape. - - Returns: - An ndarray with the contents and dtype of `a` and shape expanded on axis. - """ a = asarray(a) return np_utils.tensor_to_ndarray(array_ops.expand_dims(a.data, axis=axis)) +@np_utils.np_doc(np.squeeze) def squeeze(a, axis=None): - """Removes single-element axes from the array. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axis: scalar or list/tuple of ints. - TODO(srbs): tf.squeeze throws error when axis is a Tensor eager execution is - enabled. So we cannot allow axis to be array_like here. Fix. - - Returns: - An ndarray. - """ a = asarray(a) return np_utils.tensor_to_ndarray(array_ops.squeeze(a, axis)) +@np_utils.np_doc(np.transpose) def transpose(a, axes=None): - """Permutes dimensions of the array. - - Args: - a: array_like. Could be an ndarray, a Tensor or any object that can be - converted to a Tensor using `tf.convert_to_tensor`. - axes: array_like. A list of ints with length rank(a) or None specifying the - order of permutation. 
The i'th dimension of the output array corresponds - to axes[i]'th dimension of the `a`. If None, the axes are reversed. - - Returns: - An ndarray. - """ a = asarray(a) if axes is not None: axes = asarray(axes) @@ -1113,37 +901,16 @@ def _setitem(arr, index, value): [prefix_t, array_ops.expand_dims(subarray.data, 0), postfix_t], 0) +# TODO(wangpeng): Make a custom `setattr` that also sets docstring for the +# method. setattr(np_arrays.ndarray, 'transpose', transpose) setattr(np_arrays.ndarray, 'reshape', _reshape_method_wrapper) setattr(np_arrays.ndarray, '__setitem__', _setitem) +@np_utils.np_doc(np.pad) def pad(ary, pad_width, mode, constant_values=0): - """Pads an array. - - Args: - ary: array_like of rank N. Input array. - pad_width: {sequence, array_like, int}. Number of values padded to the edges - of each axis. ((before_1, after_1), ... (before_N, after_N)) unique pad - widths for each axis. ((before, after),) yields same before and after pad - for each axis. (pad,) or int is a shortcut for before = after = pad width - for all axes. - mode: string. One of the following string values: 'constant' Pads with a - constant value. 'reflect' Pads with the reflection of the vector mirrored - on the first and last values of the vector along each axis. 'symmetric' - Pads with the reflection of the vector mirrored along the edge of the - array. - **NOTE**: The supported list of `mode` does not match that of numpy's. - constant_values: scalar with same dtype as `array`. Used in 'constant' mode - as the pad value. Default is 0. - - Returns: - An ndarray padded array of rank equal to `array` with shape increased - according to `pad_width`. - - Raises: - ValueError if `mode` is not supported. - """ + """Only supports modes 'constant', 'reflect' and 'symmetric' currently.""" if not (mode == 'constant' or mode == 'reflect' or mode == 'symmetric'): raise ValueError('Unsupported padding mode: ' + mode) mode = mode.upper() @@ -1214,24 +981,19 @@ def select(condlist, choicelist, default=0): # pylint: disable=missing-docstrin return output +@np_utils.np_doc(np.shape) def shape(a): - """Return the shape of an array. - - Args: - a: array_like. Input array. - - Returns: - Tuple of ints. - """ a = asarray(a) return a.shape +@np_utils.np_doc(np.ndim) def ndim(a): a = asarray(a) return a.ndim +@np_utils.np_doc(np.isscalar) def isscalar(a): return ndim(a) == 0 From 10d5e26a9009151639ee1c4812901c3ac96df304 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Tue, 16 Jun 2020 13:38:11 -0700 Subject: [PATCH 0326/1390] Internal change PiperOrigin-RevId: 316748654 Change-Id: Iec1a40e1523cbe2712e1dccf7a947de0e3156d46 --- tensorflow/lite/python/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index c1f37c81b7f..d25e7d5ef8d 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -40,7 +40,6 @@ py_test( "no_windows", "noasan", # TODO(b/137568139): enable after this is fixed. "nomsan", # TODO(b/137568139): enable after this is fixed. - "notsan", # TODO(b/149882556): enable after this is fixed. ], deps = [ ":interpreter", From 8cf2895fcce8fb47ef1b603f9784a34a72d5ae54 Mon Sep 17 00:00:00 2001 From: Suharsh Sivakumar Date: Tue, 16 Jun 2020 13:38:34 -0700 Subject: [PATCH 0327/1390] Add dynamic range test to op_tests A-B. 
PiperOrigin-RevId: 316748721 Change-Id: I2e803777a160197f3b7a6026c5e94ce11f47ab92 --- tensorflow/lite/testing/op_tests/add_n.py | 9 ++++ .../lite/testing/op_tests/arg_min_max.py | 3 +- .../testing/op_tests/batch_to_space_nd.py | 4 ++ tensorflow/lite/testing/op_tests/binary_op.py | 54 +++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/testing/op_tests/add_n.py b/tensorflow/lite/testing/op_tests/add_n.py index 2385bd89600..bef0a0632b9 100644 --- a/tensorflow/lite/testing/op_tests/add_n.py +++ b/tensorflow/lite/testing/op_tests/add_n.py @@ -32,16 +32,25 @@ def make_add_n_tests(options): "dtype": [tf.float32, tf.int32], "input_shape": [[2, 5, 3, 1]], "num_inputs": [2, 3, 4, 5], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32, tf.int32], "input_shape": [[5]], "num_inputs": [2, 3, 4, 5], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32, tf.int32], "input_shape": [[]], "num_inputs": [2, 3, 4, 5], + "dynamic_range_quantize": [False], + }, + { + "dtype": [tf.float32], + "input_shape": [[]], + "num_inputs": [2, 3, 4, 5], + "dynamic_range_quantize": [True], }, ] diff --git a/tensorflow/lite/testing/op_tests/arg_min_max.py b/tensorflow/lite/testing/op_tests/arg_min_max.py index e693ce6f44a..ec0013225e0 100644 --- a/tensorflow/lite/testing/op_tests/arg_min_max.py +++ b/tensorflow/lite/testing/op_tests/arg_min_max.py @@ -34,6 +34,7 @@ def make_arg_min_max_tests(options): "input_shape": [[], [1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]], "output_type": [tf.int32, tf.int64], "is_arg_max": [True], + "dynamic_range_quantize": [False, True], }] def build_graph(parameters): @@ -62,4 +63,4 @@ def make_arg_min_max_tests(options): test_parameters, build_graph, build_inputs, - expected_tf_failures=4) + expected_tf_failures=8) diff --git a/tensorflow/lite/testing/op_tests/batch_to_space_nd.py b/tensorflow/lite/testing/op_tests/batch_to_space_nd.py index e3f05697b0b..2180bfa5b10 100644 --- a/tensorflow/lite/testing/op_tests/batch_to_space_nd.py +++ b/tensorflow/lite/testing/op_tests/batch_to_space_nd.py @@ -36,6 +36,7 @@ def make_batch_to_space_nd_tests(options): "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]], "constant_block_shape": [True, False], "constant_crops": [True, False], + "dynamic_range_quantize": [False], }, # Single batch (no-op) { @@ -45,6 +46,7 @@ def make_batch_to_space_nd_tests(options): "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]], "constant_block_shape": [True], "constant_crops": [True], + "dynamic_range_quantize": [True, False], }, # 3D use case. 
{ @@ -54,6 +56,7 @@ def make_batch_to_space_nd_tests(options): "crops": [[[0, 0]], [[1, 1]]], "constant_block_shape": [True], "constant_crops": [True], + "dynamic_range_quantize": [True, False], }, ] @@ -66,6 +69,7 @@ def make_batch_to_space_nd_tests(options): "crops": [[[0, 0], [0, 0], [0, 0]]], "constant_block_shape": [True, False], "constant_crops": [True, False], + "dynamic_range_quantize": [False], }] def build_graph(parameters): diff --git a/tensorflow/lite/testing/op_tests/binary_op.py b/tensorflow/lite/testing/op_tests/binary_op.py index 9d0c85e35aa..17ed2f3522d 100644 --- a/tensorflow/lite/testing/op_tests/binary_op.py +++ b/tensorflow/lite/testing/op_tests/binary_op.py @@ -41,6 +41,7 @@ def make_binary_op_tests(options, "input_shape_2": [[1, 3, 4, 3]], "activation": [True], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -48,6 +49,7 @@ def make_binary_op_tests(options, "input_shape_2": [[5]], "activation": [False, True], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32, tf.int32, tf.int64], @@ -55,6 +57,7 @@ def make_binary_op_tests(options, "input_shape_2": [[3]], "activation": [True, False], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32, tf.int32], @@ -62,6 +65,7 @@ def make_binary_op_tests(options, "input_shape_2": [[1, 3, 4, 3]], "activation": [True, False], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -69,6 +73,7 @@ def make_binary_op_tests(options, "input_shape_2": [[]], "activation": [False], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -76,6 +81,7 @@ def make_binary_op_tests(options, "input_shape_2": [[1]], "activation": [False], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -83,6 +89,7 @@ def make_binary_op_tests(options, "input_shape_2": [[1, 3, 4, 3]], "activation": [False], "fully_quantize": [True], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -90,6 +97,7 @@ def make_binary_op_tests(options, "input_shape_2": [[5]], "activation": [False], "fully_quantize": [True], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -97,6 +105,7 @@ def make_binary_op_tests(options, "input_shape_2": [[3]], "activation": [False], "fully_quantize": [True], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -104,6 +113,7 @@ def make_binary_op_tests(options, "input_shape_2": [[1, 3, 4, 3]], "activation": [False], "fully_quantize": [True], + "dynamic_range_quantize": [False], }, { "dtype": [tf.float32], @@ -111,6 +121,47 @@ def make_binary_op_tests(options, "input_shape_2": [[]], "activation": [False], "fully_quantize": [True], + "dynamic_range_quantize": [False], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[1, 3, 4, 3]], + "input_shape_2": [[1, 3, 4, 3]], + "activation": [False], + "fully_quantize": [False], + "dynamic_range_quantize": [True], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[5]], + "input_shape_2": [[5]], + "activation": [False], + "fully_quantize": [False], + "dynamic_range_quantize": [True], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[1, 3, 4, 3]], + "input_shape_2": [[3]], + "activation": [False], + "fully_quantize": [False], + "dynamic_range_quantize": [True], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[3]], + "input_shape_2": [[1, 3, 4, 3]], + "activation": [False], + "fully_quantize": [False], + 
"dynamic_range_quantize": [True], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[]], + "input_shape_2": [[]], + "activation": [False], + "fully_quantize": [False], + "dynamic_range_quantize": [True], }, ] @@ -123,6 +174,7 @@ def make_binary_op_tests(options, "input_shape_2": [[7]], "activation": [False], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, ] @@ -204,6 +256,7 @@ def make_div_tests(options): "input_shape_2": [[3]], "activation": [False], "fully_quantize": [False], + "dynamic_range_quantize": [False, True], }, ] make_binary_op_tests( @@ -220,6 +273,7 @@ def make_sub_tests(options): "input_shape_2": [[3]], "activation": [False], "fully_quantize": [False], + "dynamic_range_quantize": [False, True], }, ] make_binary_op_tests( From 23910c191f29c8ea060b0d24671fccb356ed6600 Mon Sep 17 00:00:00 2001 From: Khanh LeViet Date: Tue, 16 Jun 2020 13:48:29 -0700 Subject: [PATCH 0328/1390] Recommend Netron for TF Lite model visualization PiperOrigin-RevId: 316750470 Change-Id: Id794ed7b2a8405cf5821fb106e8861d8aacef22f --- tensorflow/lite/g3doc/guide/faq.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/faq.md b/tensorflow/lite/g3doc/guide/faq.md index 197fc2d4a8f..491e3252635 100644 --- a/tensorflow/lite/g3doc/guide/faq.md +++ b/tensorflow/lite/g3doc/guide/faq.md @@ -45,30 +45,37 @@ or file a [new one](https://github.com/tensorflow/tensorflow/issues). #### How do I determine the inputs/outputs for GraphDef protocol buffer? -The easiest way to inspect a graph from a `.pb` file is to use the +The easiest way to inspect a graph from a `.pb` file is to use +[Netron](https://github.com/lutzroeder/netron), an open-source viewer for +machine learning models. + +If Netron cannot open the graph, you can try the [summarize_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md#inspecting-graphs) tool. -If that approach yields an error, you can visualize the GraphDef with +If the summarize_graph tool yields an error, you can visualize the GraphDef with [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) and look for the inputs and outputs in the graph. To visualize a `.pb` file, use the [`import_pb_to_tensorboard.py`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/import_pb_to_tensorboard.py) script like below: -```sh +```shell python import_pb_to_tensorboard.py --model_dir --log_dir ``` #### How do I inspect a `.tflite` file? -TensorFlow Lite models can be visualized using the +[Netron](https://github.com/lutzroeder/netron) is the easiest way to visualize a +TensorFlow Lite model. + +If Netron cannot open your TensorFlow Lite model, you can try the [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py) script in our repository. * [Clone the TensorFlow repository](https://www.tensorflow.org/install/source) * Run the `visualize.py` script with bazel: -```sh +```shell bazel run //tensorflow/lite/tools:visualize model.tflite visualized_model.html ``` From 295ee8ab72f907a37850a26dbaa684e4f992fa2b Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Tue, 16 Jun 2020 14:07:19 -0700 Subject: [PATCH 0329/1390] Assign to all component TPUMirroredVariables when assigning in replica context and aggregation=NONE. 
PiperOrigin-RevId: 316754219 Change-Id: I791f392b892886404cb80868368ae4a167d8b3d8 --- .../distribute/mirrored_strategy_test.py | 2 -- tensorflow/python/distribute/tpu_values.py | 30 +++++++++++++++++++ tensorflow/python/distribute/values_test.py | 22 ++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index 6009eece14e..950b6f2446b 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -635,8 +635,6 @@ class MirroredVariableUpdateTest(test.TestCase): def testAssignMirroredVarReplicaContextWithoutAggregationType(self, distribution): - # Test that we always have an aggregation type set on the mirrored variable - # if we assign to it in replica mode. def var_fn(): v = variable_scope.variable(1.0, name="foo") return v diff --git a/tensorflow/python/distribute/tpu_values.py b/tensorflow/python/distribute/tpu_values.py index 3a4290a80dd..40ab058ac7c 100644 --- a/tensorflow/python/distribute/tpu_values.py +++ b/tensorflow/python/distribute/tpu_values.py @@ -30,6 +30,7 @@ from tensorflow.python.eager import tape from tensorflow.python.framework import ops from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variable_scope from tensorflow.python.tpu import tpu @@ -173,6 +174,16 @@ class TPUMirroredVariable(TPUVariableMixin, values.MirroredVariable): """Holds a map from replica to TPU variables whose values are kept in sync.""" def assign_sub(self, value, use_locking=False, name=None, read_value=True): + if (enclosing_tpu_context() and + self.aggregation == variable_scope.VariableAggregation.NONE): + return _make_raw_assign_fn( + gen_resource_variable_ops.assign_sub_variable_op)( + self, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + assign_sub_fn = _make_raw_assign_fn( gen_resource_variable_ops.assign_sub_variable_op) return self._update( @@ -183,6 +194,16 @@ class TPUMirroredVariable(TPUVariableMixin, values.MirroredVariable): read_value=read_value) def assign_add(self, value, use_locking=False, name=None, read_value=True): + if (enclosing_tpu_context() and + self.aggregation == variable_scope.VariableAggregation.NONE): + return _make_raw_assign_fn( + gen_resource_variable_ops.assign_add_variable_op)( + self, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + assign_add_fn = _make_raw_assign_fn( gen_resource_variable_ops.assign_add_variable_op) return self._update( @@ -193,6 +214,15 @@ class TPUMirroredVariable(TPUVariableMixin, values.MirroredVariable): read_value=read_value) def assign(self, value, use_locking=False, name=None, read_value=True): + if (enclosing_tpu_context() and + self.aggregation == variable_scope.VariableAggregation.NONE): + return _make_raw_assign_fn(gen_resource_variable_ops.assign_variable_op)( + self, + value=value, + use_locking=use_locking, + name=name, + read_value=read_value) + assign_fn = _make_raw_assign_fn( gen_resource_variable_ops.assign_variable_op) return self._update( diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index 180a4d3f278..0cb4d6ddd2a 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -915,6 +915,28 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): 
sess.run(variables_lib.global_variables_initializer()) sess.run({"complicated": mirrored}) + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + ], + mode=["eager"])) + def testAssignValueInReplicaContextWithoutAggregation(self, distribution): + with distribution.scope(): + v = variables_lib.Variable(1.0, name="foo") + + @def_function.function + def mytest(): + def model_fn(): + v.assign(5.0) + return v.read_value() + + return distribution.run(model_fn) + + mytest() + self.assertAllEqual([5.0, 5.0], self.evaluate(v.values)) + @combinations.generate( combinations.combine( distribution=[ From 389405a77946410400ed410246e4cc7257802dde Mon Sep 17 00:00:00 2001 From: Suharsh Sivakumar Date: Tue, 16 Jun 2020 14:16:01 -0700 Subject: [PATCH 0330/1390] Add dynamic range tests to relevant op_tests (letter C). PiperOrigin-RevId: 316755889 Change-Id: If087d4bb5db715f2ccefa1aec15a89e9a915c5ad --- tensorflow/lite/testing/op_tests/concat.py | 13 +++++++++++-- tensorflow/lite/testing/op_tests/conv.py | 16 +++++++++++++++- .../lite/testing/op_tests/conv_activation.py | 16 +++++++++++++++- .../testing/op_tests/conv_with_shared_weights.py | 1 + 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/testing/op_tests/concat.py b/tensorflow/lite/testing/op_tests/concat.py index 1cb726ceb1d..0d6936a0e48 100644 --- a/tensorflow/lite/testing/op_tests/concat.py +++ b/tensorflow/lite/testing/op_tests/concat.py @@ -32,13 +32,22 @@ def make_concat_tests(options): "num_tensors": [1, 2, 3, 4, 5, 6], "axis": [0, 1, 2, 3, -3, -2, -1], "type": [tf.float32, tf.uint8, tf.int32, tf.int64], - "fully_quantize": [False] + "fully_quantize": [False], + "dynamic_range_quantize": [False], }, { "base_shape": [[1, 3, 4, 3], [3, 4], [2, 3, 4, 3]], "num_tensors": [1, 2, 3, 4, 5, 6], "axis": [1, 2, 3, -3, -2, -1], "type": [tf.float32], - "fully_quantize": [True] + "fully_quantize": [True], + "dynamic_range_quantize": [False], + }, { + "base_shape": [[1, 3, 4, 3]], + "num_tensors": [6], + "axis": [1], + "type": [tf.float32], + "fully_quantize": [False], + "dynamic_range_quantize": [True], }] def get_shape(parameters, delta): diff --git a/tensorflow/lite/testing/op_tests/conv.py b/tensorflow/lite/testing/op_tests/conv.py index 3a12cafe478..ee0b589b5ca 100644 --- a/tensorflow/lite/testing/op_tests/conv.py +++ b/tensorflow/lite/testing/op_tests/conv.py @@ -39,6 +39,7 @@ def make_conv_tests(options): "constant_filter": [True, False], "channel_multiplier": [1, 2], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, # TODO(b/134702301): The fully_quantize param is just ignored by the MLIR # testing path now, resulting in duplicate tests. 
Either ignore these @@ -53,7 +54,20 @@ def make_conv_tests(options): "constant_filter": [True], "channel_multiplier": [1, 2], "fully_quantize": [True], - } + "dynamic_range_quantize": [False], + }, + { + "input_shape": [[1, 3, 4, 3]], + "filter_shape": [[1, 1]], + "strides": [[1, 1, 1, 1], [1, 2, 3, 1]], + "dilations": [[1, 1, 1, 1]], + "padding": ["SAME", "VALID"], + "data_format": ["NHWC"], + "constant_filter": [True], + "channel_multiplier": [2], + "fully_quantize": [False], + "dynamic_range_quantize": [True], + }, ] def get_tensor_shapes(parameters): diff --git a/tensorflow/lite/testing/op_tests/conv_activation.py b/tensorflow/lite/testing/op_tests/conv_activation.py index b4cc4c6ba58..1ee1210ec9e 100644 --- a/tensorflow/lite/testing/op_tests/conv_activation.py +++ b/tensorflow/lite/testing/op_tests/conv_activation.py @@ -40,6 +40,7 @@ def make_conv_activation_tests(activation_op): "constant_filter": [True, False], "channel_multiplier": [1, 2], "fully_quantize": [False], + "dynamic_range_quantize": [False], }, # TODO(b/134702301): The fully_quantize param is just ignored by the # MLIR testing path now, resulting in duplicate tests. Either ignore @@ -54,7 +55,20 @@ def make_conv_activation_tests(activation_op): "constant_filter": [True], "channel_multiplier": [1, 2], "fully_quantize": [True], - } + "dynamic_range_quantize": [False], + }, + { + "input_shape": [[1, 3, 4, 3]], + "filter_shape": [[1, 1], [2, 3], [3, 3]], + "strides": [[1, 1, 1, 1], [1, 2, 3, 1]], + "dilations": [[1, 1, 1, 1]], + "padding": ["SAME", "VALID"], + "data_format": ["NHWC"], + "constant_filter": [True], + "channel_multiplier": [1, 2], + "fully_quantize": [False], + "dynamic_range_quantize": [True], + }, ] def get_tensor_shapes(parameters): diff --git a/tensorflow/lite/testing/op_tests/conv_with_shared_weights.py b/tensorflow/lite/testing/op_tests/conv_with_shared_weights.py index 6f4499af9d4..bca775ec20e 100644 --- a/tensorflow/lite/testing/op_tests/conv_with_shared_weights.py +++ b/tensorflow/lite/testing/op_tests/conv_with_shared_weights.py @@ -36,6 +36,7 @@ def make_conv_with_shared_weights_tests(options): "padding": ["SAME"], "data_format": ["NHWC"], "channel_multiplier": [1], + "dynamic_range_quantize": [False, True], }] def get_tensor_shapes(parameters): From 8019570ea041bf274b42568c65d66b388e4746af Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 14:16:21 -0700 Subject: [PATCH 0331/1390] load nnapi 1.3 memory domain functions in shim layer. PiperOrigin-RevId: 316755954 Change-Id: If4fc4a7c1001e4b47479531914d2631ee2e31fcd --- tensorflow/lite/nnapi/NeuralNetworksShim.h | 325 ++++++++++++++++++ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 72 ++++ tensorflow/lite/nnapi/nnapi_implementation.cc | 12 + tensorflow/lite/nnapi/nnapi_implementation.h | 306 ++++++++++++++++- 4 files changed, 714 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/nnapi/NeuralNetworksShim.h b/tensorflow/lite/nnapi/NeuralNetworksShim.h index 01b597ce36f..9a5c44dfd61 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksShim.h +++ b/tensorflow/lite/nnapi/NeuralNetworksShim.h @@ -1237,6 +1237,331 @@ inline int ANeuralNetworksModel_setOperandExtensionData( EXECUTE_FUNCTION_RETURN(model, index, data, length); } +/** + * Create a {@link ANeuralNetworksMemoryDesc} with no properties. + * + * This only creates the memory descriptor. 
Its properties should be set with + * calls to + * {@link ANeuralNetworksMemoryDesc_addInputRole}, + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, and + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * + * {@link ANeuralNetworksMemoryDesc_finish} must be called once all properties + * have been set. + * + * {@link ANeuralNetworksMemoryDesc_free} must be called once the memory + * descriptor is no longer needed. + * + * Available since API level 30. + * + * @param desc The {@link ANeuralNetworksMemoryDesc} to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemoryDesc_create(ANeuralNetworksMemoryDesc** desc) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_create); + EXECUTE_FUNCTION_RETURN(desc); +} + +/** + * Destroy a memory descriptor. + * + * The memory descriptor need not have been finished by a call to + * {@link ANeuralNetworksMemoryDesc_finish}. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be destroyed. Passing NULL is acceptable + * and results in no operation. + */ +inline void ANeuralNetworksMemoryDesc_free(ANeuralNetworksMemoryDesc* desc) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_free); + EXECUTE_FUNCTION(desc); +} + +/** + * Specify that a memory object will be playing the role of an output to an + * execution created from a particular compilation. + * + * The compilation and the output index fully specify an output operand. This + * function may be invoked multiple times on the same memory descriptor with + * different output operands, and the same output operand may be specified on + * multiple memory descriptors. However, specifying the same output operand on + * the same memory descriptor object more than once will return an error. + * + * The dimensions of the corresponding model operands of all the roles specified + * by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each + * other. Two dimensions are incompatible if both ranks are fully specified but + * have different values, or if there is at least one axis that is fully + * specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on the memory + * descriptor before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished + * by calling {@link ANeuralNetworksCompilation_finish}, and must outlive the + * memory descriptor. + * @param index The index of the output argument we are referencing from the + * compilation. It is an index into the outputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link + * ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. + * Describes how likely the memory is to be used in the specified role. 
This is + * provided as a hint to optimize the case when multiple roles + * prefer different memory locations or data layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemoryDesc_addOutputRole( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, int32_t index, + float frequency) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_addOutputRole); + EXECUTE_FUNCTION_RETURN(desc, compilation, index, frequency); +} + +/** + * Specify that a memory object will be playing the role of an input to an + * execution created from a particular compilation. + * + * The compilation and the input index fully specify an input operand. This + * function may be invoked multiple times on the same memory descriptor with + * different input operands, and the same input operand may be specified on + * multiple memory descriptors. However, specifying the same input operand on + * the same memory descriptor more than once will return an error. + * + * The dimensions of the corresponding model operands of all the roles specified + * by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each + * other. Two dimensions are incompatible if both ranks are fully specified but + * have different values, or if there is at least one axis that is fully + * specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on a memory + * descriptor before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished + * by calling {@link ANeuralNetworksCompilation_finish}, and must outlive the + * memory descriptor. + * @param index The index of the input argument we are referencing from the + * compilation. It is an index into the inputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link + * ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. + * Describes how likely the memory is to be used in the specified role. This is + * provided as a hint to optimize the case when different roles + * prefer different memory locations or data layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemoryDesc_addInputRole( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, uint32_t index, + float frequency) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_addInputRole); + EXECUTE_FUNCTION_RETURN(desc, compilation, index, frequency); +} + +/** + * Set the dimensional information of the memory descriptor. + * + * The specified dimensions must be compatible with the dimensions of the + * corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. 
Two dimensions are + * incompatible if both ranks are fully specified but have different values, or + * if there is at least one axis that is fully specified in both but has + * different values. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param rank The number of dimensions. Must be 0 for scalars. + * @param dimensions An array of dimensions. An entry with the value 0 indicates + * that the corresponding axis has an unknown size. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemoryDesc_setDimensions( + ANeuralNetworksMemoryDesc* desc, uint32_t rank, + const uint32_t* dimensions) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_setDimensions); + EXECUTE_FUNCTION_RETURN(desc, rank, dimensions); +} + +/** + * Indicate that we have finished modifying a memory descriptor. Required before + * calling + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * This function must only be called once for a given memory descriptor. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be finished. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemoryDesc_finish(ANeuralNetworksMemoryDesc* desc) { + LOAD_FUNCTION(ANeuralNetworksMemoryDesc_finish); + EXECUTE_FUNCTION_RETURN(desc); +} + +/** + * Creates a memory object from a memory descriptor. + * + * The memory object is created with an uninitialized buffer. A memory object + * with an uninitialized buffer may only be used according to the roles + * specified by + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, or as the destination memory + * in + * {@link ANeuralNetworksMemory_copy}. The buffer of a memory object is + * initialized after the memory object is used as an output in a successful + * execution, or used as the destination memory in a successful {@link + * ANeuralNetworksMemory_copy}. A memory object with an initialized buffer may + * be used according to all roles specified in + * {@link ANeuralNetworksMemoryDesc}, or as the source or destination memory in + * {@link ANeuralNetworksMemory_copy}. The buffer of a memory object will return + * to the uninitialized state if the memory object is used as an output in a + * failed execution, or used as the destination memory in a failed {@link + * ANeuralNetworksMemory_copy}. + * + * The dimensions of the memory descriptor are deduced from the dimensions of + * the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, as well as the dimensions + * set by the call to {@link ANeuralNetworksMemoryDesc_setDimensions}, if any. + * The memory descriptor may have unspecified dimensions or rank. In such a + * case, the same memory object may be used with different shapes of outputs in + * different executions. When the memory is used as an input, the input shape + * must be the same as the output shape from the last execution using this + * memory object as an output, or the last + * {@link ANeuralNetworkMemory_copy} using this memory object as the destination + * memory. 
Creating a memory object with unspecified dimensions or rank may fail + * for certain sets of roles. + * + * Using the memory in roles or shapes that are not compatible with the rules + * specified above will return an error. + * + * When calling {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory} with the memory object, + * both offset and length must be set to zero and the entire memory region will + * be associated with the specified input or output operand. + * + * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with the + * memory created from this function will return an error. + * + * {@link ANeuralNetworksMemory_free} must be called once the memory is no + * longer needed. + * + * Attempting to create memory from an unfinished memory descriptor will return + * an error. + * + * The provided {@link ANeuralNetworksMemoryDesc} need not outlive the + * {@link ANeuralNetworksMemory} object. + * + * Available since API level 30. + * + * @param desc The memory descriptor. + * @param memory The memory object to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful; ANEURALNETWORKS_OP_FAILED if + * the memory is created with unspecified dimensions or rank and it is not + * supported for this set of roles. + */ +inline int ANeuralNetworksMemory_createFromDesc( + const ANeuralNetworksMemoryDesc* desc, ANeuralNetworksMemory** memory) { + LOAD_FUNCTION(ANeuralNetworksMemory_createFromDesc); + EXECUTE_FUNCTION_RETURN(desc, memory); +} + +/** + * Copies data from one memory object to another. + * + * If at most one of the src and dst is created from + * {@link ANeuralNetworksMemory_createFromDesc}, the src and dst must have the + * same logical size: + * - If the memory is created from {@link ANeuralNetworksMemory_createFromFd}, + * or if it is created from {@link + * ANeuralNetworksMemory_createFromAHardwareBuffer} with format of + * AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size of the memory. + * - If the memory is created from + * {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with a format other + * than AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size when there + * is no padding and the data is tightly packed. This function may fail if the + * AHardwareBuffer cannot be accessed. + * - If the memory is created from {@link ANeuralNetworksMemory_createFromDesc}, + * the logical size equals the size indicated by the {@link OperandCode} + * multiplied by the number of elements. This function will fail if the number + * of elements is unknown. + * + * If both src and dst are created from {@link + * ANeuralNetworksMemory_createFromDesc}, they must have compatible dimensions. + * Two dimensions are incompatible if both ranks are fully specified but have + * different values, or if there is at least one axis that is fully specified in + * both but has different values. The dst may have unspecified dimensions or + * rank. In such a case, the dimensions of dst will get updated according to the + * dimensions of the src. + * + * In both cases, if the src is created from + * {@link ANeuralNetworksMemory_createFromDesc}, it must have been used as an + * output in a successful execution, or used as the destination memory in a + * successful + * {@link ANeuralNetworksMemory_copy}. + * + * The src and dst may have different data layout, in which case the data + * copying is performed logically with data layout transformation. 
+ * + * Available since API level 30. + * + * @param src The source memory object. + * @param dst The destination memory object. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +inline int ANeuralNetworksMemory_copy(const ANeuralNetworksMemory* src, + const ANeuralNetworksMemory* dst) { + LOAD_FUNCTION(ANeuralNetworksMemory_copy); + EXECUTE_FUNCTION_RETURN(src, dst); +} + /**/ #endif // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_ diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index 6739838e4d1..3c30a0479fa 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -226,6 +226,50 @@ enum { ANEURALNETWORKS_PRIORITY_HIGH = 110, ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM, }; +/** + * ANeuralNetworksMemoryDesc is an opaque type that represents a memory + * descriptor. + * + * A memory descriptor describes the properties of a memory object, and is used + * by + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * To use: + * - Create a new memory descriptor by calling + * {@link ANeuralNetworksMemoryDesc_create}. + * - Specify all of the intended input and output roles by calling + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. + * - Optionally, specify the memory dimensions by calling + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * - Complete the memory descriptor with {@link + * ANeuralNetworksMemoryDesc_finish}. + * - Use the memory descriptor as many times as needed with + * {@link ANeuralNetworksMemory_createFromDesc}. + * - Destroy the memory descriptor with {@link + * ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor is completed by calling {@link + * ANeuralNetworksMemoryDesc_finish}. A memory descriptor is destroyed by + * calling {@link ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor must not be modified once + * {@link ANeuralNetworksMemoryDesc_finish} + * has been called on it. + * + * It is the application's responsibility to make sure that only + * one thread modifies a memory descriptor at a given time. It is however + * safe for more than one thread to use the memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has returned. + * + * It is also the application's responsibility to ensure that there are no other + * uses of the memory descriptor after calling {@link + * ANeuralNetworksMemoryDesc_free}. It is however safe to continue using a + * {@link ANeuralNetworksMemory} object created from the memory descriptor. + * + * Available since API level 30. + */ +typedef struct ANeuralNetworksMemoryDesc ANeuralNetworksMemoryDesc; /** * ANeuralNetworksMemory is an opaque type that represents memory. 
@@ -604,4 +648,32 @@ typedef int (*ANeuralNetworksModel_setOperandExtensionData_fn)( ANeuralNetworksModel* model, int32_t index, const void* data, size_t length); +typedef int (*ANeuralNetworksMemoryDesc_create_fn)( + ANeuralNetworksMemoryDesc** desc); + +typedef void (*ANeuralNetworksMemoryDesc_free_fn)( + ANeuralNetworksMemoryDesc* desc); + +typedef int (*ANeuralNetworksMemoryDesc_addInputRole_fn)( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, int32_t index, + float frequency); + +typedef int (*ANeuralNetworksMemoryDesc_addOutputRole_fn)( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, uint32_t index, + float frequency); + +typedef int (*ANeuralNetworksMemoryDesc_setDimensions_fn)( + ANeuralNetworksMemoryDesc* desc, uint32_t rank, const uint32_t* dimensions); + +typedef int (*ANeuralNetworksMemoryDesc_finish_fn)( + ANeuralNetworksMemoryDesc* desc); + +typedef int (*ANeuralNetworksMemory_createFromDesc_fn)( + const ANeuralNetworksMemoryDesc* desc, ANeuralNetworksMemory** memory); + +typedef int (*ANeuralNetworksMemory_copy_fn)(const ANeuralNetworksMemory* src, + const ANeuralNetworksMemory* dst); + #endif // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_ diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc index ad5869fec04..862c4ba2499 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.cc +++ b/tensorflow/lite/nnapi/nnapi_implementation.cc @@ -225,6 +225,18 @@ const NnApi LoadNnApi() { ANeuralNetworksExecution_setTimeout); LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksExecution_setLoopTimeout); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksMemoryDesc_create); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksMemoryDesc_free); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksMemoryDesc_addInputRole); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksMemoryDesc_addOutputRole); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksMemoryDesc_setDimensions); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksMemoryDesc_finish); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksMemory_createFromDesc); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksMemory_copy); return nnapi; } diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h index abee0fbdef3..9f481cded9b 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.h +++ b/tensorflow/lite/nnapi/nnapi_implementation.h @@ -1225,7 +1225,311 @@ struct NnApi { ANeuralNetworksModel* model, int32_t index, const void* data, size_t length); - /**/ + /** + * Create a {@link ANeuralNetworksMemoryDesc} with no properties. + * + * This only creates the memory descriptor. Its properties should be set with + * calls to + * {@link ANeuralNetworksMemoryDesc_addInputRole}, + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, and + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * + * {@link ANeuralNetworksMemoryDesc_finish} must be called once all properties + * have been set. + * + * {@link ANeuralNetworksMemoryDesc_free} must be called once the memory + * descriptor is no longer needed. + * + * Available since API level 30. + * + * @param desc The {@link ANeuralNetworksMemoryDesc} to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. 
+ */ + int (*ANeuralNetworksMemoryDesc_create)(ANeuralNetworksMemoryDesc** desc); + + /** + * Destroy a memory descriptor. + * + * The memory descriptor need not have been finished by a call to + * {@link ANeuralNetworksMemoryDesc_finish}. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be destroyed. Passing NULL is + * acceptable and results in no operation. + */ + void (*ANeuralNetworksMemoryDesc_free)(ANeuralNetworksMemoryDesc* desc); + + /** + * Specify that a memory object will be playing the role of an input to an + * execution created from a particular compilation. + * + * The compilation and the input index fully specify an input operand. This + * function may be invoked multiple times on the same memory descriptor with + * different input operands, and the same input operand may be specified on + * multiple memory descriptors. However, specifying the same input operand on + * the same memory descriptor more than once will return an error. + * + * The dimensions of the corresponding model operands of all the roles + * specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with + * each other. Two dimensions are incompatible if both ranks are fully + * specified but have different values, or if there is at least one axis that + * is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on a memory + * descriptor before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been + * finished by calling {@link ANeuralNetworksCompilation_finish}, and must + * outlive the memory descriptor. + * @param index The index of the input argument we are referencing from the + * compilation. It is an index into the inputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is + * not the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. + * Describes how likely the memory is to be used in the specified role. This + * is provided as a hint to optimize the case when different roles prefer + * different memory locations or data layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksMemoryDesc_addInputRole)( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, int32_t index, + float frequency); + + /** + * Specify that a memory object will be playing the role of an output to an + * execution created from a particular compilation. + * + * The compilation and the output index fully specify an output operand. This + * function may be invoked multiple times on the same memory descriptor with + * different output operands, and the same output operand may be specified on + * multiple memory descriptors. 
However, specifying the same output operand on + * the same memory descriptor object more than once will return an error. + * + * The dimensions of the corresponding model operands of all the roles + * specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with + * each other. Two dimensions are incompatible if both ranks are fully + * specified but have different values, or if there is at least one axis that + * is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on the + * memory descriptor before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been + * finished by calling {@link ANeuralNetworksCompilation_finish}, and must + * outlive the memory descriptor. + * @param index The index of the output argument we are referencing from the + * compilation. It is an index into the outputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is + * not the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. + * Describes how likely the memory is to be used in the specified role. This + * is provided as a hint to optimize the case when multiple roles prefer + * different memory locations or data layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksMemoryDesc_addOutputRole)( + ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, uint32_t index, + float frequency); + + /** + * Set the dimensional information of the memory descriptor. + * + * The specified dimensions must be compatible with the dimensions of the + * corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. Two dimensions are + * incompatible if both ranks are fully specified but have different values, + * or if there is at least one axis that is fully specified in both but has + * different values. + * + * Attempting to modify a memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has been called will return an + * error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param rank The number of dimensions. Must be 0 for scalars. + * @param dimensions An array of dimensions. An entry with the value 0 + * indicates that the corresponding axis has an unknown size. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksMemoryDesc_setDimensions)( + ANeuralNetworksMemoryDesc* desc, uint32_t rank, + const uint32_t* dimensions); + + /** + * Indicate that we have finished modifying a memory descriptor. Required + * before calling + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * This function must only be called once for a given memory descriptor. 
+ * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be finished. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksMemoryDesc_finish)(ANeuralNetworksMemoryDesc* desc); + + /** + * Creates a memory object from a memory descriptor. + * + * The memory object is created with an uninitialized buffer. A memory object + * with an uninitialized buffer may only be used according to the roles + * specified by + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, or as the destination + * memory in + * {@link ANeuralNetworksMemory_copy}. The buffer of a memory object is + * initialized after the memory object is used as an output in a successful + * execution, or used as the destination memory in a successful {@link + * ANeuralNetworksMemory_copy}. A memory object with an initialized buffer may + * be used according to all roles specified in + * {@link ANeuralNetworksMemoryDesc}, or as the source or destination memory + * in + * {@link ANeuralNetworksMemory_copy}. The buffer of a memory object will + * return to the uninitialized state if the memory object is used as an output + * in a failed execution, or used as the destination memory in a failed {@link + * ANeuralNetworksMemory_copy}. + * + * The dimensions of the memory descriptor are deduced from the dimensions of + * the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, as well as the dimensions + * set by the call to {@link ANeuralNetworksMemoryDesc_setDimensions}, if any. + * The memory descriptor may have unspecified dimensions or rank. In such a + * case, the same memory object may be used with different shapes of outputs + * in different executions. When the memory is used as an input, the input + * shape must be the same as the output shape from the last execution using + * this memory object as an output, or the last + * {@link ANeuralNetworkMemory_copy} using this memory object as the + * destination memory. Creating a memory object with unspecified dimensions or + * rank may fail for certain sets of roles. + * + * Using the memory in roles or shapes that are not compatible with the rules + * specified above will return an error. + * + * When calling {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory} with the memory + * object, both offset and length must be set to zero and the entire memory + * region will be associated with the specified input or output operand. + * + * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with the + * memory created from this function will return an error. + * + * {@link ANeuralNetworksMemory_free} must be called once the memory is no + * longer needed. + * + * Attempting to create memory from an unfinished memory descriptor will + * return an error. + * + * The provided {@link ANeuralNetworksMemoryDesc} need not outlive the + * {@link ANeuralNetworksMemory} object. + * + * Available since API level 30. + * + * @param desc The memory descriptor. + * @param memory The memory object to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful; ANEURALNETWORKS_OP_FAILED + * if the memory is created with unspecified dimensions or rank and it is not + * supported for this set of roles. 
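+   *
+   * A minimal usage sketch (illustration only; compilation_a and
+   * compilation_b stand for already finished
+   * {@link ANeuralNetworksCompilation} objects, the operand indices are
+   * assumed for this sketch, and error checking is omitted):
+   *
+   *   ANeuralNetworksMemoryDesc* desc = nullptr;
+   *   ANeuralNetworksMemoryDesc_create(&desc);
+   *   // The memory will first be written as output 0 of one compilation,
+   *   // then read as input 0 of another.
+   *   ANeuralNetworksMemoryDesc_addOutputRole(desc, compilation_a, 0, 1.0f);
+   *   ANeuralNetworksMemoryDesc_addInputRole(desc, compilation_b, 0, 1.0f);
+   *   ANeuralNetworksMemoryDesc_finish(desc);
+   *   ANeuralNetworksMemory* memory = nullptr;
+   *   ANeuralNetworksMemory_createFromDesc(desc, &memory);
+   *   // The descriptor need not outlive the memory object.
+   *   ANeuralNetworksMemoryDesc_free(desc);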
+ */ + int (*ANeuralNetworksMemory_createFromDesc)( + const ANeuralNetworksMemoryDesc* desc, ANeuralNetworksMemory** memory); + + /** + * Copies data from one memory object to another. + * + * If at most one of the src and dst is created from + * {@link ANeuralNetworksMemory_createFromDesc}, the src and dst must have the + * same logical size: + * - If the memory is created from {@link ANeuralNetworksMemory_createFromFd}, + * or if it is created from {@link + * ANeuralNetworksMemory_createFromAHardwareBuffer} with format of + * AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size of the + * memory. + * - If the memory is created from + * {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with a format + * other than AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size + * when there is no padding and the data is tightly packed. This function may + * fail if the AHardwareBuffer cannot be accessed. + * - If the memory is created from {@link + * ANeuralNetworksMemory_createFromDesc}, the logical size equals the size + * indicated by the {@link OperandCode} multiplied by the number of elements. + * This function will fail if the number of elements is unknown. + * + * If both src and dst are created from {@link + * ANeuralNetworksMemory_createFromDesc}, they must have compatible + * dimensions. Two dimensions are incompatible if both ranks are fully + * specified but have different values, or if there is at least one axis that + * is fully specified in both but has different values. The dst may have + * unspecified dimensions or rank. In such a case, the dimensions of dst will + * get updated according to the dimensions of the src. + * + * In both cases, if the src is created from + * {@link ANeuralNetworksMemory_createFromDesc}, it must have been used as an + * output in a successful execution, or used as the destination memory in a + * successful + * {@link ANeuralNetworksMemory_copy}. + * + * The src and dst may have different data layout, in which case the data + * copying is performed logically with data layout transformation. + * + * Available since API level 30. + * + * @param src The source memory object. + * @param dst The destination memory object. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksMemory_copy)(const ANeuralNetworksMemory* src, + const ANeuralNetworksMemory* dst); }; /** From 4334d8b06f333bee5dc17c942ea41beb5bd9c32d Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Tue, 16 Jun 2020 14:26:13 -0700 Subject: [PATCH 0332/1390] Fixing build issue on windows --- third_party/mkl_dnn/mkldnn_v1.BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index 47a7efecda3..7bdec138b99 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -71,6 +71,8 @@ cc_library( "src/cpu/**/*.cpp", "src/cpu/**/*.hpp", "src/cpu/xbyak/*.h", + "src/cpu/jit_utils/jitprofiling/*.c", + "src/cpu/jit_utils/jitprofiling/*.h", ]) + [ ":dnnl_config_h", ":dnnl_version_h", From aea06cde550055304a2c1b5f23d1ff1dd9a24107 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Tue, 16 Jun 2020 14:50:48 -0700 Subject: [PATCH 0333/1390] Update tolerance for Log1p test for Windows Build. 
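For context, a rough bound (assuming IEEE single precision): float32 machine
epsilon is 2^-23 ~= 1.19e-7, so an rtol of 1e-7 is tighter than one ulp near
1.0, while 2e-7 still allows only about 1.7 ulp of platform-dependent rounding
difference (e.g. a different libm or FMA contraction in the Windows toolchain).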
PiperOrigin-RevId: 316762917 Change-Id: Ia6620bb7e03bd6f71db0a8090b79eef78c0cb15d --- tensorflow/compiler/tests/special_math_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/tests/special_math_test.py b/tensorflow/compiler/tests/special_math_test.py index 246ab2a1641..bd105bb5e95 100644 --- a/tensorflow/compiler/tests/special_math_test.py +++ b/tensorflow/compiler/tests/special_math_test.py @@ -106,13 +106,13 @@ class Log1pTest(xla_test.XLATestCase, parameterized.TestCase): self._test_range(-40., -20., dtype, rtol, atol, is_negative=False) self._test_range(-40., -20., dtype, rtol, atol, is_negative=True) - @parameterized.parameters((np.float32, 1e-7, 0.), + @parameterized.parameters((np.float32, 2e-7, 0.), (np.float64, 1e-15, 0.)) def testGreaterThanNegativeTwentyExponent(self, dtype, rtol, atol): self._test_range(-20., -10., dtype, rtol, atol, is_negative=False) self._test_range(-20., -10., dtype, rtol, atol, is_negative=True) - @parameterized.parameters((np.float32, 1e-7, 0.), + @parameterized.parameters((np.float32, 2e-7, 0.), (np.float64, 1e-15, 0.)) def testGreaterThanNegativeTenExponent(self, dtype, rtol, atol): self._test_range(-10., -5., dtype, rtol, atol, is_negative=False) From dd789e77833942bf7ffccb9e2e2e93e1c6dff436 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 16 Jun 2020 15:01:16 -0700 Subject: [PATCH 0334/1390] Remove XPlane to trace viewer PID mapping from XPlane schema PiperOrigin-RevId: 316764981 Change-Id: If8e23e0136493dc721fc41095fd9425b8226b060 --- tensorflow/core/profiler/convert/BUILD | 7 +- .../convert/xplane_to_trace_events.cc | 106 ++++++++++-------- .../convert/xplane_to_trace_events_test.cc | 16 +-- .../core/profiler/internal/cpu/host_tracer.cc | 1 - .../internal/cpu/metadata_collector.cc | 1 - tensorflow/core/profiler/internal/gpu/BUILD | 7 +- .../profiler/internal/gpu/device_tracer.cc | 4 +- .../internal/gpu/device_tracer_test.cc | 2 - tensorflow/core/profiler/utils/BUILD | 3 + tensorflow/core/profiler/utils/trace_utils.h | 15 ++- .../core/profiler/utils/xplane_schema.cc | 9 -- .../core/profiler/utils/xplane_schema.h | 15 --- 12 files changed, 93 insertions(+), 93 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 5f287a14267..abf0176bf6f 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -377,7 +377,9 @@ cc_library( "//tensorflow/core/profiler/protobuf:trace_events_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:tf_xplane_visitor", + "//tensorflow/core/profiler/utils:trace_utils", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -390,17 +392,14 @@ tf_cc_test( srcs = ["xplane_to_trace_events_test.cc"], deps = [ ":xplane_to_trace_events", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/profiler/protobuf:trace_events_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler/utils:trace_utils", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", ], ) diff --git 
a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc index f4a0145d8f6..882f50e6080 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc @@ -28,7 +28,9 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/trace_events.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { @@ -36,25 +38,63 @@ namespace profiler { namespace { -Device BuildDeviceAndResource(const XPlaneVisitor& plane) { - Device device; - device.set_name(std::string(plane.Name())); - device.set_device_id(plane.Id()); +void BuildDeviceAndResources(uint32 device_id, const XPlaneVisitor& plane, + Device* device) { + device->set_name(std::string(plane.Name())); + device->set_device_id(device_id); - bool sort_by_ordinal = (plane.Name() == kHostThreadsPlaneName); + bool sort_by_ordinal = (device_id == kHostThreadsDeviceId); int ordinal = 0; plane.ForEachLine([&](const XLineVisitor& line) { - Resource resource; - resource.set_resource_id(line.Id()); + uint32 resource_id = line.DisplayId(); + Resource& resource = (*device->mutable_resources())[resource_id]; + resource.set_resource_id(resource_id); resource.set_name(std::string(line.DisplayName())); if (sort_by_ordinal) { // When sort_index is absent (i.e. 0), resource id will be used. // Therefore sort_index starts with 1. resource.set_sort_index(++ordinal); } - (*device.mutable_resources())[line.Id()] = resource; }); - return device; +} + +void ConvertXPlaneToTraceEvents(uint32 device_id, const XPlaneVisitor& xplane, + Trace* trace) { + // Convert devices and resources. + BuildDeviceAndResources(device_id, xplane, + &(*trace->mutable_devices())[device_id]); + + // Convert events. 
+ xplane.ForEachLine([device_id, trace](const XLineVisitor& xline) { + uint32 resource_id = xline.DisplayId(); + xline.ForEachEvent( + [device_id, resource_id, trace](const XEventVisitor& xevent) { + int64 event_type = + xevent.Type().value_or(HostEventType::kUnknownHostEventType); + if (event_type == HostEventType::kMemoryAllocation || + event_type == HostEventType::kMemoryDeallocation) { + return; + } + auto* event = trace->add_trace_events(); + auto& args = *event->mutable_args(); + event->set_device_id(device_id); + event->set_resource_id(resource_id); + if (xevent.HasDisplayName()) { + event->set_name(std::string(xevent.DisplayName())); + args["long_name"] = std::string(xevent.Name()); + } else { + event->set_name(std::string(xevent.Name())); + } + event->set_timestamp_ps(xevent.TimestampPs()); + event->set_duration_ps(xevent.DurationPs()); + + xevent.ForEachStat([&](const XStatVisitor& stat) { + if (stat.ValueCase() == XStat::VALUE_NOT_SET) return; + if (IsInternalStat(stat.Type())) return; + args[std::string(stat.Name())] = stat.ToString(); + }); + }); + }); } } // namespace @@ -81,44 +121,18 @@ void MaybeDropEventsForTraceViewer(Trace* trace, uint32 limit) { } void ConvertXSpaceToTraceEvents(const XSpace& xspace, Trace* trace) { - auto* trace_devices = trace->mutable_devices(); + const XPlane* host_plane = FindPlaneWithName(xspace, kHostThreadsPlaneName); + if (host_plane != nullptr) { + XPlaneVisitor xplane = CreateTfXPlaneVisitor(host_plane); + ConvertXPlaneToTraceEvents(kHostThreadsDeviceId, xplane, trace); + } - for (const auto& raw_plane : xspace.planes()) { - XPlaneVisitor xplane = CreateTfXPlaneVisitor(&raw_plane); - // Convert devices and resources. - int64 device_id = xplane.Id(); - (*trace_devices)[device_id] = BuildDeviceAndResource(xplane); - - // Convert events. - xplane.ForEachLine([&](const XLineVisitor& xline) { - int64 resource_id = xline.Id(); // Either thread id or CUDA stream id. 
- xline.ForEachEvent([&](const XEventVisitor& xevent) { - int64 event_type = - xevent.Type().value_or(HostEventType::kUnknownHostEventType); - if (event_type == HostEventType::kMemoryAllocation || - event_type == HostEventType::kMemoryDeallocation) { - return; - } - auto* event = trace->add_trace_events(); - auto& args = *event->mutable_args(); - event->set_device_id(device_id); - event->set_resource_id(resource_id); - if (xevent.HasDisplayName()) { - event->set_name(std::string(xevent.DisplayName())); - args["long_name"] = std::string(xevent.Name()); - } else { - event->set_name(std::string(xevent.Name())); - } - event->set_timestamp_ps(xevent.TimestampPs()); - event->set_duration_ps(xevent.DurationPs()); - - xevent.ForEachStat([&](const XStatVisitor& stat) { - if (stat.ValueCase() == XStat::VALUE_NOT_SET) return; - if (IsInternalStat(stat.Type())) return; - args[std::string(stat.Name())] = stat.ToString(); - }); - }); - }); + const std::vector device_planes = + FindPlanesWithPrefix(xspace, kGpuPlanePrefix); + for (const XPlane* device_plane : device_planes) { + XPlaneVisitor xplane = CreateTfXPlaneVisitor(device_plane); + uint32 device_id = kFirstDeviceId + xplane.Id(); + ConvertXPlaneToTraceEvents(device_id, xplane, trace); } // Trace viewer (non-streaming) has scalability issues, we need to drop diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc b/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc index b9a9fe09981..1e0d27ae68a 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc @@ -18,7 +18,9 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/profiler/protobuf/trace_events.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" namespace tensorflow { namespace profiler { @@ -26,10 +28,7 @@ namespace { void CreateXSpace(XSpace* space) { XPlaneBuilder host_plane(space->add_planes()); - XPlaneBuilder device_plane(space->add_planes()); - - host_plane.SetName("cpu"); - host_plane.SetId(0); + host_plane.SetName(kHostThreadsPlaneName); XLineBuilder thread1 = host_plane.GetOrCreateLine(10); thread1.SetName("thread1"); XEventBuilder event1 = @@ -47,8 +46,9 @@ void CreateXSpace(XSpace* space) { event2.ParseAndAddStatValue(*host_plane.GetOrCreateStatMetadata("tf_op"), "Conv2D"); - device_plane.SetName("gpu:0"); - device_plane.SetId(1); + XPlaneBuilder device_plane(space->add_planes()); + device_plane.SetName(GpuPlaneName(0)); + device_plane.SetId(0); XLineBuilder stream1 = device_plane.GetOrCreateLine(30); stream1.SetName("gpu stream 1"); XEventBuilder event3 = @@ -67,8 +67,8 @@ TEST(ConvertXPlaneToTraceEvents, Convert) { ConvertXSpaceToTraceEvents(xspace, &trace); ASSERT_EQ(trace.devices_size(), 2); - EXPECT_EQ(trace.devices().at(0).resources_size(), 2); - EXPECT_EQ(trace.devices().at(1).resources_size(), 1); + EXPECT_EQ(trace.devices().at(kHostThreadsDeviceId).resources_size(), 2); + EXPECT_EQ(trace.devices().at(kFirstDeviceId).resources_size(), 1); EXPECT_EQ(trace.trace_events_size(), 3); } diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 37f7baca1d3..fa21df004df 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ 
b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -147,7 +147,6 @@ Status HostTracer::CollectData(XSpace* space) { } MakeCompleteEvents(&events_); XPlane* plane = FindOrAddMutablePlaneWithName(space, kHostThreadsPlaneName); - plane->set_id(kHostPlaneId); ConvertCompleteEventsToXPlane(start_timestamp_ns_, events_, plane); events_.clear(); return Status::OK(); diff --git a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc index 58e6385a7ec..c9a593f101b 100644 --- a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc +++ b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc @@ -66,7 +66,6 @@ class MetadataCollector : public ProfilerInterface { Status CollectData(XSpace* space) override { if (!debug_info_.empty()) { XPlane* plane = FindOrAddMutablePlaneWithName(space, kMetadataPlaneName); - plane->set_id(kMetadataPlaneId); XPlaneBuilder xplane(plane); const XStatMetadata& hlo_proto_stat = *xplane.GetOrCreateStatMetadata(kHloProto); diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 987bc5ea336..dd8ff1e9a27 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -36,17 +36,21 @@ tf_cuda_library( deps = [ ":cupti_utils", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/internal:annotation_stack", "//tensorflow/core/profiler/internal:parse_annotation", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", - "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "@com_google_absl//absl/container:fixed_array", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", ], alwayslink = 1, @@ -145,4 +149,5 @@ tf_cuda_library( ":cupti_wrapper", ] + tf_additional_cupti_utils_cuda_deps(), visibility = ["//visibility:public"], + alwayslink = 1, ) diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index bc9952302e8..73d2a278ea4 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/internal/parse_annotation.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" @@ -225,11 +226,10 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { uint64 end_gpu_ns = CuptiTracer::GetTimestamp(); XPlaneBuilder host_plane( FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName)); - host_plane.SetId(kCuptiDriverApiPlaneId); for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) { std::string name = GpuPlaneName(device_ordinal); XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name)); - device_plane.SetId(kGpuPlaneBaseId + device_ordinal); + device_plane.SetId(device_ordinal); per_device_collector_[device_ordinal].Flush(start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane); per_device_collector_[device_ordinal].GetDeviceCapabilities( diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc index 6fc19e776e1..973167ff51b 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc @@ -263,12 +263,10 @@ TEST_F(DeviceTracerTest, TraceToXSpace) { // At least one gpu plane and one host plane for launching events. const XPlane* host_plane = FindPlaneWithName(space, kCuptiDriverApiPlaneName); ASSERT_NE(host_plane, nullptr); - EXPECT_EQ(host_plane->id(), kCuptiDriverApiPlaneId); const XPlane* device_plane = FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0)); ASSERT_NE(device_plane, nullptr); // Check if device plane is serialized. - EXPECT_EQ(device_plane->id(), kGpuPlaneBaseId); // one for MemcpyH2D, one for MemcpyD2H, two for Matmul (one from Eigen, one // from cudnn). EXPECT_EQ(device_plane->event_metadata_size(), 4); diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index 6942f3ea306..ece58802661 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -145,6 +145,9 @@ cc_library( cc_library( name = "trace_utils", hdrs = ["trace_utils.h"], + deps = [ + "//tensorflow/core:lib", + ], ) cc_library( diff --git a/tensorflow/core/profiler/utils/trace_utils.h b/tensorflow/core/profiler/utils/trace_utils.h index 024330faa79..72e6f23feb9 100644 --- a/tensorflow/core/profiler/utils/trace_utils.h +++ b/tensorflow/core/profiler/utils/trace_utils.h @@ -16,11 +16,20 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_TRACE_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_TRACE_UTILS_H_ +#include "tensorflow/core/platform/types.h" + namespace tensorflow { namespace profiler { -// The thread id used for step information in GPU trace viewer. -// First derived stream/thread id. +// Constants used as trace_viewer PID (device_id in trace_events.proto). +// PID 0 is unused. +// Support up to 500 accelerator devices. +constexpr uint32 kFirstDeviceId = 1; +constexpr uint32 kLastDeviceId = 500; +// Host threads are shown as a single fake device. +constexpr uint32 kHostThreadsDeviceId = kLastDeviceId + 1; + +// Constants used as trace_viewer TID (resource_id in trace_events.proto). 
constexpr int kThreadIdDerivedMin = 0xdeadbeef; constexpr int kThreadIdStepInfo = kThreadIdDerivedMin; constexpr int kThreadIdKernelLaunch = kThreadIdDerivedMin + 1; @@ -29,8 +38,6 @@ constexpr int kThreadIdTfOp = kThreadIdDerivedMin + 3; constexpr int kThreadIdHloModule = kThreadIdDerivedMin + 4; constexpr int kThreadIdHloOp = kThreadIdDerivedMin + 5; constexpr int kThreadIdOverhead = kThreadIdDerivedMin + 6; - -// Last derived stream/thread id. constexpr int kThreadIdDerivedMax = kThreadIdOverhead; static inline bool IsDerivedThreadId(int thread_id) { diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 2c79df7980f..197dab75d3b 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -39,15 +39,6 @@ const absl::string_view kXlaModuleLineName = "XLA Modules"; const absl::string_view kXlaOpLineName = "XLA Ops"; const absl::string_view kKernelLaunchLineName = "Launch Stats"; -const int32 kHostPlaneId = 49; -const int32 kGpuPlaneBaseId = 0; -const int32 kCuptiDriverApiPlaneId = 50; -const int32 kMetadataPlaneId = 99; -const int32 kTFStreamzPlaneId = 98; - -const int32 kThreadGroupMinPlaneId = kCuptiDriverApiPlaneId + 1; -const int32 kThreadGroupMaxPlaneId = kTFStreamzPlaneId - 1; - namespace { constexpr int kNumHostEventTypes = diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index a045e20d8de..8b999dc6f9f 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -45,21 +45,6 @@ ABSL_CONST_INIT extern const absl::string_view kXlaModuleLineName; ABSL_CONST_INIT extern const absl::string_view kXlaOpLineName; ABSL_CONST_INIT extern const absl::string_view kKernelLaunchLineName; -// Id of XPlane that contains TraceMe events. -ABSL_CONST_INIT extern const int32 kHostPlaneId; -// Ids prefix of XPlane that contains GPU events. -ABSL_CONST_INIT extern const int32 kGpuPlaneBaseId; -// Id of XPlane that contains CUPTI driver API generated events which happens -// on CPU host threads, e.g. Kernel launch. -ABSL_CONST_INIT extern const int32 kCuptiDriverApiPlaneId; -// Id of XPlane that contains profile metadata such as XLA debug info. -ABSL_CONST_INIT extern const int32 kMetadataPlaneId; -// Id of XPlane that contains kpi related metrics. -ABSL_CONST_INIT extern const int32 kTFStreamzPlaneId; - -ABSL_CONST_INIT extern const int32 kThreadGroupMinPlaneId; -ABSL_CONST_INIT extern const int32 kThreadGroupMaxPlaneId; - // Interesting event types (i.e., TraceMe names). enum HostEventType { kFirstHostEventType = 0, From d4c2030375551acc9ad36356e090dcc1af8baa51 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Tue, 16 Jun 2020 15:01:21 -0700 Subject: [PATCH 0335/1390] [XLA] Make IdenticalSlowPath more strict - Map did not check |dimensions|. This has not resulted in miscompliation because none of the backends/frontends support/generate map's with dimensions. - DynamicSlice did not check |dynamic_slice_sizes|. This has not resulted in miscompliation because it is not possible for two DynamicSlice operations with different |dynamic_slice_sizes| to have the same shape. - HloCollective did not check |constrain_layout|. 
PiperOrigin-RevId: 316765004 Change-Id: Id2629fef71446842eeae18901142a502a634b010 --- tensorflow/compiler/xla/service/hlo_instructions.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index bcc00d806da..2a53841fd34 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -550,6 +550,7 @@ bool HloCollectiveInstruction::IdenticalSlowPath( const auto& casted_other = static_cast(other); return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && + constrain_layout() == casted_other.constrain_layout() && absl::c_equal(replica_groups(), casted_other.replica_groups(), [](const ReplicaGroup& a, const ReplicaGroup& b) { return absl::c_equal(a.replica_ids(), b.replica_ids()); @@ -1101,7 +1102,9 @@ bool HloMapInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { - return eq_computations(to_apply(), other.to_apply()); + const auto& casted_other = static_cast(other); + return eq_computations(to_apply(), casted_other.to_apply()) && + dimensions() == casted_other.dimensions(); } std::unique_ptr HloMapInstruction::CloneWithNewOperandsImpl( @@ -2515,7 +2518,8 @@ bool HloDynamicSliceInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { - return true; + const auto& casted_other = static_cast(other); + return dynamic_slice_sizes() == casted_other.dynamic_slice_sizes(); } std::unique_ptr From ef68eff5378aa52ec52064a0b630b79876e90c5a Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 15:06:59 -0700 Subject: [PATCH 0336/1390] Some fixes for private mem broadcast convolution. 
PiperOrigin-RevId: 316766133 Change-Id: Icf2273be31c299c49664b9b9fe17df55a0c53693 --- .../delegates/gpu/cl/kernels/conv_powervr.cc | 87 ++++++++++--------- .../delegates/gpu/cl/kernels/conv_powervr.h | 33 +++++++ 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index 6ab22bf545d..363a0157420 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -196,6 +196,9 @@ absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) { creation_context.device->IsPowerVR()) { options.push_back(CompilerOptions::POWERVR_FP16); } + if (conv_params_.IsPrivateMemBroadcast()) { + options.push_back(CompilerOptions::CL_2_0); + } return creation_context.cache->GetOrCreateCLKernel( code, "main_function", options, *creation_context.context, *creation_context.device, &kernel_); @@ -311,37 +314,10 @@ std::string GenerateConv( const int local_mem_size = conv_params.block_size.z * 4 * conv_params.src_depth_loop_size; - const bool use_simd_broadcast = - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST || - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST || - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST || - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST || - conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST; + const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast(); + const int simd_size = conv_params.GetSimdSize(); - int simd_size = 1; - if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST) { - simd_size = 8; - } else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST) { - simd_size = 16; - } else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST) { - simd_size = 32; - } else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST) { - simd_size = 64; - } else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST) { - simd_size = 128; - } - - bool late_oob_check = need_local_mem || use_simd_broadcast; + const bool late_oob_check = need_local_mem || use_simd_broadcast; const std::string weights_space = conv_params.weights_upload_type == @@ -355,6 +331,12 @@ std::string GenerateConv( const std::string weights_global_ptr = weights_space + " " + weights_data_type + "*"; + if (use_simd_broadcast) { + if (device.cl_version() == OpenCLVersion::CL_2_0) { + c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"; + } + } + const int3 work_group_size = conv_params.work_group_size; const int3 block_size = conv_params.block_size; if (conv_params.fixed_work_group_size) { @@ -364,7 +346,7 @@ std::string GenerateConv( std::to_string(work_group_size.z) + ")))\n"; } if (use_simd_broadcast && device.IsIntel()) { - c += "__attribute__((intel_reqd_work_group_size(" + + c += "__attribute__((intel_reqd_sub_group_size(" + std::to_string(simd_size) + ")))\n"; } c += "__kernel void main_function(\n"; @@ -408,6 +390,9 @@ std::string GenerateConv( std::to_string(work_group_size.x) + " + get_local_id(0);\n"; } } + if 
(use_simd_broadcast) { + c += " int simd_id = get_sub_group_local_id();\n"; + } for (int z = 0; z < block_size.z; ++z) { for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < block_size.x; ++x) { @@ -555,17 +540,36 @@ std::string GenerateConv( for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < block_size.x; ++x) { std::string id = std::to_string(y) + std::to_string(x); - std::string w_val = "weights_cache[" + - std::to_string(z * 4 + ch + shared_offset) + - "]"; if (use_simd_broadcast) { int simd_id = (z * 4 + ch + shared_offset) / simd_size; int thread_id = (z * 4 + ch + shared_offset) % simd_size; - w_val = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + - ", " + std::to_string(thread_id) + "u)"; + std::string w_val_x = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".x, " + + std::to_string(thread_id) + "u)"; + std::string w_val_y = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".y, " + + std::to_string(thread_id) + "u)"; + std::string w_val_z = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".z, " + + std::to_string(thread_id) + "u)"; + std::string w_val_w = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".w, " + + std::to_string(thread_id) + "u)"; + c += " r" + std::to_string(z) + id + ".x += " + w_val_x + + " * src" + id + "." + channels[ch] + ";\n"; + c += " r" + std::to_string(z) + id + ".y += " + w_val_y + + " * src" + id + "." + channels[ch] + ";\n"; + c += " r" + std::to_string(z) + id + ".z += " + w_val_z + + " * src" + id + "." + channels[ch] + ";\n"; + c += " r" + std::to_string(z) + id + ".w += " + w_val_w + + " * src" + id + "." + channels[ch] + ";\n"; + } else { + std::string w_val = "weights_cache[" + + std::to_string(z * 4 + ch + shared_offset) + + "]"; + c += " r" + std::to_string(z) + id + " += " + w_val + + " * src" + id + "." + channels[ch] + ";\n"; } - c += " r" + std::to_string(z) + id + " += " + w_val + - " * src" + id + "." 
+ channels[ch] + ";\n"; } } } @@ -608,16 +612,15 @@ std::string GenerateConv( int parts = local_mem_size / simd_size; int reminder = local_mem_size % simd_size; for (int i = 0; i < parts; ++i) { - c += " FLT4 simd_w" + std::to_string(i) + - " = filters_loc[get_sub_group_local_id() + " + + c += " FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " + std::to_string(i * simd_size) + "];\n"; } if (reminder) { c += " FLT4 simd_w" + std::to_string(parts) + ";\n"; c += " if (simd_id < " + std::to_string(reminder) + ") {\n"; c += " simd_w" + std::to_string(parts) + - " = filters_loc[get_sub_group_local_id() + " + - std::to_string(parts * simd_size) + "];\n"; + " = filters_loc[simd_id + " + std::to_string(parts * simd_size) + + "];\n"; c += " }\n"; } } else { // GLOBAL_MEM/CONSTANT_MEM diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h index a729098bded..cf182404e23 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h @@ -90,6 +90,39 @@ class ConvPowerVR : public GPUOperation { WeightsUploadType weights_upload_type; bool x_kernel_is_1; bool y_kernel_is_1; + + bool IsPrivateMemBroadcast() const { + return weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST || + weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST || + weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST || + weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST || + weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST; + } + + int GetSimdSize() const { + if (weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST) { + return 8; + } else if (weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST) { + return 16; + } else if (weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST) { + return 32; + } else if (weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST) { + return 64; + } else if (weights_upload_type == + WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST) { + return 128; + } + return 1; + } }; ConvPowerVR(const OperationDef& definition, From 0ea67f633b6272c809244925c9aa05f16adf2401 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 16 Jun 2020 15:17:40 -0700 Subject: [PATCH 0337/1390] Try to deduce job, replica and task from config.list_logical_devices() again We used to assume that job is localhost, which may not be true in a multi worker environment. list_logical_devices() isn't always safe since it may return remote devices as well, but we don't have list_local_devices() yet, and we're already using list_logical_devices() in MirorredStrategy (when user doesn't explicit pass devices), so this shouldn't make things worse. An alternative is to re-consider whether we need device_util.canonicalize at all, but there are multiple usage of it. This requires further effort. 
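As an illustration (mirroring the new test case below): once collective ops are
enabled with a cluster whose local task is /job:worker/task:0,
device_util.canonicalize("/cpu:0") resolves to
"/job:worker/replica:0/task:0/device:CPU:0", while a plain single-host eager
setup still falls back to "/job:localhost/replica:0/task:0/device:CPU:0".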
PiperOrigin-RevId: 316768135 Change-Id: Idb2653497e0c4e746a61c44b4d452ab1d052d014 --- tensorflow/python/distribute/BUILD | 8 ++++ tensorflow/python/distribute/device_util.py | 13 ++++- .../python/distribute/device_util_test.py | 48 +++++++++++++++---- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index a3655699669..e39631d634f 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -121,9 +121,17 @@ cuda_py_test( name = "device_util_test", srcs = ["device_util_test.py"], deps = [ + ":combinations", ":device_util", + ":multi_worker_test_base", + ":multi_worker_util", + "//tensorflow/core:protos_all_py", "//tensorflow/python:client_testlib", + "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:framework_ops", + "//tensorflow/python:training_server_lib", + "//tensorflow/python/eager:context", + "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/distribute/device_util.py b/tensorflow/python/distribute/device_util.py index 7f32ed39aed..0aee34e33ae 100644 --- a/tensorflow/python/distribute/device_util.py +++ b/tensorflow/python/distribute/device_util.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import context +from tensorflow.python.framework import config from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import ops @@ -55,8 +56,16 @@ def canonicalize(d, default=None): result = tf_device.DeviceSpec( replica=0, task=0, device_type="CPU", device_index=0) if ops.executing_eagerly_outside_functions(): - # The default job is localhost if eager execution is enabled - result = result.replace(job="localhost") + # Try to deduce job, replica and task in case it's in a multi worker setup. + # TODO(b/151452748): Using list_logical_devices is not always safe since it + # may return remote devices as well, but we're already doing this elsewhere. + host_cpu = tf_device.DeviceSpec.from_string( + config.list_logical_devices("CPU")[0].name) + if host_cpu.job: + result = result.make_merged_spec(host_cpu) + else: + # The default job is localhost if eager execution is enabled + result = result.replace(job="localhost") if default: # Overrides any defaults with values from the default device if given. 
result = result.make_merged_spec( diff --git a/tensorflow/python/distribute/device_util_test.py b/tensorflow/python/distribute/device_util_test.py index 2f0d7ed3b31..df53fe0288a 100644 --- a/tensorflow/python/distribute/device_util_test.py +++ b/tensorflow/python/distribute/device_util_test.py @@ -18,16 +18,27 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + +from tensorflow.core.protobuf import tensorflow_server_pb2 +from tensorflow.python.distribute import combinations from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.eager import context from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.platform import test +from tensorflow.python.training import server_lib -class DeviceUtilTest(test.TestCase): +class DeviceUtilTest(test.TestCase, parameterized.TestCase): - @test_util.run_deprecated_v1 + def setUp(self): + super(DeviceUtilTest, self).setUp() + context._reset_context() # pylint: disable=protected-access + + @combinations.generate( + combinations.combine(mode="graph") + ) def testCurrentDeviceWithGlobalGraph(self): with ops.device("/cpu:0"): self.assertEqual(device_util.current(), "/device:CPU:0") @@ -51,11 +62,16 @@ class DeviceUtilTest(test.TestCase): self.assertEqual(device_util.current(), "/job:localhost/replica:0/task:0/device:CPU:0") - @test_util.run_deprecated_v1 - def testCanonicalizeWithoutDefaultDevice(self): - self.assertEqual( - device_util.canonicalize("/cpu:0"), - "/replica:0/task:0/device:CPU:0") + @combinations.generate(combinations.combine(mode=["graph", "eager"])) + def testCanonicalizeWithoutDefaultDevice(self, mode): + if mode == "graph": + self.assertEqual( + device_util.canonicalize("/cpu:0"), + "/replica:0/task:0/device:CPU:0") + else: + self.assertEqual( + device_util.canonicalize("/cpu:0"), + "/job:localhost/replica:0/task:0/device:CPU:0") self.assertEqual( device_util.canonicalize("/job:worker/cpu:0"), "/job:worker/replica:0/task:0/device:CPU:0") @@ -63,6 +79,22 @@ class DeviceUtilTest(test.TestCase): device_util.canonicalize("/job:worker/task:1/cpu:0"), "/job:worker/replica:0/task:1/device:CPU:0") + @combinations.generate(combinations.combine(mode=["eager"])) + def testCanonicalizeWithoutDefaultDeviceCollectiveEnabled(self): + cluster_spec = server_lib.ClusterSpec( + multi_worker_test_base.create_cluster_spec( + has_chief=False, num_workers=1, num_ps=0, has_eval=False)) + server_def = tensorflow_server_pb2.ServerDef( + cluster=cluster_spec.as_cluster_def(), + job_name="worker", + task_index=0, + protocol="grpc", + port=0) + context.context().enable_collective_ops(server_def) + self.assertEqual( + device_util.canonicalize("/cpu:0"), + "/job:worker/replica:0/task:0/device:CPU:0") + def testCanonicalizeWithDefaultDevice(self): self.assertEqual( device_util.canonicalize("/job:worker/task:1/cpu:0", default="/gpu:0"), From 31e83f2f68b42025939113f5778a528d4ebd0417 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 16 Jun 2020 15:27:59 -0700 Subject: [PATCH 0338/1390] Fix cross_device_ops_test with multi GPU The test doesn't limit the number of GPUs when creating CollectiveAllReduceStrategy, which results in mismatch in the number of devices. 
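For instance, on a multi-GPU host the bare
CollectiveAllReduceStrategy(communication=...) constructor initializes over
every local GPU, while the test pairs the strategy with an explicit `devices`
list; constructing it via _from_local_devices(devices, communication=...)
(as done below) keeps the strategy's device set equal to that list.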
PiperOrigin-RevId: 316769983 Change-Id: I50c11107a99348162b37615f209fd6fa6ee659d2 --- .../python/distribute/collective_all_reduce_strategy.py | 7 +++++-- tensorflow/python/distribute/cross_device_ops_test.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 68cc421c21b..f73474c4af4 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -112,9 +112,12 @@ class CollectiveAllReduceStrategy(distribute_lib.Strategy): "num_replicas_per_worker").set(self.extended._num_gpus_per_worker) @classmethod - def _from_local_devices(cls, devices): + def _from_local_devices( + cls, + devices, + communication=cross_device_ops_lib.CollectiveCommunication.AUTO): """A convenience method to create an object with a list of devices.""" - obj = cls() + obj = cls(communication) obj.extended._initialize_local(TFConfigClusterResolver(), devices=devices) # pylint: disable=protected-access return obj diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index 4b6943e8971..d54e16c2748 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -521,10 +521,13 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, devices = ["/device:CPU:0"] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - communication=communication) + strategy = ( + collective_all_reduce_strategy.CollectiveAllReduceStrategy + ._from_local_devices(devices, communication=communication)) # pylint: disable=protected-access strategy.extended._collective_keys = collective_keys strategy.extended._cross_device_ops._collective_keys = collective_keys + strategy.extended._host_cross_device_ops._collective_keys = ( + collective_keys) return strategy, devices, "" else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( From ade14486097e5ab81839f42c537db626d0bf5e88 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 7 May 2020 18:17:52 +0200 Subject: [PATCH 0339/1390] Test ConvertSquaredDifference in dynamic shape mode Additionally the runtime error handling is improved: - Do not check runtime error details in TF-TRT op converter test - Report an error if invalid dimension is found --- .../tf2tensorrt/convert/convert_nodes_test.cc | 131 ++++++++---------- .../tf2tensorrt/utils/trt_engine_utils.cc | 8 ++ 2 files changed, 66 insertions(+), 73 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index aeadbdbf012..f5e61dc0061 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1803,7 +1803,9 @@ class ParameterizedOpConverterTestBase const int batch_size = input_data_[0].tensor.shape().dim_size(0); Status stat = OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); - ASSERT_EQ(expected_runtime_status, stat); + ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) + << "expected status: " << expected_runtime_status + << ", actual status: " << stat; if (expected_runtime_status.ok() && stat.ok()) { for (int i = 0; i < n_output; i++) { // Check the shape of the actual output tensors @@ 
-6440,87 +6442,70 @@ NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { return squared_diff.operation.node()->def(); } -template -void TestConvertSquaredDifference(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - struct TestParams { - std::vector dims_x; - std::vector dims_y; - std::vector value_x; - std::vector value_y; - std::vector expected_output_dims; - std::vector expected_output; - }; - - const std::vector common_input = InitTestVector(6); - std::vector params = { - { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 2, 3}, - /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, -1, 3, 0, 10, -7}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 4, 1, 9, 36, 144}), - }, - { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 1, 3}, - /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, 1, 2}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 0, 0, 9, 9, 9}), - }, - }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetSquaredDifferenceNodeDef(dtype); - test->AddTestTensor("x", params[i].dims_x, 1, TfDataTypeToTrt(dtype)); - test->AddTestTensor("y", params[i].dims_y, 1, TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"x", test->AsTensor(params[i].value_x)}, - {"y", test->AsTensor(params[i].value_y)}}; - DataVec output_data{ - {"my_squared_diff", - test->ConstructTensor(params[i].expected_output.size())}}; - TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertSquaredDifference) { +TEST_P(OpConverterTest2, ConvertSquaredDifference) { { // Input is a weight, should fail. Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestTensor("y", {1, 2, 3}); + AddTestTensor("y", {1, 1, 2, 3}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "The input \"x\" for SquaredDifference must be " "a tensor, at my_squared_diff"); } - { - // Shapes are not broadcastable, should fail. - Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestTensor("x", {2, 3}); - AddTestTensor("y", {7, 5}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Infeasible broadcast scheme"); - } - TestConvertSquaredDifference(this); - TestConvertSquaredDifference(this); + struct TestParams { + std::vector dims_x; + std::vector dims_y; + std::vector value_x; + std::vector value_y; + std::vector expected_output_dims; + std::vector expected_output; + Status status; + Status runtime_status; + }; + + const std::vector common_input = InitTestVector(6); + std::vector params = { + {/*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 7, 5}, + /*value_x=*/common_input, + /*value_y=*/std::vector(7 * 5, 0), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/common_input, + trt_mode == TrtTestMode::kDynamicShape + ? Status::OK() + : errors::InvalidArgument("Infeasible broadcast scheme"), + errors::Internal( + "Binding index out of range. 
This can happen if profile is not set, " + "or the network is invalid for the current profile.")}, + { + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 2, 3}, + /*value_x=*/common_input, + /*value_y=*/{0, -1, 3, 0, 10, -7}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 4, 1, 9, 36, 144}, + }, + { + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 1, 3}, + /*value_x=*/common_input, + /*value_y=*/{0, 1, 2}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 0, 0, 9, 9, 9}, + }, + }; + + for (auto p : params) { + Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + AddTestTensor("x", p.dims_x, p.value_x); + AddTestTensor("y", p.dims_y, p.value_y); + TestOpConverter("my_squared_diff", node_def, p.expected_output_dims, + p.status, p.runtime_status, + ElementsAreArray(p.expected_output)); + } } #if IS_TRT_VERSION_GE(6, 0, 0, 0) diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc index 213c1732e59..ed997b267b1 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -46,6 +46,14 @@ Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, // Get dims from context instead of engine in explicit batch mode because // the engine might have dynamic shapes. dims = execution_context->getBindingDimensions(binding_index); + if (dims.nbDims == -1) { + // Invalid dimensions. There can be multiple reasons for this. If we have + // incompatible input shapes (network invalid for the current profile) + // that can trigger this error. + return errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile."); + } #else return errors::Internal( "Explicit batch mode is only supported with TensorRT 6 and above."); From 6b126156d962547f4b4f66cf19aabfe14d75dd88 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jun 2020 15:51:18 -0700 Subject: [PATCH 0340/1390] Fix clang-tidy warnings in `logging.{cc,h}`. 
PiperOrigin-RevId: 316773944 Change-Id: Ie35223f385d0e9038e0a0ea223825b5ea78f4769 --- tensorflow/core/platform/default/logging.cc | 8 +++--- tensorflow/core/platform/default/logging.h | 30 +++++++++------------ 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index 9bf809131a9..6d2af607748 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -303,7 +303,7 @@ void MakeCheckOpValueString(std::ostream* os, const char& v) { if (v >= 32 && v <= 126) { (*os) << "'" << v << "'"; } else { - (*os) << "char value " << static_cast(v); + (*os) << "char value " << static_cast(v); } } @@ -312,7 +312,7 @@ void MakeCheckOpValueString(std::ostream* os, const signed char& v) { if (v >= 32 && v <= 126) { (*os) << "'" << v << "'"; } else { - (*os) << "signed char value " << static_cast(v); + (*os) << "signed char value " << static_cast(v); } } @@ -321,13 +321,13 @@ void MakeCheckOpValueString(std::ostream* os, const unsigned char& v) { if (v >= 32 && v <= 126) { (*os) << "'" << v << "'"; } else { - (*os) << "unsigned char value " << static_cast(v); + (*os) << "unsigned char value " << static_cast(v); } } #if LANG_CXX11 template <> -void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& p) { +void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& v) { (*os) << "nullptr"; } #endif diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index 8b171a15f7c..f60deb43683 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -265,16 +265,12 @@ inline const T& GetReferenceableValue(const T& t) { inline char GetReferenceableValue(char t) { return t; } inline unsigned char GetReferenceableValue(unsigned char t) { return t; } inline signed char GetReferenceableValue(signed char t) { return t; } -inline short GetReferenceableValue(short t) { return t; } -inline unsigned short GetReferenceableValue(unsigned short t) { return t; } +inline int16 GetReferenceableValue(int16 t) { return t; } +inline uint16 GetReferenceableValue(uint16 t) { return t; } inline int GetReferenceableValue(int t) { return t; } inline unsigned int GetReferenceableValue(unsigned int t) { return t; } -inline long GetReferenceableValue(long t) { return t; } -inline unsigned long GetReferenceableValue(unsigned long t) { return t; } -inline long long GetReferenceableValue(long long t) { return t; } -inline unsigned long long GetReferenceableValue(unsigned long long t) { - return t; -} +inline int64 GetReferenceableValue(int64 t) { return t; } +inline uint64 GetReferenceableValue(uint64 t) { return t; } // This formats a value for a failing CHECK_XX statement. Ordinarily, // it uses the definition for operator<<, with a few special cases below. @@ -295,16 +291,16 @@ void MakeCheckOpValueString(std::ostream* os, const unsigned char& v); #if LANG_CXX11 // We need an explicit specialization for std::nullptr_t. template <> -void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& p); +void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& v); #endif // A container for a string pointer which can be evaluated to a bool - // true iff the pointer is non-NULL. 
struct CheckOpString { - CheckOpString(string* str) : str_(str) {} + explicit CheckOpString(string* str) : str_(str) {} // No destructor: if str_ is non-NULL, we're about to LOG(FATAL), // so there's no point in cleaning up str_. - operator bool() const { return TF_PREDICT_FALSE(str_ != NULL); } + explicit operator bool() const { return TF_PREDICT_FALSE(str_ != nullptr); } string* str_; }; @@ -393,12 +389,12 @@ TF_DEFINE_CHECK_OP_IMPL(Check_GT, >) // In optimized mode, use CheckOpString to hint to compiler that // the while condition is unlikely. -#define CHECK_OP_LOG(name, op, val1, val2) \ - while (::tensorflow::internal::CheckOpString _result = \ - ::tensorflow::internal::name##Impl( \ - ::tensorflow::internal::GetReferenceableValue(val1), \ - ::tensorflow::internal::GetReferenceableValue(val2), \ - #val1 " " #op " " #val2)) \ +#define CHECK_OP_LOG(name, op, val1, val2) \ + while (::tensorflow::internal::CheckOpString _result{ \ + ::tensorflow::internal::name##Impl( \ + ::tensorflow::internal::GetReferenceableValue(val1), \ + ::tensorflow::internal::GetReferenceableValue(val2), \ + #val1 " " #op " " #val2)}) \ ::tensorflow::internal::LogMessageFatal(__FILE__, __LINE__) << *(_result.str_) #define CHECK_OP(name, op, val1, val2) CHECK_OP_LOG(name, op, val1, val2) From a6945b9b0f0c43b6e6557c3c6a441dad5f98a866 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 16:02:34 -0700 Subject: [PATCH 0341/1390] SpaceToDepth converted to new style. PiperOrigin-RevId: 316775897 Change-Id: I2715cadbf112dffc93b2b45570d2220444af31a4 --- .../gpu/cl/kernels/space_to_depth.cc | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc index 34227f6b887..439b7d0fc15 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc @@ -27,28 +27,33 @@ namespace gpu { namespace cl { namespace { -std::string GetSpaceToDepthCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GetSpaceToDepthCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + args->AddInt("block_size"); + std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size,\n"; - c += " int4 dst_size,\n"; - c += " int src_channels,\n"; - c += " int block_size) {\n"; - c += " int X = get_global_id(0);\n"; + c += "$0) {\n"; + if (op_def.IsBatchSupported()) { + c += " int linear_id = get_global_id(0);\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " args.dst_tensor.SetBatchRef(B);\n"; + c += " args.src_tensor.SetBatchRef(B);\n"; + } else { + c += " int X = get_global_id(0);\n"; + } c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - 
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; c += " FLT tmp[4];\n"; c += " tmp[0] = (FLT)(0.0f);\n"; c += " tmp[1] = (FLT)(0.0f);\n"; @@ -56,19 +61,17 @@ std::string GetSpaceToDepthCode( c += " tmp[3] = (FLT)(0.0f);\n"; c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_c = 4 * Z + i;\n"; - c += " int block_id = dst_c / src_channels;\n"; - c += " int src_x = X * block_size + block_id % block_size;\n"; - c += " int src_y = Y * block_size + block_id / block_size;\n"; - c += " int src_c = dst_c % src_channels;\n"; + c += " int block_id = dst_c / args.src_tensor.Channels();\n"; + c += " int src_x = X * args.block_size + block_id % args.block_size;\n"; + c += " int src_y = Y * args.block_size + block_id / args.block_size;\n"; + c += " int src_c = dst_c % args.src_tensor.Channels();\n"; c += " int src_z = src_c / 4;\n"; - c += " FLT4 t = " + src_tensor.ReadWHS("src_x", "src_y", "src_z") + ";\n"; + c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z);\n"; c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; c += " tmp[i] = t_ar[src_c % 4];\n"; c += " }\n"; c += " FLT4 result = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);\n"; - const LinkingContext context{"result", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X", "Y", "Z"); + c += " args.dst_tensor.Write(result, X, Y, Z);\n"; c += "}\n"; return c; } @@ -92,21 +95,24 @@ SpaceToDepth& SpaceToDepth::operator=(SpaceToDepth&& operation) { } absl::Status SpaceToDepth::Compile(const CreationContext& creation_context) { - const auto code = GetSpaceToDepthCode(definition_, linked_operations_); + std::string code = GetSpaceToDepthCode(definition_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status SpaceToDepth::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels())); - return kernel_.SetBytesAuto(attr_.block_size); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(args_.SetInt("block_size", attr_.block_size)); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 SpaceToDepth::GetGridSize() const { From fad7b3a33b476ed2b4d7b6a901b8dc5ab02f38b0 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Tue, 16 Jun 2020 16:12:25 -0700 Subject: [PATCH 0342/1390] [XLA] More tweaks and fixes to memory space assignment: - When sorting the BufferIntervals, now we also correctly include aliased HloValues into the benefit computation. 
- To avoid double counting, we now skip over while and conditional HLOs when using cost analysis since we already look inside the called computations. - When trying to find a preferred alternate memory offset, we now find the latest use that can be contiguously allocated (determined by the max overlap ratio heuristic) instead of trying to find an allocation that is as long-living as possible. This should improve fragmentation slightly. PiperOrigin-RevId: 316777648 Change-Id: If9d40a79283b644db975ebe62b1bb6c545fea89d --- .../xla/service/memory_space_assignment.cc | 113 +++++++++++++----- .../xla/service/memory_space_assignment.h | 35 ++++-- .../service/memory_space_assignment_test.cc | 16 +-- 3 files changed, 109 insertions(+), 55 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 388a2e18f38..ea1438380a6 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -31,6 +31,22 @@ const int kWhileExecutionCount = 5; } // namespace +/*static*/ StatusOr> +MemorySpaceAssignmentCostAnalysis::Create( + const HloCostAnalysis& cost_analysis, + float async_copy_bandwidth_bytes_per_second, + float alternate_mem_bandwidth_bytes_per_second, const HloModule& module) { + TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(&module)); + TF_ASSIGN_OR_RETURN(auto hlo_live_range, + HloLiveRange::Run(module.schedule(), *alias_analysis, + module.entry_computation())); + auto call_graph = CallGraph::Build(&module); + return absl::WrapUnique(new MemorySpaceAssignmentCostAnalysis( + cost_analysis, async_copy_bandwidth_bytes_per_second, + alternate_mem_bandwidth_bytes_per_second, std::move(alias_analysis), + std::move(hlo_live_range), std::move(call_graph))); +} + float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( const HloInstruction& instruction, float elapsed_time_due_to_alternate_mem, MemorySpaceAssignmentCostAnalysis::Cache* cache) const { @@ -74,19 +90,32 @@ float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( /*operand_in_alternate_mem=*/{}, /*output_in_alternate_mem=*/true), cache); - for (const HloUse& use : interval.buffer->uses()) { - float use_alternate_mem_benefit = GetAlternateMemoryBenefit( - *use.instruction, - GetInstructionElapsedDueToMemory(*use.instruction, use.operand_number), - cache); - // If the benefit is positive (memory bound), add it to this buffer's - // benefit. If the benefit is negative (compute bound), calculate the - // maximum. - if (alternate_mem_benefit > 0 && use_alternate_mem_benefit > 0) { - alternate_mem_benefit += use_alternate_mem_benefit; - } else { - alternate_mem_benefit = - std::max(alternate_mem_benefit, use_alternate_mem_benefit); + for (const HloBuffer* buffer : alias_analysis_->ComputeBuffersAt( + interval.buffer->defining_position().instruction, + interval.buffer->defining_position().index)) { + for (const HloValue* value : buffer->values()) { + for (const HloUse& use : value->uses()) { + // We look inside the called computations of while and conditional, so + // don't use the benefit of while and conditional directly. 
+ if (use.instruction->opcode() == HloOpcode::kWhile || + use.instruction->opcode() == HloOpcode::kConditional) { + continue; + } + float use_alternate_mem_benefit = + GetAlternateMemoryBenefit(*use.instruction, + GetInstructionElapsedDueToMemory( + *use.instruction, use.operand_number), + cache); + // If the benefit is positive (memory bound), add it to this buffer's + // benefit. If the benefit is negative (compute bound), calculate the + // maximum. + if (alternate_mem_benefit > 0 && use_alternate_mem_benefit > 0) { + alternate_mem_benefit += use_alternate_mem_benefit; + } else { + alternate_mem_benefit = + std::max(alternate_mem_benefit, use_alternate_mem_benefit); + } + } } } @@ -95,17 +124,9 @@ float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( float alternate_mem_slowdown = GetInstructionElapsedDueToMemorySlowdown(interval.size); - // Scale the slowdown based on the time of this buffer. We would want earlier - // buffers have lower slowdown values, because they are less likely to overlap - // with other HLOs. - // TODO(yuemmawang): We may want a piecewise function, where a lower slowdown - // for early HLOs, and full slowdown for mid-to-late HLOs. - // TODO(yuemmawang): Further in a smarter way, we want buffers overlapped with - // more HLOs have higher slowdown, and vice versa. - float scale = interval.start * 1.0 / GetScheduleEndTime(); - alternate_mem_slowdown *= scale; - - return alternate_mem_benefit - alternate_mem_slowdown; + // Divide by the size of the buffer to prioritize smaller buffers that will + // give the largest alternate memory benefit. + return (alternate_mem_benefit - alternate_mem_slowdown) / interval.size; } int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( @@ -113,7 +134,7 @@ int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( int nest_level = 0; const HloComputation* computation = instruction->parent(); while (!computation->IsEntryComputation()) { - auto node = call_graph_.GetNode(computation); + auto node = call_graph_->GetNode(computation); auto callsites = node.caller_callsites(); CHECK_EQ(callsites.size(), 1) << "The module is not flattened!"; auto callsite = callsites[0]; @@ -195,7 +216,7 @@ float MemorySpaceAssignmentCostAnalysis::GetAsyncCopyElapsed( } int64 MemorySpaceAssignmentCostAnalysis::GetScheduleEndTime() const { - return hlo_live_range_.schedule_end_time(); + return hlo_live_range_->schedule_end_time(); } bool InstructionCountPrefetchIntervalPicker::CanAllocateInAlternateMemoryNoCopy( @@ -253,6 +274,13 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( std::vector instructions_elapsed_time(instruction_schedule_->size(), 0.0); for (const auto& instruction_and_logical_time : *instruction_schedule_) { + // To avoid double counting, don't include the elapsed time of while and + // conditional HLOs. + const HloInstruction* instruction = instruction_and_logical_time.first; + if (instruction->opcode() == HloOpcode::kWhile || + instruction->opcode() == HloOpcode::kConditional) { + continue; + } float elapsed_time = cost_analysis_.cost_analysis().optimal_seconds( *instruction_and_logical_time.first); int64 logical_time = instruction_and_logical_time.second; @@ -1937,17 +1965,38 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( BufferInterval* alternate_mem_interval) const { int64 end_time = request.end_time; if (!preferred_offset) { + // First find the earliest use that is the same or later than the end time. 
+ const auto& uses = request.allocation_value->uses(); + auto use_it = uses.begin(); + for (; use_it->time < end_time; ++use_it) { + } + CHECK(use_it != uses.end()); + int64 earliest_use = use_it->time; + + // Then find the latest use that can be allocated contiguously without + // copies. + const Shape& shape = request.allocation_value->defining_position().shape(); + for (; + (use_it + 1) != uses.end() && + options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + shape, use_it->time, (use_it + 1)->time); + ++use_it) { + } + CHECK(use_it != uses.end()); + int64 latest_contiguous_use = use_it->time; + // Find a chunk that's as long living as possible iterating in reverse over // the use times. - for (auto use_it = request.allocation_value->uses().rbegin(); - use_it != request.allocation_value->uses().rend() && - use_it->time >= end_time; - ++use_it) { + for (; use_it >= uses.begin() && use_it->time >= end_time; --use_it) { alternate_mem_interval->end = use_it->time; ChunkCandidate chunk_candidate = FindChunkCandidate(*alternate_mem_interval); if (chunk_candidate.heap_size <= available_heap_size()) { alternate_mem_interval->end = end_time; + VLOG(3) << "FindBestChunkCandidate earliest use = " << earliest_use + << ", latest contiguous use = " << latest_contiguous_use + << ", use with available mem = " << use_it->time + << ", offset = " << chunk_candidate.chunk.offset; return chunk_candidate; } } @@ -2005,8 +2054,8 @@ MemorySpaceAssignment::CalculateAsyncCopyStats() const { MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis, MemorySpaceAssignmentCostAnalysis::Cache* cache) { - return [cost_analysis, cache](const BufferInterval& x, - const BufferInterval& y) { + return [&cost_analysis, cache](const BufferInterval& x, + const BufferInterval& y) { float x_memory_boundedness = cost_analysis.GetMemoryBoundedness(x, cache); float y_memory_boundedness = cost_analysis.GetMemoryBoundedness(y, cache); if (x_memory_boundedness != y_memory_boundedness) { diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index f9e5738d17e..5e34f755fe9 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -84,18 +84,10 @@ class MemorySpaceAssignmentCostAnalysis { absl::flat_hash_map while_nest_multiplier; }; - MemorySpaceAssignmentCostAnalysis( + static StatusOr> Create( const HloCostAnalysis& cost_analysis, float async_copy_bandwidth_bytes_per_second, - float alternate_mem_bandwidth_bytes_per_second, - const HloLiveRange& hlo_live_range, const CallGraph& call_graph) - : cost_analysis_(cost_analysis), - async_copy_bandwidth_bytes_per_second_( - async_copy_bandwidth_bytes_per_second), - alternate_mem_bandwidth_bytes_per_second_( - alternate_mem_bandwidth_bytes_per_second), - hlo_live_range_(hlo_live_range), - call_graph_(call_graph) {} + float alternate_mem_bandwidth_bytes_per_second, const HloModule& module); const HloCostAnalysis& cost_analysis() const { return cost_analysis_; } @@ -153,14 +145,31 @@ class MemorySpaceAssignmentCostAnalysis { // 0 means it is not in a while loop. 
int CalculateWhileLoopNestLevel(const HloInstruction* instruction) const; - const HloLiveRange& hlo_live_range() const { return hlo_live_range_; } + const HloLiveRange& hlo_live_range() const { return *hlo_live_range_; } private: + MemorySpaceAssignmentCostAnalysis( + const HloCostAnalysis& cost_analysis, + float async_copy_bandwidth_bytes_per_second, + float alternate_mem_bandwidth_bytes_per_second, + std::unique_ptr alias_analysis, + std::unique_ptr hlo_live_range, + std::unique_ptr call_graph) + : cost_analysis_(cost_analysis), + async_copy_bandwidth_bytes_per_second_( + async_copy_bandwidth_bytes_per_second), + alternate_mem_bandwidth_bytes_per_second_( + alternate_mem_bandwidth_bytes_per_second), + alias_analysis_(std::move(alias_analysis)), + hlo_live_range_(std::move(hlo_live_range)), + call_graph_(std::move(call_graph)) {} + const HloCostAnalysis& cost_analysis_; float async_copy_bandwidth_bytes_per_second_; float alternate_mem_bandwidth_bytes_per_second_; - const HloLiveRange& hlo_live_range_; - const CallGraph& call_graph_; + std::unique_ptr alias_analysis_; + std::unique_ptr hlo_live_range_; + std::unique_ptr call_graph_; }; // Abstract base class that memory space assignment uses to pick prefetch diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 032a3f53479..398f07d4a40 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -53,22 +53,18 @@ class MemorySpaceAssignmentTest : public HloTestBase, TF_CHECK_OK(computation->Accept(&hlo_cost_analysis)); } auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie(); - std::unique_ptr hlo_live_range = - HloLiveRange::Run(module->schedule(), *alias_analysis, - module->entry_computation()) - .ValueOrDie(); - std::unique_ptr call_graph = CallGraph::Build(module); - MemorySpaceAssignmentCostAnalysis cost_analysis( - hlo_cost_analysis, kAsyncCopyBandwidth, kAlternateMemBandwidth, - *hlo_live_range, *call_graph); + auto cost_analysis = MemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, kAsyncCopyBandwidth, + kAlternateMemBandwidth, *module) + .ValueOrDie(); CostAnalysisPrefetchIntervalPicker prefetch_interval_picker( CostAnalysisPrefetchIntervalPicker( - cost_analysis, /*min_async_copy_to_overlap_ratio=*/0.8, + *cost_analysis, /*min_async_copy_to_overlap_ratio=*/0.8, /*max_async_copy_to_overlap_ratio=*/10.0)); return AssignMemorySpace( module, /*max_outstanding_async_copies=*/-1, MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( - cost_analysis, &cache_), + *cost_analysis, &cache_), &prefetch_interval_picker); } From 89a1d3f4e9dcea93bff6cdcd469e5e10c26376bc Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Tue, 16 Jun 2020 16:24:49 -0700 Subject: [PATCH 0343/1390] Implement a pass for lifting variables. This pass creates GlobalTensorOp for each variable from function arguments and converts the function arguments to the corresponding saved model arguments. This change fixes the Kokoro builds by avoiding the common_runtime dependency. 
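The patch below exposes a free function, LiftVariables(ModuleOp, Session*), plus a LiftVariablesPass wrapper declared in lift_variables_pass.h. A minimal usage sketch, assuming a module already imported from a SavedModel and a live Session; the helper function name here is hypothetical and only the LiftVariables / CreateLiftVariablesPass calls come from the patch itself.

#include "mlir/IR/Module.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h"
#include "tensorflow/core/public/session.h"

// Hypothetical helper, for illustration only.
mlir::LogicalResult LiftSavedModelVariables(mlir::ModuleOp module,
                                            tensorflow::Session* session) {
  // Reads each resource argument's value from `session`, materializes a
  // tf_saved_model.global_tensor for it, and rewrites the argument to carry
  // a tf_saved_model.bound_input attribute.
  return mlir::tf_saved_model::LiftVariables(module, session);
}

// In a pass pipeline, the same logic can be scheduled via
//   pm.addPass(mlir::tf_saved_model::CreateLiftVariablesPass(session));
// as declared in lift_variables_pass.h.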
PiperOrigin-RevId: 316779884 Change-Id: I07c83bf12486748e4350717d94928a75bad92342 --- tensorflow/compiler/mlir/tensorflow/BUILD | 69 +++++++ .../tests/tf_saved_model_lift_variables.mlir | 61 ++++++ ..._model_lift_variables_invalid_session.mlir | 33 ++++ .../tensorflow/transforms/lift_variables.cc | 183 ++++++++++++++++++ .../tensorflow/transforms/lift_variables.h | 33 ++++ .../transforms/lift_variables_pass.h | 57 ++++++ .../transforms/lift_variables_test_pass.h | 146 ++++++++++++++ .../lift_variables_test_pass_registration.cc | 32 +++ 8 files changed, 614 insertions(+) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables_invalid_session.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_pass.h create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass_registration.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 9e5688cd230..904ccb7e820 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -397,6 +397,73 @@ cc_library( ], ) +cc_library( + name = "lift_variables_lib", + srcs = [ + "transforms/lift_variables.cc", + ], + hdrs = [ + "transforms/lift_variables.h", + ], + deps = [ + ":convert_tensor", + ":tensorflow", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:threadpool_options", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +cc_library( + name = "lift_variables_pass", + hdrs = [ + "transforms/lift_variables_pass.h", + ], + deps = [ + ":lift_variables_lib", + ":tensorflow", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +cc_library( + name = "lift_variables_test_pass", + hdrs = [ + "transforms/lift_variables_test_pass.h", + ], + deps = [ + ":lift_variables_lib", + ":tensorflow", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:threadpool_options", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + cc_library( name = "tensorflow_passes", srcs = [ @@ -520,9 +587,11 @@ cc_library( cc_library( name = "tensorflow_test_passes", srcs = [ + "transforms/lift_variables_test_pass_registration.cc", "transforms/lower_tf_pass.cc", ], deps = [ + ":lift_variables_test_pass", ":lower_tf_lib", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir 
b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir new file mode 100644 index 00000000000..0c04a0d738c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir @@ -0,0 +1,61 @@ +// RUN: tf-opt -verify-diagnostics -tf-saved-model-lift-variables-test -split-input-file %s | FileCheck %s --dump-input=fail + +module attributes {tf_saved_model.semantics} { + + // Test case: Freezing VarHandleOp ops. + + func @serving_default(%arg0: tensor>> {tf.resource_name = "dense/kernel"}, %arg1: tensor>> {tf.resource_name = "dense/bias"}) -> (tensor<100x50xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "tf.VarHandleOp"() {_class = ["loc:@dense/kernel"], allowed_devices = [], container = "", device = "", shared_name = "dense/kernel"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor>>) -> tensor<100x50xf32> + %2 = "tf.VarHandleOp"() {_class = ["loc:@dense/bias"], allowed_devices = [], container = "", device = "", shared_name = "dense/bias"} : () -> tensor>> + %3 = "tf.ReadVariableOp"(%2) {device = ""} : (tensor>>) -> tensor<50xf32> + %4 = "tf.Add"(%1, %3) {device = ""} : (tensor<100x50xf32>, tensor<50xf32>) -> tensor<100x50xf32> + return %4 : tensor<100x50xf32> + } + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/kernel" + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/bias" + // CHECK: func @serving_default( + // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @"dense/kernel"}, + // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @"dense/bias"}) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // Test case: Freezing shared VarHandleOp ops. 
+ + func @f(%arg0: tensor>> {tf.resource_name = "dense/kernel"}, %arg1: tensor>> {tf.resource_name = "dense/bias"}) -> (tensor<100x50xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["f"]} { + %0 = "tf.VarHandleOp"() {_class = ["loc:@dense/kernel"], allowed_devices = [], container = "", device = "", shared_name = "dense/kernel"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor>>) -> tensor<100x50xf32> + %2 = "tf.VarHandleOp"() {_class = ["loc:@dense/bias"], allowed_devices = [], container = "", device = "", shared_name = "dense/bias"} : () -> tensor>> + %3 = "tf.ReadVariableOp"(%2) {device = ""} : (tensor>>) -> tensor<50xf32> + %4 = "tf.Add"(%1, %3) {device = ""} : (tensor<100x50xf32>, tensor<50xf32>) -> tensor<100x50xf32> + return %4 : tensor<100x50xf32> + } + + func @f2(%arg0: tensor>> {tf.resource_name = "dense/kernel"}, %arg1: tensor>> {tf.resource_name = "dense/bias"}) -> (tensor<100x50xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["f2"]} { + %0 = "tf.VarHandleOp"() {_class = ["loc:@dense/kernel"], allowed_devices = [], container = "", device = "", shared_name = "dense/kernel"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor>>) -> tensor<100x50xf32> + %2 = "tf.VarHandleOp"() {_class = ["loc:@dense/bias"], allowed_devices = [], container = "", device = "", shared_name = "dense/bias"} : () -> tensor>> + %3 = "tf.ReadVariableOp"(%2) {device = ""} : (tensor>>) -> tensor<50xf32> + %4 = "tf.Add"(%1, %3) {device = ""} : (tensor<100x50xf32>, tensor<50xf32>) -> tensor<100x50xf32> + return %4 : tensor<100x50xf32> + } + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/kernel" + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/bias" + // CHECK: func @f( + // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @"dense/kernel"}, + // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @"dense/bias"}) + + // CHECK: func @f2( + // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @"dense/kernel"}, + // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @"dense/bias"}) +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables_invalid_session.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables_invalid_session.mlir new file mode 100644 index 00000000000..17244d8481a --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables_invalid_session.mlir @@ -0,0 +1,33 @@ +// RUN: tf-opt -verify-diagnostics -tf-saved-model-lift-variables-invalid-session-test -split-input-file %s | FileCheck %s --dump-input=fail + +// Test case: Invalid session. 
+// expected-error @+1 {{'module' op no session provided}} +module attributes {tf_saved_model.semantics} { + + func @serving_default(%arg0: tensor>> {tf.resource_name = "dense/kernel"}, %arg1: tensor>> {tf.resource_name = "dense/bias"}) -> (tensor<100x50xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "tf.VarHandleOp"() {_class = ["loc:@dense/kernel"], allowed_devices = [], container = "", device = "", shared_name = "dense/kernel"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor>>) -> tensor<100x50xf32> + %2 = "tf.VarHandleOp"() {_class = ["loc:@dense/bias"], allowed_devices = [], container = "", device = "", shared_name = "dense/bias"} : () -> tensor>> + %3 = "tf.ReadVariableOp"(%2) {device = ""} : (tensor>>) -> tensor<50xf32> + %4 = "tf.Add"(%1, %3) {device = ""} : (tensor<100x50xf32>, tensor<50xf32>) -> tensor<100x50xf32> + return %4 : tensor<100x50xf32> + } +} + +// ----- + +// Test case: No errors on no resource arguments. +module attributes {tf_saved_model.semantics} { + + // CHECK-LABEL: @serving_default + func @serving_default() -> (tensor<100x50xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "tf.VarHandleOp"() {_class = ["loc:@dense/kernel"], allowed_devices = [], container = "", device = "", shared_name = "dense/kernel"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor>>) -> tensor<100x50xf32> + %2 = "tf.VarHandleOp"() {_class = ["loc:@dense/bias"], allowed_devices = [], container = "", device = "", shared_name = "dense/bias"} : () -> tensor>> + %3 = "tf.ReadVariableOp"(%2) {device = ""} : (tensor>>) -> tensor<50xf32> + %4 = "tf.Add"(%1, %3) {device = ""} : (tensor<100x50xf32>, tensor<50xf32>) -> tensor<100x50xf32> + return %4 : tensor<100x50xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc new file mode 100644 index 00000000000..6686b340be9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc @@ -0,0 +1,183 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/UseDefLists.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/threadpool_options.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +using llvm::SmallSet; +using ::tensorflow::Device; +using ::tensorflow::DeviceMgr; +using ::tensorflow::mutex_lock; +using ::tensorflow::ResourceHandle; +using ::tensorflow::Session; +using ::tensorflow::Status; +using ::tensorflow::StatusOr; +using ::tensorflow::Tensor; +using ::tensorflow::Var; + +namespace { + +constexpr char kResourceNameArgAttr[] = "tf.resource_name"; +constexpr char kSavedModelArgAttr[] = "tf_saved_model.bound_input"; + +LogicalResult LiftVariablesFromSession( + ModuleOp module, Session* session, + const SmallSet& resource_names) { + OpBuilder builder(module.getBodyRegion()); + MLIRContext* context = module.getContext(); + + if (!session) return module.emitOpError() << "no session provided"; + + // Read all resource variables from the session. + std::vector variable_names; + variable_names.reserve(resource_names.size()); + for (StringRef name : resource_names) variable_names.push_back(name.str()); + + std::vector resource_tensors; + Status status = session->Run( + /*inputs=*/{}, variable_names, + /*target_node_names=*/{}, &resource_tensors); + if (!status.ok()) { + return module.emitOpError() + << "failed to run the provided session: " << status.error_message(); + } + + const DeviceMgr* device_manager; + if (!(session->LocalDeviceManager(&device_manager).ok())) { + return module.emitOpError() << "failed to get local device manager"; + } + + // Read all underlying tensors of the variables from the session. 
+ std::vector tensors; + tensors.reserve(resource_tensors.size()); + for (const Tensor& resource_tensor : resource_tensors) { + if (resource_tensor.dtype() != tensorflow::DT_RESOURCE) { + tensors.push_back(resource_tensor); + continue; + } + + const ResourceHandle& resource_handle = + resource_tensor.scalar()(); + + Device* device; + if (!(device_manager->LookupDevice(resource_handle.device(), &device) + .ok())) { + return module.emitOpError() << "failed to look up device"; + } + + tensorflow::Var* var_ptr; + if (!(device->resource_manager() + ->Lookup(resource_handle.container(), resource_handle.name(), + &var_ptr) + .ok())) { + return module.emitOpError() << "failed to look up resource value"; + } + tensorflow::core::RefCountPtr var(var_ptr); + + // The variable tensor is already loaded into corresponding device's + // resource manager when we load the saved model using LoadSavedModel(). + // Here we just read its value. + mutex_lock ml(*var->mu()); + tensors.push_back(*var->tensor()); + } + + for (const auto iter : llvm::zip(resource_names, tensors)) { + const StringRef name = std::get<0>(iter); + const Tensor& tensor = std::get<1>(iter); + + // Create tensor attribute for this variable. + StatusOr tensor_attr_or = ConvertTensor(tensor, &builder); + if (!tensor_attr_or.ok()) { + return module.emitOpError() + << "failed to convert tensor (name: " << name.str() << ")"; + } + ElementsAttr tensor_attr = tensor_attr_or.ValueOrDie(); + + builder.create( + NameLoc::get(builder.getIdentifier(name.str()), context), + builder.getStringAttr(name), tensor_attr, + TypeAttr::get(tensor_attr.getType()), builder.getUnitAttr()); + } + + return success(); +} + +} // namespace + +LogicalResult LiftVariables(ModuleOp module, Session* session) { + MLIRContext* context = module.getContext(); + mlir::Builder builder(context); + Identifier resource_name_id = builder.getIdentifier(kResourceNameArgAttr); + + SmallSet resource_names; + + for (FuncOp func : module.getOps()) { + for (int i = 0, e = func.getNumArguments(); i < e; ++i) { + auto resource_arg = + func.getArgAttrOfType(i, kResourceNameArgAttr); + if (!resource_arg) continue; + + StringRef resource_name = resource_arg.getValue(); + auto flat_symbol_ref_attr = + FlatSymbolRefAttr::get(resource_name, context); + + // Add the corresponding `tf_saved_model.bound_input` attribute. + func.setArgAttr(i, kSavedModelArgAttr, flat_symbol_ref_attr); + + resource_names.insert(flat_symbol_ref_attr.getValue()); + + // Remove the existing `tf.resource_name` attribute. + func.removeArgAttr(i, resource_name_id); + } + } + + if (resource_names.empty()) return success(); + + return LiftVariablesFromSession(module, session, resource_names); +} + +} // namespace tf_saved_model +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h new file mode 100644 index 00000000000..12dc787fbcf --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// Creates GlobalTensorOp for each variable from function arguments and converts +// them to the corresponding saved model arguments. +LogicalResult LiftVariables(ModuleOp module, ::tensorflow::Session* session); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_pass.h new file mode 100644 index 00000000000..0eaee959c77 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_pass.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_PASS_H_ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +// This pass takes care of finding all variables from the function arguments and +// converting them to the corresponding global tensors, that will be located out +// of function. Also it converts resource arguments from function types to the +// corresponding saved model arguments accordingly. +class LiftVariablesPass + : public PassWrapper> { + public: + explicit LiftVariablesPass(::tensorflow::Session* session) + : session_(session) {} + + void runOnOperation() override { + ModuleOp module = getOperation(); + if (failed(LiftVariables(module, session_))) signalPassFailure(); + } + + private: + ::tensorflow::Session* session_; +}; + +// Creates as pass that creates GlobalTensorOp for each variable from function +// arguments and converts the function arguments to the corresponding saved +// model arguments. 
+std::unique_ptr> CreateLiftVariablesPass( + ::tensorflow::Session* session); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_PASS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h new file mode 100644 index 00000000000..faecdf04368 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h @@ -0,0 +1,146 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_TEST_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_TEST_PASS_H_ + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/threadpool_options.h" +#include "tensorflow/core/public/session.h" + +namespace mlir { +namespace tf_saved_model { + +using ::tensorflow::DeviceMgr; +using ::tensorflow::Session; +using ::tensorflow::Status; +using ::tensorflow::Tensor; + +// FakeSession is for testing only. +class FakeSession : public tensorflow::Session { + public: + FakeSession() {} + ~FakeSession() override = default; + + Status Create(const tensorflow::GraphDef& graph) override { + return tensorflow::errors::Unimplemented("not available"); + } + Status Extend(const tensorflow::GraphDef& graph) override { + return tensorflow::errors::Unimplemented("not available"); + } + + Status Close() override { + return tensorflow::errors::Unimplemented("not available"); + } + + Status ListDevices( + std::vector* response) override { + return tensorflow::errors::Unimplemented("not available"); + } + + Status LocalDeviceManager( + const tensorflow::DeviceMgr** deviceMgrPtr) override { + // This method returns a null device manager without making an error. + // Users of this method will be notified since it will have a fake data. 
+ *deviceMgrPtr = nullptr; + return Status::OK(); + } + + Status Run(const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector* outputs) override { + tensorflow::RunMetadata run_metadata; + return Run(tensorflow::RunOptions(), inputs, output_names, target_nodes, + outputs, &run_metadata); + } + + Status Run(const tensorflow::RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector* outputs, + tensorflow::RunMetadata* run_metadata) override { + return Run(run_options, inputs, output_names, target_nodes, outputs, + run_metadata, tensorflow::thread::ThreadPoolOptions()); + } + + Status Run(const tensorflow::RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_names, + const std::vector& target_nodes, + std::vector* outputs, + tensorflow::RunMetadata* run_metadata, + const tensorflow::thread::ThreadPoolOptions& thread_pool_options) + override { + for (const std::string& output_name : output_names) { + Tensor output; + if (output_name == "dense/bias") { + outputs->push_back( + Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({50}))); + } else if (output_name == "dense/kernel") { + outputs->push_back( + Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({100, 50}))); + } else { + // Create a scalar float tensor. + outputs->push_back( + Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({}))); + } + } + return Status::OK(); + } +}; + +// This pass is only available in the tf-opt binary for testing. +class LiftVariablesTestPass + : public PassWrapper> { + public: + LiftVariablesTestPass() { session_ = new FakeSession(); } + + ~LiftVariablesTestPass() override { delete session_; } + + void runOnOperation() override { + ModuleOp module = getOperation(); + if (failed(LiftVariables(module, session_))) signalPassFailure(); + } + + private: + Session* session_; +}; + +// This pass is only available in the tf-opt binary for testing. +class LiftVariablesInvalidSessionTestPass + : public PassWrapper> { + public: + void runOnOperation() override { + ModuleOp module = getOperation(); + // Pass an invalid session argument, which is a nullptr. + if (failed(LiftVariables(module, /*session=*/nullptr))) signalPassFailure(); + } +}; + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LIFT_VARIABLES_TEST_PASS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass_registration.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass_registration.cc new file mode 100644 index 00000000000..19c367c6d46 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass_registration.cc @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h" + +namespace mlir { +namespace tf_saved_model { + +static PassRegistration lift_variables_test_pass( + "tf-saved-model-lift-variables-test", + "Lift variables and save them as global tensors"); + +static PassRegistration + lift_variables_invalid_session_test_pass( + "tf-saved-model-lift-variables-invalid-session-test", + "Lift variables and save them as global tensors with an invalid " + "session"); + +} // namespace tf_saved_model +} // namespace mlir From e3a423a9493af054d87f7ce45feb228a3ccfe6e6 Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Tue, 16 Jun 2020 16:34:38 -0700 Subject: [PATCH 0344/1390] Added FuzzedDataProvider to split fuzzer data Switched manual data splicing to FuzzedDataProvider @mihaimaruseac --- tensorflow/security/fuzzing/status_fuzz.cc | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tensorflow/security/fuzzing/status_fuzz.cc b/tensorflow/security/fuzzing/status_fuzz.cc index 7b161645148..8b5949009b1 100644 --- a/tensorflow/security/fuzzing/status_fuzz.cc +++ b/tensorflow/security/fuzzing/status_fuzz.cc @@ -17,6 +17,8 @@ limitations under the License. #include "tensorflow/core/platform/status.h" +#include + // This is a fuzzer for `tensorflow::Status`. Since `Status` is used almost // everywhere, we need to ensure that the common functionality is safe. We don't // expect many crashes from this fuzzer since we only create a status and then @@ -26,9 +28,7 @@ limitations under the License. namespace { -tensorflow::error::Code BuildRandomErrorCode(uint8_t a, uint8_t b, uint8_t c, - uint8_t d) { - int code = (a << 24) | (b << 16) | (c << 8) | d; +tensorflow::error::Code BuildRandomErrorCode(uint32_t code){ // We cannot build a `Status` with error_code of 0 and a message, so force // error code to be non-zero. 
@@ -39,22 +39,16 @@ tensorflow::error::Code BuildRandomErrorCode(uint8_t a, uint8_t b, uint8_t c, return static_cast(code); } -std::string GetRandomErrorString(const uint8_t *data, size_t size) { - const char *p = reinterpret_cast(data); - return std::string(p, size); -} - extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - // TODO(mihaimaruseac): Use `FuzzedDataProvider` and then make these `const` tensorflow::error::Code error_code; std::string error_message; - if (size < 4) { - error_code = BuildRandomErrorCode(0, 0, 0, 0); - error_message = GetRandomErrorString(data, size); - } else { - error_code = BuildRandomErrorCode(data[0], data[1], data[2], data[3]); - error_message = GetRandomErrorString(data + 4, size - 4); - } + + FuzzedDataProvider fuzzed_data(data, size); + + uint32_t code = fuzzed_data.ConsumeIntegral(); + error_code = BuildRandomErrorCode(code); + + error_message = fuzzed_data.ConsumeRemainingBytesAsString(); tensorflow::Status s = tensorflow::Status(error_code, error_message); const std::string actual_message = s.ToString(); From dac169cd7f2f618abc6511efcae0bb87d033ec7e Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Tue, 16 Jun 2020 16:34:47 -0700 Subject: [PATCH 0345/1390] Include TensorFlow LICENSE file to TFLite aars PiperOrigin-RevId: 316781613 Change-Id: I6323349321f5b009b1511e0cec12dbaca6a8c770 --- tensorflow/lite/java/aar_with_jni.bzl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/java/aar_with_jni.bzl b/tensorflow/lite/java/aar_with_jni.bzl index 71da735703d..34706c19c54 100644 --- a/tensorflow/lite/java/aar_with_jni.bzl +++ b/tensorflow/lite/java/aar_with_jni.bzl @@ -52,7 +52,11 @@ EOF ], ) - srcs = [android_library + ".aar", name + "_dummy_app_for_so_unsigned.apk"] + srcs = [ + android_library + ".aar", + name + "_dummy_app_for_so_unsigned.apk", + "//:LICENSE", + ] cmd = """ cp $(location {0}.aar) $(location :{1}.aar) @@ -62,6 +66,8 @@ cd $$(mktemp -d) unzip $$origdir/$(location :{1}_dummy_app_for_so_unsigned.apk) "lib/*" cp -r lib jni zip -r $$origdir/$(location :{1}.aar) jni/*/*.so +cp $$origdir/$(location //:LICENSE) ./ +zip $$origdir/$(location :{1}.aar) LICENSE """.format(android_library, name) if headers: From 897e3c0ecad3b45f5e96615173e7511619eebc93 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 16:39:31 -0700 Subject: [PATCH 0346/1390] Softmax1x1 converted to new style. 
PiperOrigin-RevId: 316782370 Change-Id: I1f7761c0520d72876f352c9f156341b349b90cbe --- .../delegates/gpu/cl/kernels/softmax1x1.cc | 87 ++++++++++--------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc index 192bee771d6..fcfe4a1810c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc @@ -25,47 +25,45 @@ namespace gpu { namespace cl { namespace { -std::string GetSoftmaxKernelCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor("src_data", - WHSBPoint{"tensor_size.x", "tensor_size.y", - "tensor_size.z", "tensor_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor("dst_data", - WHSBPoint{"tensor_size.x", "tensor_size.y", - "tensor_size.z", "tensor_size.w"}, - op_def.dst_tensors[0]); +std::string GetSoftmaxKernelCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + args->AddFloat("mask_x"); + args->AddFloat("mask_y"); + args->AddFloat("mask_z"); + args->AddFloat("mask_w"); + args->AddInt("slices_x32"); - const std::string batch_id = op_def.IsBatchSupported() ? "batch_id" : ""; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 tensor_size,\n"; - c += " int2 size,\n"; - c += " float4 mask\n"; - c += ") {\n"; + c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int batch_id = get_global_id(1);\n"; - c += " if (batch_id >= tensor_size.w) return;\n"; + c += " if (batch_id >= args.dst_tensor.Batch()) return;\n"; + c += " args.dst_tensor.SetBatchRef(batch_id);\n"; + c += " args.src_tensor.SetBatchRef(batch_id);\n"; } + c += " float4 mask = (float4)(args.mask_x, args.mask_y, args.mask_z, " + "args.mask_w);\n"; c += " int offset = 0;\n"; c += " float sum = 0.0f;\n"; c += " int s = 0;\n"; c += " int tid = get_local_id(0);\n"; c += " do {\n"; c += " int z = offset + tid;\n"; - c += " if (z < size.x) {\n"; - c += " float4 mask_temp = z == size.x - 1 ? mask : (float4)(1.0f);\n"; - c += " float4 src = " + - src_tensor.ReadAsFloatWHSB("0", "0", "z", batch_id) + ";\n"; + c += " if (z < args.dst_tensor.Slices()) {\n"; + c += " float4 mask_temp = z == args.dst_tensor.Slices() - 1 ? 
mask : " + "(float4)(1.0f);\n"; + c += " float4 src = args.src_tensor.Read(0, 0, z);\n"; c += " sum += dot(mask_temp, exp(src));\n"; c += " offset += 32;\n"; c += " }\n"; c += " s++;\n"; - c += " } while (s < size.y);\n"; + c += " } while (s < args.slices_x32);\n"; c += "\n"; c += " __local float4 tmp[8];\n"; c += " __local float* tmpx1 = (__local float*)tmp;\n"; @@ -89,16 +87,14 @@ std::string GetSoftmaxKernelCode( c += " s = 0;\n"; c += " do {\n"; c += " int z = offset + tid;\n"; - c += " if (z < size.x) {\n"; - c += " FLT4 res = TO_FLT4(exp(" + - src_tensor.ReadAsFloatWHSB("0", "0", "z", batch_id) + ")*sum);\n"; - const LinkingContext context{"res", "0", "0", "z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHSB("res", "0", "0", "z", batch_id); + c += " if (z < args.dst_tensor.Slices()) {\n"; + c += " FLT4 res = TO_FLT4(exp(args.src_tensor.Read(0, 0, " + "z))*sum);\n"; + c += " args.dst_tensor.Write(res, 0, 0, z);\n"; c += " offset += 32;\n"; c += " }\n"; c += " s++;\n"; - c += " } while (s < size.y);\n"; + c += " } while (s < args.slices_x32);\n"; c += "}\n"; return c; } @@ -116,23 +112,30 @@ Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) { } absl::Status Softmax1x1::Compile(const CreationContext& creation_context) { - const auto code = GetSoftmaxKernelCode(definition_, linked_operations_); + std::string code = GetSoftmaxKernelCode(definition_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - const int depth = src_[0]->Slices(); - RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(depth, DivideRoundUp(depth, 32)))); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + float4 mask = GetMaskForLastPlane(src_[0]->Channels()); + RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x)); + RETURN_IF_ERROR(args_.SetFloat("mask_y", mask.y)); + RETURN_IF_ERROR(args_.SetFloat("mask_z", mask.z)); + RETURN_IF_ERROR(args_.SetFloat("mask_w", mask.w)); RETURN_IF_ERROR( - kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels()))); - + args_.SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32))); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); return queue->DispatchImplicit(kernel_, {32, dst_[0]->Batch(), 1}, {32, 1, 1}); } From 267f956246750357f8eff2b88b2e6c8741b46775 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 16:39:49 -0700 Subject: [PATCH 0347/1390] load libcupti.so shared object if libcupti.so.10.1 is not installed. This makes it possible for tensorflow to load CUPTI 10.2 via libcupti.so symlink in the event that libcupti.so.10.1 is missing. 
PiperOrigin-RevId: 316782424 Change-Id: I930912f1d3a8fa80e8b91c41214c374650f08847 --- .../stream_executor/platform/default/dso_loader.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc index fb7d88aaedb..01af4114536 100644 --- a/tensorflow/stream_executor/platform/default/dso_loader.cc +++ b/tensorflow/stream_executor/platform/default/dso_loader.cc @@ -101,13 +101,11 @@ port::StatusOr GetCurandDsoHandle() { } port::StatusOr GetCuptiDsoHandle() { -#if defined(ANDROID_TEGRA) - // On Android devices the CUDA version number is not added to the library - // name. + // Load specific version of CUPTI this is built. + auto status_or_handle = GetDsoHandle("cupti", GetCudaVersion()); + if (status_or_handle.ok()) return status_or_handle; + // Load whatever libcupti.so user specified. return GetDsoHandle("cupti", ""); -#else - return GetDsoHandle("cupti", GetCudaVersion()); -#endif } port::StatusOr GetCudnnDsoHandle() { From f8657c62c60dffe01e27f8d47028b533c0837d2c Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 16 Jun 2020 16:48:19 -0700 Subject: [PATCH 0348/1390] Parallel device: avoid deadlocks when the EagerContext's default executor is async Creates one sync executor per thread. Requires fixing a tangential use-after-free where the context assumed all of the thread-local executors were still allocated at shutdown. PiperOrigin-RevId: 316783819 Change-Id: I62e7a91dcccb847d4e1c2a5f08e30c2877556618 --- tensorflow/c/eager/c_api_experimental_test.cc | 29 +++++++++++++++++ .../parallel_device/parallel_device_lib.cc | 18 +++++++++++ .../parallel_device/parallel_device_test.cc | 6 +--- .../core/common_runtime/eager/context.cc | 32 ++++++++++++++++++- .../core/common_runtime/eager/context.h | 2 ++ .../common_runtime/eager/eager_executor.cc | 11 +++++++ .../common_runtime/eager/eager_executor.h | 10 ++++++ .../parallel_device/parallel_device_test.py | 24 +++++++++++++- 8 files changed, 125 insertions(+), 7 deletions(-) diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index 0c058398299..a4d31417073 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -212,6 +212,35 @@ TEST(CAPI, CancellationManager) { TFE_DeleteCancellationManager(c_mgr); } +TEST(CAPI, ExecutorContextDestructionOrder) { + TF_Status* status = TF_NewStatus(); + + { + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + TFE_Executor* executor = TFE_NewExecutor(/*is_async=*/false); + TFE_ContextSetExecutorForThread(ctx, executor); + + TFE_DeleteContext(ctx); + TFE_DeleteExecutor(executor); + } + + { + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + TFE_Executor* executor = TFE_NewExecutor(/*is_async=*/false); + TFE_ContextSetExecutorForThread(ctx, executor); + + TFE_DeleteExecutor(executor); + TFE_DeleteContext(ctx); + } + TF_DeleteStatus(status); +} + TEST(CAPI, Function_ident_CPU) { // First create a simple identity function. 
TF_Graph* function_graph = TF_NewGraph(); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index 98cd4812610..d0149b29c08 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -37,6 +37,15 @@ class StatusDeleter { using StatusPtr = std::unique_ptr; +class ExecutorDeleter { + public: + void operator()(TFE_Executor* to_delete) const { + TFE_DeleteExecutor(to_delete); + } +}; + +using ExecutorPtr = std::unique_ptr; + } // namespace // Allows a single op at a time to be launched without blocking. @@ -51,6 +60,13 @@ class DeviceThread { explicit DeviceThread(const std::string& device) : status_(TF_NewStatus()), device_(device), + // If the context's default exector is set to async, re-using that in + // each thread would cause collectives to deadlock. For consistency we + // create a new sync executor for every thread. + // + // TODO(allenl): We should have an async API that works with the + // parallel device. + executor_(TFE_NewExecutor(/*is_async=*/false)), op_(nullptr), thread_(tensorflow::Env::Default()->StartThread( tensorflow::ThreadOptions(), "parallel_device_execute", @@ -105,6 +121,7 @@ class DeviceThread { StatusPtr status_ TF_GUARDED_BY(execution_mutex_); const std::string device_; + ExecutorPtr executor_ TF_GUARDED_BY(execution_mutex_); mutable OpPtr op_ TF_GUARDED_BY(execution_mutex_); std::unique_ptr thread_; }; @@ -186,6 +203,7 @@ void DeviceThread::Execute(TFE_Context* context, const char* operation_name, std::vector* outputs, TF_Status* status) const { if (op_ == nullptr) { + TFE_ContextSetExecutorForThread(context, executor_.get()); op_.reset(TFE_NewOp(context, operation_name, status)); if (TF_GetCode(status) != TF_OK) return; TFE_OpSetDevice(op_.get(), device_.c_str(), status); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index e5412dbba61..2fa183d50f6 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -412,6 +412,7 @@ void TestCollective(bool async) { TF_NewStatus(), TF_DeleteStatus); std::unique_ptr opts( TFE_NewContextOptions(), TFE_DeleteContextOptions); + TFE_ContextOptionsSetAsync(opts.get(), async); std::unique_ptr config( TF_CreateConfig( /*xla*/ false, @@ -423,9 +424,6 @@ void TestCollective(bool async) { std::unique_ptr context( TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - std::unique_ptr executor( - TFE_NewExecutor(async), TFE_DeleteExecutor); - TFE_ContextSetExecutorForThread(context.get(), executor.get()); const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; std::array underlying_devices{ @@ -455,8 +453,6 @@ void TestCollective(bool async) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); ExpectScalarEq(result_components[0].get(), 3.); ExpectScalarEq(result_components[1].get(), 3.); - // Destroying the context's default executor first isn't safe. 
- context.reset(); } TEST(PARALLEL_DEVICE, TestCollectiveSync) { TestCollective(/*async=*/false); } diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 5d8cb3da6bc..970c2bcbb89 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -341,7 +341,28 @@ void EagerContext::SetExecutorForThread(EagerExecutor* executor) { if (executor == &default_executor_) { thread_local_executor_.erase(std::this_thread::get_id()); } else { - thread_local_executor_[std::this_thread::get_id()] = executor; + auto thread_id = std::this_thread::get_id(); + thread_local_executor_[thread_id] = executor; + auto& executors_with_cleanups = has_cleanup_[thread_id]; + if (executors_with_cleanups.find(executor) == + executors_with_cleanups.end()) { + executors_with_cleanups.insert(executor); + // If the executor is deleted before this context, we need to remove it + // from the map to avoid attempting to sync it in our destructor. + std::function cleanup([this, thread_id, executor]() { + { + tensorflow::mutex_lock l(executor_map_mu_); + auto existing = thread_local_executor_.find(thread_id); + if (existing != thread_local_executor_.end() && + existing->second == executor) { + thread_local_executor_.erase(thread_id); + } + has_cleanup_[thread_id].erase(executor); + } + }); + executor->AddCleanup(reinterpret_cast(this), + std::move(cleanup)); + } } } @@ -525,6 +546,15 @@ EagerContext::~EagerContext() { custom_devices_.clear(); ClearCachesAndThreadExecutors(); + std::unordered_map executors_copy; + { + mutex_lock l(executor_map_mu_); + executors_copy = thread_local_executor_; + } + for (const auto& entry : executors_copy) { + // Let the executor know that its cleanup closure is no longer valid. + entry.second->RemoveCleanups(reinterpret_cast(this)); + } for (auto& entry : registered_functions_) { while (!entry.second->Unref()) { // remove all references. diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index fa57afecbaf..cb6d09f8f1d 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -639,6 +639,8 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { // Not owned. 
std::unordered_map thread_local_executor_ TF_GUARDED_BY(executor_map_mu_); + std::unordered_map> + has_cleanup_ TF_GUARDED_BY(executor_map_mu_); const bool log_memory_; diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index ddfdabf9472..7fe321edffd 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -46,6 +46,11 @@ EagerExecutor::~EagerExecutor() { tensorflow::mutex_lock l(node_queue_mutex_); state_ = ExecutorState::kShutDown; nodes_pending_.notify_all(); + for (const auto& cleanups_for_key : cleanups_) { + for (const std::function& cleanup : cleanups_for_key.second) { + cleanup(); + } + } } Status EagerExecutor::ShutDown() { @@ -413,4 +418,10 @@ Status EagerExecutor::MoveToUnfinished(core::RefCountPtr item, return Status::OK(); } +void EagerExecutor::AddCleanup(intptr_t key, std::function callback) { + cleanups_[key].push_back(callback); +} + +void EagerExecutor::RemoveCleanups(intptr_t key) { cleanups_.erase(key); } + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h index aa8864c7ad6..34847abc26a 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.h +++ b/tensorflow/core/common_runtime/eager/eager_executor.h @@ -153,6 +153,13 @@ class EagerExecutor { bool ok() const TF_NO_THREAD_SAFETY_ANALYSIS { return ok_; } + // On destruction, runs `callback`. Used by the EagerContext for clearing + // thread-local executors. + void AddCleanup(intptr_t key, std::function callback); + // If `key` (e.g. a context) is destroyed before the executor, the associated + // callbacks are no longer safe to run. + void RemoveCleanups(intptr_t key); + private: // Possible states for this executor. // Executor starts in kActive state. When Shutdown() is called, Executor @@ -250,6 +257,9 @@ class EagerExecutor { const eager::EagerClient* last_eager_client_; const bool enable_async_wait_for_remote_function_; + + // Callbacks to run on destruction. + std::unordered_map>> cleanups_; }; inline bool EagerExecutor::Async() const { return thread_ != nullptr; } diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index 9dbf258f70f..8fc3dcb5816 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -23,6 +23,7 @@ import threading from tensorflow.python.distribute.parallel_device import parallel_device from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.framework import config from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.module import module @@ -136,7 +137,7 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[0], outputs[0].backing_device) self.assertIn(self.device.components[1], outputs[1].backing_device) - def test_collective_reduce_async(self): + def test_collective_reduce_async_scope(self): # Note that ops on the parallel device currently don't execute # asynchronously. The test is just that we don't get deadlocks. 
with context.async_scope(), ops.device(self.device.name): @@ -149,6 +150,27 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[0], outputs[0].backing_device) self.assertIn(self.device.components[1], outputs[1].backing_device) + def test_collective_reduce_async_context(self): + previous = config.get_synchronous_execution() + try: + context._reset_context() + config.set_synchronous_execution(False) + self.setUp() + # Note that ops on the parallel device currently don't execute + # asynchronously. The test is just that we don't get deadlocks. + with ops.device(self.device.name): + x = self.device.pack( + [constant_op.constant(-1.5), + constant_op.constant(3.5)]) + reduced = _collective_sum(x, num_replicas=2) + outputs = self.device.unpack(reduced) + self.assertAllClose([2., 2.], outputs) + self.assertIn(self.device.components[0], outputs[0].backing_device) + self.assertIn(self.device.components[1], outputs[1].backing_device) + finally: + context._reset_context() + config.set_synchronous_execution(previous) + def test_checkpointing(self): prefix = os.path.join(self.get_temp_dir(), "ckpt") with self.device.scope(): From 950cffcd8deb881dcbfdf92f22c37eaa36f61e04 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jun 2020 16:51:59 -0700 Subject: [PATCH 0349/1390] Enable builds of TF to link against `libc++`. This should now enable more fuzzers and a nicer/stabler OSSFuzz integration. PiperOrigin-RevId: 316784432 Change-Id: Iaef6c288221c3f7214d7806aa6913f0370a63544 --- .bazelrc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.bazelrc b/.bazelrc index 5ea8048d5d9..e21a1a32917 100644 --- a/.bazelrc +++ b/.bazelrc @@ -30,6 +30,7 @@ # short_logs: Only log errors during build, skip warnings. # monolithic: Build all TF C++ code into a single shared object. # dynamic_kernels: Try to link all kernels dynamically (experimental). +# libc++: Link against libc++ instead of stdlibc++ # # # TF version options; @@ -79,6 +80,14 @@ # elinux_armhf: Embedded Linux options for armhf (ARMv7) CPU support. +# Allow builds using libc++ as a linker library +# This is mostly for OSSFuzz, so we also pass in the flags from environment to clean build file +build:libc++ --action_env=CC +build:libc++ --action_env=CXX +build:libc++ --action_env=CXXFLAGS=-stdlib=libc++ +build:libc++ --action_env=PATH +build:libc++ --define force_libcpp=enabled +build:libc++ --linkopt -fuse-ld=lld # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the # target CPU to build transient dependencies correctly. See From c49404dc26d9daa82a012253ca55f95eab6891eb Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Tue, 16 Jun 2020 17:00:16 -0700 Subject: [PATCH 0350/1390] TPU library refactor. 
PiperOrigin-RevId: 316785838 Change-Id: I408dd6be75ed6a6ccfdbfad704a1a695906f2aa9 --- .../core/tpu/kernels/tpu_compile_op_common.cc | 22 +++++++++---------- .../core/tpu/kernels/tpu_compile_op_common.h | 11 +++++----- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc index ae090913dc7..c8faba1d975 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc @@ -118,7 +118,7 @@ Status SetPerCoreArgShapes( } // namespace -Status TPUCompileOpKernelCommon::AssignReturnValueToCore( +Status TpuCompileOpKernelCommon::AssignReturnValueToCore( std::vector* retval_core_mapping) { std::vector per_core_retval_counts(metadata_.num_cores_per_replica(), 0); for (int i = 0; i < metadata_.retvals_size(); ++i) { @@ -149,7 +149,7 @@ Status TPUCompileOpKernelCommon::AssignReturnValueToCore( return Status::OK(); } -Status TPUCompileOpKernelCommon::BuildComputationArgumentDescriptions( +Status TpuCompileOpKernelCommon::BuildComputationArgumentDescriptions( const std::vector& arg_shapes, const OpInputList& guaranteed_constants, const XlaCompiler& compiler, std::vector* args, @@ -207,7 +207,7 @@ Status TPUCompileOpKernelCommon::BuildComputationArgumentDescriptions( return Status::OK(); } -Status TPUCompileOpKernelCommon::GetShardingInfo( +Status TpuCompileOpKernelCommon::GetShardingInfo( absl::Span arg_shapes, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, std::vector* arg_core_mapping, @@ -230,7 +230,7 @@ Status TPUCompileOpKernelCommon::GetShardingInfo( return Status::OK(); } -Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( +Status TpuCompileOpKernelCommon::CompileTFFunctionToHlo( const FunctionLibraryDefinition& flib_def, int graph_def_version, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, const std::vector& arg_shapes, @@ -260,7 +260,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( std::vector retval_core_mapping( metadata_.retvals_size()); TF_RETURN_IF_ERROR( - TPUCompileOpKernelCommon::AssignReturnValueToCore(&retval_core_mapping)); + TpuCompileOpKernelCommon::AssignReturnValueToCore(&retval_core_mapping)); LOG(INFO) << "Instantiating function:" << function.name(); FunctionLibraryRuntime::Handle handle; @@ -341,7 +341,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( args, compilation_result); } -/* static */ void TPUCompileOpKernelCommon::ExitCountdown( +/* static */ void TpuCompileOpKernelCommon::ExitCountdown( OpKernelContext* ctx, std::shared_ptr> done) { const int kSleepSeconds = 300; LOG(INFO) << "TpuCompileOp was cancelled. 
Sleeping for " << kSleepSeconds @@ -355,7 +355,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( LogAndExit(42); } -/* static */ Status TPUCompileOpKernelCommon::GetDynamicShapes( +/* static */ Status TpuCompileOpKernelCommon::GetDynamicShapes( OpKernelContext* ctx, std::vector* shapes) { OpInputList dynamic_shapes; TF_RETURN_IF_ERROR(ctx->input_list("dynamic_shapes", &dynamic_shapes)); @@ -368,7 +368,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( return Status::OK(); } -/* static */ Status TPUCompileOpKernelCommon::ComputeArgumentShapes( +/* static */ Status TpuCompileOpKernelCommon::ComputeArgumentShapes( const tpu::TPUCompileMetadataProto& metadata, const std::vector& dynamic_shapes, std::vector* arg_shapes) { @@ -409,7 +409,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( // Function arguments and return values lose their device assignments, so we // must recreate them. -/* static */ Status TPUCompileOpKernelCommon::AssignDevicesToArgsAndRetvals( +/* static */ Status TpuCompileOpKernelCommon::AssignDevicesToArgsAndRetvals( absl::Span arg_core_mapping, absl::Span retval_core_mapping, Graph* graph) { auto assign = [&](Node* node, const xla::OpSharding& sharding) -> Status { @@ -444,7 +444,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( // Performs shape inference on the body of `graph`. Shapes for arguments // are taken from `metadata` and `arg_shapes`. -/* static */ Status TPUCompileOpKernelCommon::RunShapeInferenceOnComputation( +/* static */ Status TpuCompileOpKernelCommon::RunShapeInferenceOnComputation( const tpu::TPUCompileMetadataProto& metadata, const std::vector& arg_shapes, Graph* graph, FunctionLibraryRuntime* flr, GraphShapeInfo* shape_info) { @@ -476,7 +476,7 @@ Status TPUCompileOpKernelCommon::CompileTFFunctionToHlo( shape_info); } -Status TPUCompileOpKernelCommon::OptimizeGraph( +Status TpuCompileOpKernelCommon::OptimizeGraph( const tpu::TPUCompileMetadataProto& metadata, const std::vector& arg_shapes, std::unique_ptr* graph, FunctionLibraryRuntime* flr, diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h index 74ae8729f8b..2c8d90643ef 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h @@ -33,9 +33,9 @@ namespace tensorflow { namespace tpu { // Abstract base class for TpuCompileOpKernel implementation. 
-class TPUCompileOpKernelCommon { +class TpuCompileOpKernelCommon { public: - TPUCompileOpKernelCommon(const std::string& mlir_module, + TpuCompileOpKernelCommon(const std::string& mlir_module, const tpu::TPUCompileMetadataProto metadata, int num_computations) : metadata_(metadata), @@ -43,7 +43,7 @@ class TPUCompileOpKernelCommon { mlir_module_(mlir_module), num_computations_(num_computations) {} - TPUCompileOpKernelCommon(const NameAttrList& function, + TpuCompileOpKernelCommon(const NameAttrList& function, const tpu::TPUCompileMetadataProto metadata, int num_computations) : metadata_(metadata), @@ -51,7 +51,7 @@ class TPUCompileOpKernelCommon { function_(function), num_computations_(num_computations) {} - virtual ~TPUCompileOpKernelCommon() = default; + virtual ~TpuCompileOpKernelCommon() = default; virtual void Compute(OpKernelContext* ctx) = 0; @@ -153,8 +153,7 @@ class TPUCompileOpKernelCommon { int num_computations_; private: - TPUCompileOpKernelCommon(const TPUCompileOpKernelCommon&) = delete; - TPUCompileOpKernelCommon& operator=(const TPUCompileOpKernelCommon&) = delete; + DISALLOW_COPY_AND_ASSIGN(TpuCompileOpKernelCommon); }; } // namespace tpu From 6ad3d3c05cb0dff9d6253a6089acaa7ac6c57604 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 16 Jun 2020 17:00:56 -0700 Subject: [PATCH 0351/1390] Make device_util_test as PY3 only PiperOrigin-RevId: 316785955 Change-Id: Ibef166f3b3d095f9c7aaceb68c1c15952d91d250 --- tensorflow/python/distribute/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index e39631d634f..96559a9a740 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -120,6 +120,8 @@ py_library( cuda_py_test( name = "device_util_test", srcs = ["device_util_test.py"], + python_version = "PY3", + tags = ["no_oss_py2"], deps = [ ":combinations", ":device_util", From de0d8ddc270f33b899d56b04eed622c2a0906006 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Tue, 16 Jun 2020 17:01:18 -0700 Subject: [PATCH 0352/1390] Set the type of sparsity metadata accordingly to further reduce the model size. PiperOrigin-RevId: 316786000 Change-Id: I8748ba453802ea86d586170bc2450ed970ed316c --- .../compiler/mlir/lite/flatbuffer_export.cc | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index df84b028f63..a260670015a 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -1406,22 +1406,67 @@ BufferOffset Translator::BuildSparsityParameters( for (int j = 0; j < segments.size(); j++) { vector_segments[j] = segments[j].dyn_cast().getInt(); } - auto array_segments = - tflite::CreateInt32Vector(builder_, - builder_.CreateVector(vector_segments)) - .Union(); + tflite::SparseIndexVector segments_type; + BufferOffset array_segments; + // The segment array is sorted. + // TODO(b/147449640): Clean this up with util functions. 
+ int max_of_segments = vector_segments[segments.size() - 1]; + if (max_of_segments <= UINT8_MAX) { + segments_type = tflite::SparseIndexVector_Uint8Vector; + std::vector uint8_vector(vector_segments.begin(), + vector_segments.end()); + array_segments = tflite::CreateUint8Vector( + builder_, builder_.CreateVector(uint8_vector)) + .Union(); + } else if (max_of_segments <= UINT16_MAX) { + segments_type = tflite::SparseIndexVector_Uint16Vector; + std::vector uint16_vector(vector_segments.begin(), + vector_segments.end()); + array_segments = tflite::CreateUint16Vector( + builder_, builder_.CreateVector(uint16_vector)) + .Union(); + } else { + segments_type = tflite::SparseIndexVector_Int32Vector; + array_segments = tflite::CreateInt32Vector( + builder_, builder_.CreateVector(vector_segments)) + .Union(); + } + auto indices = dim_metadata.indices(); std::vector vector_indices(indices.size(), 0); + int max_of_indices = 0; for (int j = 0; j < indices.size(); j++) { vector_indices[j] = indices[j].dyn_cast().getInt(); + if (vector_indices[j] > max_of_indices) { + max_of_indices = vector_indices[j]; + } } - auto array_indices = tflite::CreateInt32Vector( - builder_, builder_.CreateVector(vector_indices)) - .Union(); + tflite::SparseIndexVector indices_type; + BufferOffset array_indices; + if (max_of_indices <= UINT8_MAX) { + indices_type = tflite::SparseIndexVector_Uint8Vector; + std::vector uint8_vector(vector_indices.begin(), + vector_indices.end()); + array_indices = tflite::CreateUint8Vector( + builder_, builder_.CreateVector(uint8_vector)) + .Union(); + } else if (max_of_indices <= UINT16_MAX) { + indices_type = tflite::SparseIndexVector_Uint16Vector; + std::vector uint16_vector(vector_indices.begin(), + vector_indices.end()); + array_indices = tflite::CreateUint16Vector( + builder_, builder_.CreateVector(uint16_vector)) + .Union(); + } else { + indices_type = tflite::SparseIndexVector_Int32Vector; + array_indices = tflite::CreateInt32Vector( + builder_, builder_.CreateVector(vector_indices)) + .Union(); + } + fb_dim_metadata[i] = tflite::CreateDimensionMetadata( - builder_, tflite::DimensionType_SPARSE_CSR, 0, - tflite::SparseIndexVector_Int32Vector, array_segments, - tflite::SparseIndexVector_Int32Vector, array_indices); + builder_, tflite::DimensionType_SPARSE_CSR, 0, segments_type, + array_segments, indices_type, array_indices); } } From 6ffd6820ff844917e5f3d673695158396806d1be Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Wed, 17 Jun 2020 07:13:24 +0700 Subject: [PATCH 0353/1390] Add dependecies --- tensorflow/c/env.cc | 7 ++++--- tensorflow/c/env.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 3d490d95e66..43879a18359 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/types.h" struct TF_StringStream { @@ -148,9 +149,9 @@ TF_StringStream* TF_GetLocalTempDirectories() { void TF_GetTempFileName(const char* extension, std::string* name, TF_Status* status) { - *name = ::tensorflow::Env::Default()->GetTempFilename(extension); - if (*name.length() == 0) { - TF_SetStatus(status, TF_INTERNAL, "Can not create temp file name"); + *name = ::tensorflow::io::GetTempFilename(extension); + if (name->length() == 0) { + TF_SetStatus(status, TF_INTERNAL, "Can not get temp file name"); } else { TF_SetStatus(status, TF_OK, ""); } diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index b50d0fdec03..273a3b5e142 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include + #include "tensorflow/c/c_api.h" #include "tensorflow/c/tf_file_statistics.h" From 01cfc8a8a3d6176b1f028886087de4eaaa64ce2f Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 17:07:23 -0700 Subject: [PATCH 0354/1390] Resize converted to new style. PiperOrigin-RevId: 316787130 Change-Id: I67db63fa6eaec2bccc87031f2e202da65a2ce439 --- .../lite/delegates/gpu/cl/kernels/resize.cc | 287 +++++++++--------- 1 file changed, 147 insertions(+), 140 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc index 5d578fe6e09..6aa2d1d2570 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc @@ -25,168 +25,166 @@ namespace gpu { namespace cl { namespace { -std::string GetResizeCode( - const OperationDef& op_def, SamplingType sampling_type, - bool half_pixel_centers, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GetResizeCode(const OperationDef& op_def, + SamplingType sampling_type, bool half_pixel_centers, + Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + if (op_def.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); + if (op_def.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + args->AddInt("border_x"); + args->AddInt("border_y"); + args->AddFloat("scale_factor_x"); + args->AddFloat("scale_factor_y"); std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int2 border, \n"; - c += " float2 scale_factor \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; - c += " if 
(get_global_id(0) >= dst_size.x || Y >= dst_size.y || Z >= " - "dst_size.z) return;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " if (linear_id >= args.dst_tensor.Width() || Y >= " + "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) return;\n"; } else { c += " int X = get_global_id(0);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) " - "return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " + "|| Z >= args.dst_tensor.Slices()) return;\n"; } if (sampling_type == SamplingType::NEAREST) { - c += " int2 coord = (int2)(X * scale_factor.x, Y * scale_factor.y);\n"; + c += " int2 coord = (int2)(X * args.scale_factor_x, Y * " + "args.scale_factor_y);\n"; if (op_def.IsBatchSupported()) { - c += " coord.x = coord.x * src_size.w + B;\n"; - c += " X = X * src_size.w + B;\n"; + c += " coord.x = coord.x * args.src_tensor.Batch() + B;\n"; + c += " X = X * args.src_tensor.Batch() + B;\n"; } - c += " FLT4 r0 = " + src_tensor.ReadWHS("coord.x", "coord.y", "Z") + ";\n"; + c += " FLT4 r0 = args.src_tensor.Read(coord.x, coord.y, Z);\n"; } else { if (half_pixel_centers) { - c += " float2 f_coords = ((float2)(X, Y) + 0.5f) * scale_factor - " + c += " float2 f_coords = ((float2)(X, Y) + 0.5f) * " + "(float2)(args.scale_factor_x, args.scale_factor_y) - " "0.5f;\n"; } else { - c += " float2 f_coords = (float2)(X, Y) * scale_factor;\n"; + c += " float2 f_coords = (float2)(X, Y) * (float2)(args.scale_factor_x, " + "args.scale_factor_y);\n"; } c += " float2 f_coords_floor = floor(f_coords);\n"; c += " int2 coords_floor = (int2)(f_coords_floor.x, f_coords_floor.y);\n"; c += " int4 st;\n"; c += " st.xy = max(coords_floor, (int2)(0, 0));\n"; - c += " st.zw = min(coords_floor + (int2)(1, 1), border);\n"; + c += " st.zw = min(coords_floor + (int2)(1, 1), (int2)(args.border_x, " + "args.border_y));\n"; c += " float2 t = f_coords - f_coords_floor;\n"; if (op_def.IsBatchSupported()) { - c += " st.x = st.x * src_size.w + B;\n"; - c += " st.z = st.z * src_size.w + B;\n"; - c += " X = X * src_size.w + B;\n"; + c += " st.x = st.x * args.src_tensor.Batch() + B;\n"; + c += " st.z = st.z * args.src_tensor.Batch() + B;\n"; + c += " X = X * args.src_tensor.Batch() + B;\n"; } - c += " float4 src0 = " + src_tensor.ReadAsFloatWHS("st.x", "st.y", "Z") + - ";\n"; - c += " float4 src1 = " + src_tensor.ReadAsFloatWHS("st.z", "st.y", "Z") + - ";\n"; - c += " float4 src2 = " + src_tensor.ReadAsFloatWHS("st.x", "st.w", "Z") + - ";\n"; - c += " float4 src3 = " + src_tensor.ReadAsFloatWHS("st.z", "st.w", "Z") + - ";\n"; + c += " float4 src0 = args.src_tensor.Read(st.x, st.y, Z);\n"; + c += " float4 src1 = args.src_tensor.Read(st.z, st.y, Z);\n"; + c += " float4 src2 = args.src_tensor.Read(st.x, st.w, Z);\n"; + c += " float4 src3 = args.src_tensor.Read(st.z, st.w, Z);\n"; c += " FLT4 r0 = TO_FLT4(mix(mix(src0, src1, t.x), mix(src2, src3, t.x), " "t.y));\n"; } - const LinkingContext context{"r0", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("r0", "X", "Y", "Z"); + c += " args.dst_tensor.Write(r0, X, Y, Z);\n"; c += "}\n"; return c; } -std::string GetResize3DCode( - const OperationDef& op_def, SamplingType sampling_type, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", 
- WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); +std::string GetResize3DCode(const OperationDef& op_def, + SamplingType sampling_type, Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + if (op_def.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); + if (op_def.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + args->AddInt("border_x"); + args->AddInt("border_y"); + args->AddInt("border_z"); + args->AddFloat("scale_factor_x"); + args->AddFloat("scale_factor_y"); + args->AddFloat("scale_factor_z"); std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - if (op_def.IsBatchSupported()) { - c += " int batch_size, \n"; - } - c += " int4 border, \n"; - c += " float4 scale_factor \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int Y = get_global_id(1);\n"; c += " int linear_id_z = get_global_id(2);\n"; - c += " int S = linear_id_z % dst_size.w;\n"; - c += " int Z = linear_id_z / dst_size.w;\n"; + c += " int S = linear_id_z % args.dst_tensor.Slices();\n"; + c += " int Z = linear_id_z / args.dst_tensor.Slices();\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / batch_size;\n"; - c += " int B = linear_id % batch_size;\n"; - c += " if (linear_id >= dst_size.x || Y >= dst_size.y || Z >= " - "dst_size.z) return;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " if (linear_id >= args.dst_tensor.Width() || Y >= " + "args.dst_tensor.Height() || Z >= args.dst_tensor.Depth()) return;\n"; } else { c += " int X = get_global_id(0);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) " - "return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " + "|| Z >= args.dst_tensor.Depth()) return;\n"; } if (sampling_type == SamplingType::NEAREST) { - c += " int4 coord = (int4)(X * scale_factor.x, Y * scale_factor.y, Z * " - "scale_factor.z, 0);\n"; + c += " int4 coord = (int4)(X * args.scale_factor_x, Y * " + "args.scale_factor_y, Z * " + "args.scale_factor_z, 0);\n"; if (op_def.IsBatchSupported()) { - c += " coord.x = coord.x * batch_size + B;\n"; - c += " X = X * batch_size + B;\n"; + c += " coord.x = coord.x * args.src_tensor.Batch() + B;\n"; + c += " X = X * args.src_tensor.Batch() + B;\n"; } - c += " FLT4 r0 = " + - src_tensor.ReadWHDS("coord.x", "coord.y", "coord.z", "S") + ";\n"; + c += " FLT4 r0 = args.src_tensor.Read(coord.x, coord.y, coord.z, S);\n"; } else { - c += " float4 f_coords = (float4)(X, Y, Z, 0) * scale_factor;\n"; + c += " float4 f_coords;\n"; + c += " f_coords.x = (float)(X) * args.scale_factor_x;\n"; + c += " f_coords.y = (float)(Y) * args.scale_factor_y;\n"; + c += " f_coords.z = (float)(Z) * args.scale_factor_z;\n"; c += " int4 start = (int4)(f_coords.x, f_coords.y, f_coords.z, 0);\n"; - c += " int4 end = min(start + (int4)(1, 1, 1, 0), border);\n"; + c += " int4 end;\n"; + c += " end.x = min(start.x + 1, args.border_x);\n"; + c 
+= " end.y = min(start.y + 1, args.border_y);\n"; + c += " end.z = min(start.z + 1, args.border_z);\n"; c += " float4 t = f_coords - (float4)(start.x, start.y, start.z, 0.0f);\n"; if (op_def.IsBatchSupported()) { - c += " start.x = start.x * batch_size + B;\n"; - c += " end.x = end.x * batch_size + B;\n"; - c += " X = X * batch_size + B;\n"; + c += " start.x = start.x * args.src_tensor.Batch() + B;\n"; + c += " end.x = end.x * args.src_tensor.Batch() + B;\n"; + c += " X = X * args.src_tensor.Batch() + B;\n"; } - c += " float4 src0 = " + - src_tensor.ReadAsFloatWHDS("start.x", "start.y", "start.z", "S") + - ";\n"; - c += " float4 src1 = " + - src_tensor.ReadAsFloatWHDS("end.x", "start.y", "start.z", "S") + ";\n"; - c += " float4 src2 = " + - src_tensor.ReadAsFloatWHDS("start.x", "end.y", "start.z", "S") + ";\n"; - c += " float4 src3 = " + - src_tensor.ReadAsFloatWHDS("end.x", "end.y", "start.z", "S") + ";\n"; - c += " float4 src4 = " + - src_tensor.ReadAsFloatWHDS("start.x", "start.y", "end.z", "S") + ";\n"; - c += " float4 src5 = " + - src_tensor.ReadAsFloatWHDS("end.x", "start.y", "end.z", "S") + ";\n"; - c += " float4 src6 = " + - src_tensor.ReadAsFloatWHDS("start.x", "end.y", "end.z", "S") + ";\n"; - c += " float4 src7 = " + - src_tensor.ReadAsFloatWHDS("end.x", "end.y", "end.z", "S") + ";\n"; + c += " float4 src0 = args.src_tensor.Read(start.x, start.y, " + "start.z, S);\n"; + c += " float4 src1 = args.src_tensor.Read(end.x, start.y, start.z, " + "S);\n"; + c += " float4 src2 = args.src_tensor.Read(start.x, end.y, start.z, " + "S);\n"; + c += " float4 src3 = args.src_tensor.Read(end.x, end.y, start.z, " + "S);\n"; + c += " float4 src4 = args.src_tensor.Read(start.x, start.y, end.z, " + "S);\n"; + c += " float4 src5 = args.src_tensor.Read(end.x, start.y, end.z, " + "S);\n"; + c += " float4 src6 = args.src_tensor.Read(start.x, end.y, end.z, " + "S);\n"; + c += " float4 src7 = args.src_tensor.Read(end.x, end.y, end.z, " + "S);\n"; c += " float4 t0 = mix(mix(src0, src1, t.x), mix(src2, src3, t.x), t.y);\n"; c += " float4 t1 = mix(mix(src4, src5, t.x), mix(src6, src7, t.x), t.y);\n"; c += " FLT4 r0 = TO_FLT4(mix(t0, t1, t.z));\n"; } - const LinkingContext context{"r0", "X", "Y", "S"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHDS("r0", "X", "Y", "Z", "S"); + c += " args.dst_tensor.Write(r0, X, Y, Z, S);\n"; c += "}\n"; return c; } @@ -210,27 +208,32 @@ Resize& Resize::operator=(Resize&& operation) { } absl::Status Resize::Compile(const CreationContext& creation_context) { - const auto code = GetResizeCode(definition_, attr_.type, - attr_.half_pixel_centers, linked_operations_); + std::string code = + GetResizeCode(definition_, attr_.type, attr_.half_pixel_centers, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Resize::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); - 
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(src_[0]->Width() - 1, src_[0]->Height() - 1))); - float2 scale_factor = - float2(CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_), - CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_)); - RETURN_IF_ERROR(kernel_.SetBytesAuto(scale_factor)); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1)); + RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1)); + RETURN_IF_ERROR(args_.SetFloat( + "scale_factor_x", + CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_))); + RETURN_IF_ERROR(args_.SetFloat( + "scale_factor_y", + CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_))); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Resize::GetGridSize() const { @@ -272,31 +275,35 @@ Resize3D& Resize3D::operator=(Resize3D&& operation) { } absl::Status Resize3D::Compile(const CreationContext& creation_context) { - const auto code = - GetResize3DCode(definition_, attr_.type, linked_operations_); + std::string code = GetResize3DCode(definition_, attr_.type, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Resize3D::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS())); - if (definition_.IsBatchSupported()) { - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch())); - } - RETURN_IF_ERROR(kernel_.SetBytesAuto(int4( - src_[0]->Width() - 1, src_[0]->Height() - 1, src_[0]->Depth() - 1, 0))); - float4 scale_factor = float4( - CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_), - CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_), - CalculateResizeScale(src_[0]->Depth(), dst_[0]->Depth(), attr_), 1.0f); - RETURN_IF_ERROR(kernel_.SetBytesAuto(scale_factor)); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1)); + RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1)); + RETURN_IF_ERROR(args_.SetInt("border_z", src_[0]->Depth() - 1)); + RETURN_IF_ERROR(args_.SetFloat( + "scale_factor_x", + CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_))); + RETURN_IF_ERROR(args_.SetFloat( + "scale_factor_y", + CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_))); + RETURN_IF_ERROR(args_.SetFloat( + "scale_factor_z", + CalculateResizeScale(src_[0]->Depth(), dst_[0]->Depth(), attr_))); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Resize3D::GetGridSize() const { From 
7873e14cf9900b4c01d3b6f06e36f843dbb7f05b Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Tue, 16 Jun 2020 17:16:15 -0700 Subject: [PATCH 0355/1390] [tf.data] Add note about performance cost of `Dataset.unbatch`. PiperOrigin-RevId: 316788559 Change-Id: I87d0d8a2be0a6d6751baa838ea8acc1eb9ee8d9a --- tensorflow/python/data/ops/dataset_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 013447dfd8d..586b82e9ca6 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -2099,6 +2099,10 @@ name=None)) >>> list(dataset.as_numpy_iterator()) [1, 2, 3, 1, 2, 1, 2, 3, 4] + Note: `unbatch` requires a data copy to slice up the batched tensor into + smaller, unbatched tensors. When optimizing performance, try to avoid + unnecessary usage of `unbatch`. + Returns: A `Dataset`. """ From cdb6e80b21997c2b24336eb524134198fa6754d8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 17:25:15 -0700 Subject: [PATCH 0356/1390] Qualify uses of std::string PiperOrigin-RevId: 316789814 Change-Id: Ice83a74e70122008e090af3b818b9920abf7f5bc --- .../toco/tensorflow_graph_matching/cluster.h | 14 +++---- .../cluster_utils.cc | 4 +- .../resolve_cluster.cc | 13 +++--- .../resolve_cluster.h | 4 +- .../tensorflow_graph_matching/resolve_svdf.cc | 40 +++++++++---------- .../tensorflow_graph_matching/resolve_svdf.h | 6 +-- .../resolve_svdf_test.cc | 24 +++++------ 7 files changed, 53 insertions(+), 52 deletions(-) diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h index af268ddd370..7dc79f17a6b 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h +++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h @@ -47,7 +47,7 @@ class Cluster { // 2- All the nodes in GraphDef which belong to this cluster. void SetGraphDefInfo(const tensorflow::GraphDef* graph_def); - const string& GetName() const { return name_; } + const std::string& GetName() const { return name_; } const std::vector>& GetNewNodes() const { return new_nodes_; @@ -55,18 +55,18 @@ class Cluster { const std::vector& GetNodes() { return nodes_; } - void SetName(const string& name) { name_ = name; } + void SetName(const std::string& name) { name_ = name; } - void SetDevice(const string& device) { device_ = device; } + void SetDevice(const std::string& device) { device_ = device; } // Find the input(s) and output(s) of this Cluster. bool FindClusterInputsAndOutputs(); protected: - string name_; - string device_; - std::vector inputs_; - std::vector outputs_; + std::string name_; + std::string device_; + std::vector inputs_; + std::vector outputs_; // Used to hold the pointers to nodes which are in this cluster. These nodes // are pointing to the nodes in graph_def_. diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc index 8a010ef8208..fb12e62d9af 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc +++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/lite/toco/toco_types.h" namespace toco { -bool StrContains(const string& x, const string& search_pattern) { - return x.find(search_pattern) != string::npos; +bool StrContains(const std::string& x, const std::string& search_pattern) { + return x.find(search_pattern) != std::string::npos; } void Transpose2DTensor(const float* tensor, int row, int col, diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc index 7a187512078..f2645e89511 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc +++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc @@ -33,7 +33,8 @@ using tensorflow::GraphDef; using tensorflow::NodeDef; void AddNodeToGraph(const NodeDef& node, - const std::vector& cluster_names, GraphDef* graph) { + const std::vector& cluster_names, + GraphDef* graph) { NodeDef* new_node = graph->add_node(); new_node->set_op(node.op()); new_node->set_name(node.name()); @@ -41,9 +42,9 @@ void AddNodeToGraph(const NodeDef& node, // If the inputs are coming from a node which belongs to another cluster, then // those inputs are renamed to the source cluster name. Otherwise the original // input name is used. - for (const string& node_input : node.input()) { + for (const std::string& node_input : node.input()) { bool input_from_cluster = false; - for (const string& cluster_name : cluster_names) { + for (const std::string& cluster_name : cluster_names) { if (StrContains(node_input, cluster_name) && !StrContains(node.name(), cluster_name)) { new_node->add_input(cluster_name); @@ -62,7 +63,7 @@ void AddNodeToGraph(const NodeDef& node, bool FindCluster(const ClusterFactoryInterface& cluster_factory, const GraphDef& graph_def, - std::unordered_map* is_node_in_cluster, + std::unordered_map* is_node_in_cluster, std::vector>* clusters) { for (const NodeDef& node : graph_def.node()) { // If the node is not assigned to any cluster, then we check if it belong to @@ -90,12 +91,12 @@ std::unique_ptr MaybeResolveClusters( std::unique_ptr pruned_graph(new GraphDef); // The structure to keep track of which cluster each node is assigned to, and // to initialize them to all un-assigned, - std::unordered_map is_node_in_cluster; + std::unordered_map is_node_in_cluster; for (const NodeDef& node : graph_def.node()) { is_node_in_cluster[node.name()] = false; } - std::vector cluster_names; + std::vector cluster_names; std::vector> all_clusters; // Find the clusters for all available cluster factories. for (const ClusterFactoryInterface* cluster_factory : cluster_factories) { diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h index d7afcced7b7..5215114fccf 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h +++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h @@ -40,7 +40,7 @@ std::unique_ptr MaybeResolveClusters( // belongs to another cluster, then those inputs are renamed to the source // cluster name. void AddNodeToGraph(const tensorflow::NodeDef& node, - const std::vector& cluster_names, + const std::vector& cluster_names, tensorflow::GraphDef* graph); // Given a graph and a cluster class, it finds all the nodes which belong to a @@ -49,7 +49,7 @@ void AddNodeToGraph(const tensorflow::NodeDef& node, // they belong to the generated clusters. 
bool FindCluster(const ClusterFactoryInterface& cluster_factory, const tensorflow::GraphDef& graph_def, - std::unordered_map* is_node_in_cluster, + std::unordered_map* is_node_in_cluster, std::vector>* clusters); // Receives a graph and generates another graph by replacing the cluster of diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc index 2f9f9a8c9b0..7d83a9dbfed 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc +++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc @@ -47,11 +47,11 @@ namespace { // Since these nodes are connected to a Concatenate node, it makes sure the // axis value input of the Concatenate operator is 0. void FilterPartitionedConstNodes( - const string& const_pattern, + const std::string& const_pattern, const std::vector& cluster_nodes, std::vector* const_node_parts) { for (const NodeDef* node : cluster_nodes) { - string node_name_to_upper = node->name(); + std::string node_name_to_upper = node->name(); std::transform(node_name_to_upper.begin(), node_name_to_upper.end(), node_name_to_upper.begin(), ::toupper); if (StrContains(node->name(), const_pattern) && node->op() == "Const") { @@ -97,7 +97,7 @@ int SvdfCluster::InferFilterRank() { } void SvdfCluster::CreateNodes() { - for (const string& const_pattern : const_node_patterns_) { + for (const std::string& const_pattern : const_node_patterns_) { CreateConstNode(const_pattern); } std::unique_ptr svdf_node(new NodeDef); @@ -110,14 +110,14 @@ void SvdfCluster::CreateNodes() { // Add the rest of the inputs to Svdf cell: weights and bias. CHECK(new_nodes_.size() == 3 || new_nodes_.size() == 2); - string* weights_feature_input = svdf_node->add_input(); - string* weights_time_input = svdf_node->add_input(); - string* bias_input; + std::string* weights_feature_input = svdf_node->add_input(); + std::string* weights_time_input = svdf_node->add_input(); + std::string* bias_input; if (new_nodes_.size() == 3) { bias_input = svdf_node->add_input(); } for (const std::unique_ptr& node : new_nodes_) { - const string node_name = node->name(); + const std::string node_name = node->name(); if (StrContains(node_name, "SVDF_weights_feature")) { *weights_feature_input = node_name; } else if (StrContains(node_name, "SVDF_weights_time")) { @@ -136,7 +136,7 @@ void SvdfCluster::CreateNodes() { CHECK_GT(rank, 0); // Add Svdf activation and rank. - string activation_function = + std::string activation_function = StrContains(outputs_[0], "Relu") ? "Relu" : "None"; (*svdf_node->mutable_attr())["ActivationFunction"].set_s(activation_function); (*svdf_node->mutable_attr())["Rank"].set_i(rank); @@ -145,7 +145,7 @@ void SvdfCluster::CreateNodes() { new_nodes_.push_back(std::move(svdf_node)); } -void SvdfCluster::CreateConstNode(const string& const_pattern) { +void SvdfCluster::CreateConstNode(const std::string& const_pattern) { // Find the nodes with pattern like: "const_pattern"/part_xxx of type Const. std::vector const_node_parts; FilterPartitionedConstNodes(const_pattern, nodes_, &const_node_parts); @@ -236,15 +236,15 @@ void SvdfCluster::MaybeMergeConstNodes( // Set the tensor attributes. allocated_tensor->set_tensor_content( - string(reinterpret_cast(transposed_tensor.get()), - allocated_content_flat_size)); + std::string(reinterpret_cast(transposed_tensor.get()), + allocated_content_flat_size)); } else { tensor_shape_dim0->set_size(dim0_size); // Set the tensor attributes. 
allocated_tensor->set_tensor_content( - string(reinterpret_cast(allocated_content.get()), - allocated_content_flat_size)); + std::string(reinterpret_cast(allocated_content.get()), + allocated_content_flat_size)); } } @@ -252,21 +252,21 @@ void SvdfCluster::MaybeMergeConstNodes( std::unique_ptr SvdfClusterFactory::CreateCluster( const NodeDef& node, const GraphDef& graph_def) const { - std::vector node_patterns = {"SVDF_weights_feature", - "SVDF_weights_time", "SVDF_bias"}; + std::vector node_patterns = {"SVDF_weights_feature", + "SVDF_weights_time", "SVDF_bias"}; - string node_name_to_upper = node.name(); + std::string node_name_to_upper = node.name(); std::transform(node_name_to_upper.begin(), node_name_to_upper.end(), node_name_to_upper.begin(), ::toupper); std::unique_ptr cluster = nullptr; - if (node_name_to_upper.find("SVDF", 0) != string::npos) { + if (node_name_to_upper.find("SVDF", 0) != std::string::npos) { size_t weights_pos = node.name().find(node_patterns[0]); - if (weights_pos != string::npos) { + if (weights_pos != std::string::npos) { // Assuming the node name has a pattern like: // "SOMESTRING1/CELLNAME/SEARCH_PATTERN/SOMESTRING2", we use // CELLNAME as the cluster name. size_t cell_pos = node.name().rfind("/", weights_pos - 2) + 1; - string cell_name = + std::string cell_name = node.name().substr(cell_pos, weights_pos - cell_pos - 1); cluster = std::unique_ptr(new SvdfCluster); cluster->SetName(cell_name); @@ -274,7 +274,7 @@ std::unique_ptr SvdfClusterFactory::CreateCluster( cluster->SetGraphDefInfo(&graph_def); CHECK(cluster->FindClusterInputsAndOutputs()); - for (const string& const_pattern : node_patterns) { + for (const std::string& const_pattern : node_patterns) { cluster->AddConstNodePattern(const_pattern); } } diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h index 649cadfa066..b5843016299 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h +++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h @@ -36,7 +36,7 @@ class SvdfCluster : public Cluster { // A helper function to set the pattern of Const nodes which CreateNodes() // should handle specially. - void AddConstNodePattern(const string& const_pattern) { + void AddConstNodePattern(const std::string& const_pattern) { const_node_patterns_.push_back(const_pattern); } @@ -46,7 +46,7 @@ class SvdfCluster : public Cluster { // The main function which is used to create Const nodes for this cluster. // These Const nodes are the inputs to the composite op generated for this // cluster. - void CreateConstNode(const string& const_pattern); + void CreateConstNode(const std::string& const_pattern); // Receives a vector of Const nodes, merge them (if necessary) and returns // only one Const node holding all the arrays contents. It transposes it if @@ -61,7 +61,7 @@ class SvdfCluster : public Cluster { // shape to [num_units, rank, batch] shape. The 2nd shape element is rank. 
int InferFilterRank(); - std::vector const_node_patterns_; + std::vector const_node_patterns_; }; class SvdfClusterFactory : public ClusterFactoryInterface { diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc index f66b59ccce6..9828b0050b6 100644 --- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc +++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc @@ -77,8 +77,8 @@ class ResolveSvdfTest : public ::testing::Test { ~ResolveSvdfTest() override {} protected: - void AddNewNode(const string& name, const string& op, - const std::vector& inputs) { + void AddNewNode(const std::string& name, const std::string& op, + const std::vector& inputs) { NodeDef* node = graph_.add_node(); node->set_name(name); node->set_op(op); @@ -89,8 +89,8 @@ class ResolveSvdfTest : public ::testing::Test { } } - void AddNewNode(const string& name, const string& op, - const std::vector& inputs, + void AddNewNode(const std::string& name, const std::string& op, + const std::vector& inputs, const std::vector& values) { NodeDef* node = graph_.add_node(); node->set_name(name); @@ -109,12 +109,12 @@ class ResolveSvdfTest : public ::testing::Test { tensor_shape_dim0->set_size(values.size()); allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape); allocated_tensor->set_tensor_content( - string(reinterpret_cast(values.data()), - values.size() * sizeof(float))); + std::string(reinterpret_cast(values.data()), + values.size() * sizeof(float))); (*node->mutable_attr())["value"].set_allocated_tensor(allocated_tensor); } - void AddShapeNode(const string& name, const std::vector& values) { + void AddShapeNode(const std::string& name, const std::vector& values) { NodeDef* node = graph_.add_node(); node->set_name(name); node->set_op("Const"); @@ -128,8 +128,8 @@ class ResolveSvdfTest : public ::testing::Test { tensor_shape_dim0->set_size(values.size()); allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape); allocated_tensor->set_tensor_content( - string(reinterpret_cast(values.data()), - values.size() * sizeof(int))); + std::string(reinterpret_cast(values.data()), + values.size() * sizeof(int))); (*node->mutable_attr())["value"].set_allocated_tensor(allocated_tensor); } @@ -157,12 +157,12 @@ TEST_F(ResolveSvdfTest, TestTranspose2DTensor) { } TEST_F(ResolveSvdfTest, TestResolveSvdfFlow) { - std::unordered_map is_node_in_cluster; + std::unordered_map is_node_in_cluster; for (const NodeDef& node : graph_.node()) { is_node_in_cluster[node.name()] = false; } - std::vector cluster_names; + std::vector cluster_names; CHECK(FindCluster(svdf_cluster_factory_, graph_, &is_node_in_cluster, &clusters_)); @@ -174,7 +174,7 @@ TEST_F(ResolveSvdfTest, TestResolveSvdfFlow) { EXPECT_THAT(cluster_names, testing::UnorderedElementsAreArray({"Svdf1", "Svdf2"})); - std::vector new_node_names; + std::vector new_node_names; std::vector content_array(3); for (const std::unique_ptr& cluster : clusters_) { // After CreateNodes in each cluster we have three nodes: Svdf, From 81cff8fc905be6d5e0489e454f9d8362e391dcad Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Tue, 16 Jun 2020 17:30:30 -0700 Subject: [PATCH 0357/1390] TFLM: Rename FinishTensorAllocation to FinishModelAllocation in the comments. The name of this function has been changed. 
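The rename below only touches comments, but the ordering those comments describe — request scratch buffers first, run static memory planning, and only then dereference the buffers — is the part worth internalizing. The toy allocator below is not TFLM's `MicroAllocator`; it is a minimal, self-contained sketch of that two-phase lifecycle, and every name and signature in it is an illustrative assumption rather than the real API.

```
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Toy two-phase arena allocator: RequestScratchBuffer() only records a
// request, FinishModelAllocation() runs the planning pass, and
// GetScratchBuffer() returns a valid pointer only after planning is done.
class ToyArenaAllocator {
 public:
  explicit ToyArenaAllocator(size_t arena_bytes) : arena_(arena_bytes) {}

  // Phase 1: record the request; no pointer is available yet.
  int RequestScratchBuffer(size_t bytes) {
    requests_.push_back(bytes);
    return static_cast<int>(requests_.size()) - 1;  // buffer index
  }

  // Phase 2: assign offsets for all recorded requests ("static planning").
  void FinishModelAllocation() {
    size_t offset = 0;
    for (size_t bytes : requests_) {
      offsets_.push_back(offset);
      offset += bytes;
    }
    assert(offset <= arena_.size());
    used_bytes_ = offset;
    finished_ = true;
  }

  // Phase 3: only valid after FinishModelAllocation().
  void* GetScratchBuffer(int buffer_idx) {
    if (!finished_) return nullptr;
    return arena_.data() + offsets_[buffer_idx];
  }

  size_t used_bytes() const { return finished_ ? used_bytes_ : 0; }

 private:
  std::vector<uint8_t> arena_;
  std::vector<size_t> requests_;
  std::vector<size_t> offsets_;
  size_t used_bytes_ = 0;
  bool finished_ = false;
};

int main() {
  ToyArenaAllocator allocator(/*arena_bytes=*/1024);
  int idx = allocator.RequestScratchBuffer(/*bytes=*/256);
  assert(allocator.GetScratchBuffer(idx) == nullptr);  // too early
  allocator.FinishModelAllocation();                   // planning happens here
  assert(allocator.GetScratchBuffer(idx) != nullptr);  // now valid
  assert(allocator.used_bytes() == 256);
  return 0;
}
```

The same ordering constraint is what the updated comments in `micro_allocator.h` state for the real allocator: the buffer pointer and `used_bytes()` are only meaningful after `FinishModelAllocation`.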
PiperOrigin-RevId: 316790532 Change-Id: I31a77d3c2e7fe5af3138c99317ccd075ecbdf4ff --- tensorflow/lite/micro/micro_allocator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 09b6567ac30..ab3f2a44d18 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -40,7 +40,7 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( // A handle tracking scratch buffer allocation. This handle is created by // `RequestScratchBufferInArena`. `data` field is populated in -// `FinishTensorAllocation` after static memory planning. +// `FinishModelAllocation` after static memory planning. // TODO(b/150257460) As a future optimization, this struct could be replaced by // a union, since once `data` is populated, `bytes` and `node_idx` is not // needed. @@ -126,7 +126,7 @@ class MicroAllocator { // Register a scratch buffer of size `bytes` for Node with `node_id`. // This method only allocates a BufferHandle holding information for memory - // planning. The buffer ptr is ready after `FinishTensorAllocation` and can + // planning. The buffer ptr is ready after `FinishModelAllocation` and can // be retrieved by `GetScratchBuffer` method using the returned buffer_idx. // Note that there should be no tail allocation between two consecutive // `RequestScratchBufferInArena` calls. @@ -136,7 +136,7 @@ class MicroAllocator { void* GetScratchBuffer(int buffer_idx) const; // Returns the arena usage in bytes, only available after - // `FinishTensorAllocation`. Otherwise, it will return 0. + // `FinishModelAllocation`. Otherwise, it will return 0. size_t used_bytes() const; protected: From 451352fd4031c9ef7be9ffae49ab59132107111e Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Tue, 16 Jun 2020 17:38:15 -0700 Subject: [PATCH 0358/1390] Adding convenience functions for calling ops needed for implementing the SavedModel C API. This change starts by adding functions to create and destroy resource variables. 
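A minimal usage sketch of the two new helpers, modeled on the `variable_ops_test.cc` added further down in this patch. It assumes a live `AbstractContextInterface*` supplied by the caller and omits everything except creation and cleanup.

```
#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h"
#include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/status.h"

namespace tensorflow {

Status CreateAndDestroyScalarVariable(AbstractContextInterface* ctx) {
  // VarHandleOp: yields a DT_RESOURCE handle for an uninitialized variable.
  AbstractTensorHandlePtr handle;
  TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable(
      ctx, DT_FLOAT, /*shape=*/{}, &handle));

  // ... assigning and reading the variable arrive in a follow-up patch ...

  // DestroyResourceOp: mirrors the cleanup a tf.Variable's
  // EagerResourceDeleter performs.
  TF_RETURN_IF_ERROR(internal::DestroyResource(ctx, handle.get()));
  return Status();
}

}  // namespace tensorflow
```

The function name `CreateAndDestroyScalarVariable` is made up for illustration; the helper signatures themselves are the ones declared in `variable_ops.h` below.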
PiperOrigin-RevId: 316791867 Change-Id: Ieba37cabcc0200e48fc0b64c980c82c2ee476e1c --- .../c/experimental/saved_model/core/ops/BUILD | 94 +++++++++++++++++++ .../core/ops/owned_eager_context.h | 54 +++++++++++ .../saved_model/core/ops/owned_eager_op.h | 42 +++++++++ .../saved_model/core/ops/owned_tensor.h | 42 +++++++++ .../core/ops/owned_tensor_handle.h | 54 +++++++++++ .../saved_model/core/ops/variable_ops.cc | 74 +++++++++++++++ .../saved_model/core/ops/variable_ops.h | 46 +++++++++ .../saved_model/core/ops/variable_ops_test.cc | 77 +++++++++++++++ 8 files changed, 483 insertions(+) create mode 100644 tensorflow/c/experimental/saved_model/core/ops/BUILD create mode 100644 tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h create mode 100644 tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h create mode 100644 tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h create mode 100644 tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h create mode 100644 tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc create mode 100644 tensorflow/c/experimental/saved_model/core/ops/variable_ops.h create mode 100644 tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD new file mode 100644 index 00000000000..b42e93c3716 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -0,0 +1,94 @@ +# This package contains written convenience helpers for Eager Operations +# used by SavedModel. Once we autogenerate C++ Eager Op wrappers, we can remove these. +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) + +package( + default_visibility = [ + # Restricting visibility for now + "//tensorflow/c/experimental/saved_model/core:__subpackages__", + "//tensorflow/c/experimental/saved_model/internal:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "owned_eager_op", + hdrs = [ + "owned_eager_op.h", + ], + deps = [ + "//tensorflow/c/eager:operation_interface", + ], +) + +cc_library( + name = "owned_tensor_handle", + hdrs = [ + "owned_tensor_handle.h", + ], + deps = [ + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/core/common_runtime/eager:tensor_handle", + ], +) + +cc_library( + name = "owned_eager_context", + hdrs = ["owned_eager_context.h"], + deps = [ + "//tensorflow/c/eager:context_interface", + "//tensorflow/core/common_runtime/eager:context", + ], +) + +cc_library( + name = "owned_tensor", + hdrs = ["owned_tensor.h"], + deps = [ + "//tensorflow/c:tensor_interface", + ], +) + +cc_library( + name = "variable_ops", + srcs = [ + "variable_ops.cc", + ], + hdrs = [ + "variable_ops.h", + ], + deps = [ + ":owned_eager_op", + ":owned_tensor_handle", + "//tensorflow/c/eager:context_interface", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "variable_ops_test", + srcs = [ + "variable_ops_test.cc", + ], + deps = [ + ":owned_eager_context", + ":owned_tensor", + ":owned_tensor_handle", + ":variable_ops", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:core_cpu_lib", + "//tensorflow/core/common_runtime/eager:context", + 
"//tensorflow/core/common_runtime/eager:core", + ], +) diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h new file mode 100644 index 00000000000..300059cd069 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ + +#include + +#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/core/common_runtime/eager/context.h" + +namespace tensorflow { +namespace internal { + +struct AbstractContextInterfaceDeleter { + void operator()(AbstractContextInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +struct EagerContextDeleter { + void operator()(EagerContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +} // namespace internal + +using AbstractContextPtr = + std::unique_ptr; + +using EagerContextPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h new file mode 100644 index 00000000000..c6b21578820 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_EAGER_OP_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_EAGER_OP_H_ + +#include + +#include "tensorflow/c/eager/operation_interface.h" + +namespace tensorflow { +namespace internal { + +struct AbstractOperationInterfaceDeleter { + void operator()(AbstractOperationInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +} // namespace internal + +using AbstractOpPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_EAGER_OP_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h new file mode 100644 index 00000000000..335d9e46c7a --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_TENSOR_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_TENSOR_H_ + +#include + +#include "tensorflow/c/tensor_interface.h" + +namespace tensorflow { +namespace internal { + +struct AbstractTensorInterfaceDeleter { + void operator()(AbstractTensorInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +} // namespace internal + +using AbstractTensorPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_TENSOR_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h new file mode 100644 index 00000000000..e98d6554afb --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ + +#include + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" + +namespace tensorflow { +namespace internal { + +struct TensorHandleDeleter { + void operator()(TensorHandle* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +struct AbstractTensorHandleDeleter { + void operator()(AbstractTensorHandleInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; + +} // namespace internal + +using TensorHandlePtr = + std::unique_ptr; + +using AbstractTensorHandlePtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc new file mode 100644 index 00000000000..94548e553ad --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -0,0 +1,74 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h" +#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace internal { + +static const char kNoSharingResourceID[] = + "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; + +Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, + DataType dtype, TensorShape shape, + AbstractTensorHandlePtr* handle) { + AbstractOpPtr varhandle_op = AbstractOpPtr(ctx->CreateOperation()); + + TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", nullptr)); + TF_RETURN_IF_ERROR(varhandle_op->SetAttrType("dtype", dtype)); + + // Note that if shape is unknown rank, shape.dim_sizes() will be empty, and + // shape.dims() will be -1. 
+ gtl::InlinedVector dim_sizes = shape.dim_sizes(); + TF_RETURN_IF_ERROR(varhandle_op->SetAttrShape( + "shape", reinterpret_cast(dim_sizes.data()), + shape.dims())); + TF_RETURN_IF_ERROR(varhandle_op->SetAttrString("container", "", 0)); + TF_RETURN_IF_ERROR(varhandle_op->SetAttrString( + "shared_name", kNoSharingResourceID, strlen(kNoSharingResourceID))); + + AbstractTensorHandleInterface* var_handle = nullptr; + int num_retvals = 1; + TF_RETURN_IF_ERROR(varhandle_op->Execute( + absl::MakeSpan(&var_handle, num_retvals), &num_retvals)); + handle->reset(var_handle); + return Status(); +} + +Status DestroyResource(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* handle) { + AbstractOpPtr destroy_op = AbstractOpPtr(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(destroy_op->Reset("DestroyResourceOp", nullptr)); + TF_RETURN_IF_ERROR(destroy_op->SetAttrBool("ignore_lookup_error", true)); + TF_RETURN_IF_ERROR(destroy_op->AddInput(handle)); + + int num_retvals = 0; + TF_RETURN_IF_ERROR(destroy_op->Execute({}, &num_retvals)); + return Status(); +} + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h new file mode 100644 index 00000000000..1c4d757af8c --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H + +#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace internal { + +// Executes a VarHandleOp using `ctx`, and fills `handle` with the DT_RESOURCE +// TensorHandle associated with the variable. This is equivalent to creating an +// unitialized TF2 tf.Variable. +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 +Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, + DataType dtype, TensorShape shape, + AbstractTensorHandlePtr* handle); + +// Executes DestroyResourceOp on `handle`, using `ctx`. 
This is equivalent to +// the cleanup that occurs in a tf.Variable's EagerResourceDeleter: +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L289-L290 +Status DestroyResource(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* handle); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc new file mode 100644 index 00000000000..7a9486f8ebd --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc @@ -0,0 +1,77 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h" + +#include + +#include "tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h" +#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h" +#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +class VariableOpsTest : public ::testing::Test { + public: + VariableOpsTest() + : device_mgr_(std::make_unique(DeviceFactory::NewDevice( + "CPU", {}, "/job:localhost/replica:0/task:0"))), + ctx_(new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, + /* async= */ false, + /* lazy_copy_function_remote_inputs= */ false, device_mgr_.get(), + /* device_mgr_owned= */ false, /* rendezvous= */ nullptr, + /* custom_kernel_creator= */ nullptr, + /* cluster_flr= */ nullptr)) {} + + EagerContext* context() { return ctx_.get(); } + + private: + std::unique_ptr device_mgr_; + EagerContextPtr ctx_; +}; + +// Sanity check for variable creation +TEST_F(VariableOpsTest, CreateVariableSuccessful) { + // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor + AbstractTensorHandlePtr handle; + TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( + context(), DT_FLOAT, {}, &handle)); + // The created TensorHandle should be a DT_Resource + EXPECT_EQ(handle->DataType(), DT_RESOURCE); +} + +// Sanity check for variable destruction +TEST_F(VariableOpsTest, DestroyVariableSuccessful) { + // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor + AbstractTensorHandlePtr handle; + TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( + context(), DT_FLOAT, {}, 
&handle)); + + // Destroy the variable + TF_EXPECT_OK(internal::DestroyResource(context(), handle.get())); +} + +} // namespace +} // namespace tensorflow From afb849aa36145b37c70f88211890f1b968dfb6da Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 16 Jun 2020 17:39:29 -0700 Subject: [PATCH 0359/1390] Fix list formatting on TextVectorization. Paren-numbered lists are not recognized on tensorflow.org PiperOrigin-RevId: 316792053 Change-Id: I72daa1bc6da92211904b77e4c5e057efc0435732 --- .../layers/preprocessing/text_vectorization.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index bff7969477c..97e3ac4a63c 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -89,23 +89,25 @@ class TextVectorization(CombinerPreprocessingLayer): to create the vocabulary. The processing of each sample contains the following steps: - 1) standardize each sample (usually lowercasing + punctuation stripping) - 2) split each sample into substrings (usually words) - 3) recombine substrings into tokens (usually ngrams) - 4) index tokens (associate a unique int value with each token) - 5) transform each sample using this index, either into a vector of ints or + + 1. standardize each sample (usually lowercasing + punctuation stripping) + 2. split each sample into substrings (usually words) + 3. recombine substrings into tokens (usually ngrams) + 4. index tokens (associate a unique int value with each token) + 5. transform each sample using this index, either into a vector of ints or a dense float vector. Some notes on passing Callables to customize splitting and normalization for this layer: - 1) Any callable can be passed to this Layer, but if you want to serialize + + 1. Any callable can be passed to this Layer, but if you want to serialize this object you should only pass functions that are registered Keras serializables (see `tf.keras.utils.register_keras_serializable` for more details). - 2) When using a custom callable for `standardize`, the data received + 2. When using a custom callable for `standardize`, the data received by the callable will be exactly as passed to this layer. The callable should return a tensor of the same shape as the input. - 3) When using a custom callable for `split`, the data received by the + 3. When using a custom callable for `split`, the data received by the callable will have the 1st dimension squeezed out - instead of `[["string to split"], ["another string to split"]]`, the Callable will see `["string to split", "another string to split"]`. The callable should From 2bab28aecc7f44f2c1a2207362bdfacc93059ca0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 17:49:15 -0700 Subject: [PATCH 0360/1390] Explicitly cast printed integer types to the primitive type so as to not implicitly rely on whether int64 is long or long long. 
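The diff below systematically replaces `%lld`/`%llu` conversions with the `<cinttypes>` macros plus an explicit cast, so the format string and the argument type always agree regardless of whether `int64` is `long` or `long long` on a given toolchain. A standalone illustration of the pattern (variable names are arbitrary):

```
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  // Stand-in for tensorflow::int64, whose underlying primitive type differs
  // across platforms/toolchains.
  int64_t bytes_in_use = 123456789;

  // Non-portable: "%lld" expects `long long`; if the typedef resolves to
  // `long`, the format specifier and argument no longer match.
  // std::printf("bytes_in_use: %lld\n", bytes_in_use);

  // Pattern applied in this patch: cast to a fixed-width primitive and let
  // <cinttypes> supply the matching conversion specifier.
  std::printf("bytes_in_use: %" PRId64 "\n",
              static_cast<int64_t>(bytes_in_use));
  return 0;
}
```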
PiperOrigin-RevId: 316793291 Change-Id: I6eb74c5d7987fd7ece10e8eef26397e6dd58cf33 --- .../common_runtime/direct_session_test.cc | 2 +- tensorflow/core/debug/bfc_dump_reader.cc | 57 +++++++++++-------- .../core/kernels/data/experimental/io_ops.cc | 3 +- .../experimental/map_and_batch_dataset_op.cc | 8 ++- .../data/experimental/snapshot_util.cc | 19 +++++-- .../profiler/internal/gpu/device_tracer.cc | 2 +- 6 files changed, 56 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 968713f8acd..eab508662e6 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -2822,7 +2822,7 @@ class StatefulOutputRequiredOp : public OpKernel { void Compute(OpKernelContext* ctx) override { // The op counts the number of outputs required in the current subgraph, // and emits that number on each of its required outputs. - Tensor count_outputs_required_t(0LL); + Tensor count_outputs_required_t(int64{0}); int64& count_outputs_required = count_outputs_required_t.scalar()(); for (int i = 0; i < num_outputs(); ++i) { if (ctx->output_required(i)) ++count_outputs_required; diff --git a/tensorflow/core/debug/bfc_dump_reader.cc b/tensorflow/core/debug/bfc_dump_reader.cc index f28eb6c428f..722315574e2 100644 --- a/tensorflow/core/debug/bfc_dump_reader.cc +++ b/tensorflow/core/debug/bfc_dump_reader.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "tensorflow/core/platform/env.h" @@ -70,17 +71,18 @@ void PrintChunk(const MemChunk& mc, const uint64 ac_offset, bool freed_at, // A size class corresponding approximately to log base 100. int size_class = floor(0.5 * log10(static_cast(mc.size()))); *cumulative_bytes += mc.size(); - printf(" %c %d %p bin=%d bytes=%llu %3.1f%%", mc.in_use() ? 'U' : 'F', + printf(" %c %d %p bin=%d bytes=%" PRIu64 " %3.1f%%", mc.in_use() ? 'U' : 'F', size_class, reinterpret_cast(mc.address()), mc.bin(), - mc.size(), + static_cast(mc.size()), 100 * (*cumulative_bytes / static_cast(total_bytes))); if (freed_at) { - printf(" freed_at=%llu", mc.freed_at_count()); + printf(" freed_at=%" PRIu64, static_cast(mc.freed_at_count())); } if (ac_offset > 0) { - printf(" age=%llu", ac_offset - mc.action_count()); + printf(" age=%" PRIu64, + static_cast(ac_offset - mc.action_count())); } else { - printf(" ac=%llu", mc.action_count()); + printf(" ac=%" PRIu64, static_cast(mc.action_count())); } // step_ids are random, so save space by showing only low 16 bits. 
printf(" step=%x op=%s\n", static_cast(0xFFFF & mc.step_id()), @@ -90,18 +92,24 @@ void PrintChunk(const MemChunk& mc, const uint64 ac_offset, bool freed_at, void PrintSummary(const MemoryDump& md) { printf("MemoryMap for allocator %s\n", md.allocator_name().c_str()); for (auto& it : md.bin_summary()) { - printf( - " Bin %2d total bytes=%10lld \tin use=%10lld \ttotal_chunks=%6lld " - "\tin_use=%6lld\n", - it.bin(), it.total_bytes_in_bin(), it.total_bytes_in_use(), - it.total_chunks_in_bin(), it.total_chunks_in_use()); + printf(" Bin %2d total bytes=%10" PRId64 " \tin use=%10" PRId64 + " \ttotal_chunks=%6" PRId64 + " " + "\tin_use=%6" PRId64 "\n", + it.bin(), static_cast(it.total_bytes_in_bin()), + static_cast(it.total_bytes_in_use()), + static_cast(it.total_chunks_in_bin()), + static_cast(it.total_chunks_in_use())); } - printf( - "Total num_allocs: %lld, bytes_in_use: %lld, peak_bytes_in_use: %lld,\n" - "largest_alloc_size: %lld, fragmentation: %f\n", - md.stats().num_allocs(), md.stats().bytes_in_use(), - md.stats().peak_bytes_in_use(), md.stats().largest_alloc_size(), - md.stats().fragmentation_metric()); + printf("Total num_allocs: %" PRId64 ", bytes_in_use: %" PRId64 + ", peak_bytes_in_use: %" PRId64 + ",\n" + "largest_alloc_size: %" PRId64 ", fragmentation: %f\n", + static_cast(md.stats().num_allocs()), + static_cast(md.stats().bytes_in_use()), + static_cast(md.stats().peak_bytes_in_use()), + static_cast(md.stats().largest_alloc_size()), + md.stats().fragmentation_metric()); } void PrintSortedChunks( @@ -125,10 +133,10 @@ void PrintSortedChunks( for (int i = 0; i < chunks.size(); ++i) { const MemChunk* c = chunks[i]; if (by_addr && i > 0 && last_end != c->address()) { - printf(" empty range from %p to %p (%lld)\n", + printf(" empty range from %p to %p (%" PRId64 ")\n", reinterpret_cast(last_end), reinterpret_cast(c->address()), - (c->address() - last_end)); + static_cast(c->address() - last_end)); } PrintChunk(*c, max_action_count, freed_at, total_bytes, &cumulative_bytes); last_end = c->address() + c->size(); @@ -182,8 +190,8 @@ void PrintChunksByOpName(const MemoryDump& md, const string& op_name, total_bytes += it.size(); } } - printf("\t%d matching Chunks of total size %llu bytes:\n", - filtered.chunk_size(), total_bytes); + printf("\t%d matching Chunks of total size %" PRIu64 " bytes:\n", + filtered.chunk_size(), static_cast(total_bytes)); PrintSortedChunks( filtered, [](const MemChunk* a, const MemChunk* b) { @@ -205,10 +213,13 @@ void PrintSizeHistory(const MemoryDump& md, bool by_age) { } for (auto& it : md.snap_shot()) { if (by_age) { - printf("\tage=%llu, size=%lld\n", max_action_count - it.action_count(), - it.size()); + printf("\tage=%" PRIu64 ", size=%" PRId64 "\n", + static_cast(max_action_count - it.action_count()), + static_cast(it.size())); } else { - printf("\tac=%llu, size=%lld\n", it.action_count(), it.size()); + printf("\tac=%" PRIu64 ", size=%" PRId64 "\n", + static_cast(it.action_count()), + static_cast(it.size())); } } } diff --git a/tensorflow/core/kernels/data/experimental/io_ops.cc b/tensorflow/core/kernels/data/experimental/io_ops.cc index d41c604bb32..112a58a7e9a 100644 --- a/tensorflow/core/kernels/data/experimental/io_ops.cc +++ b/tensorflow/core/kernels/data/experimental/io_ops.cc @@ -159,7 +159,8 @@ Status SaveDatasetOp::WriteMetadataFile(Env* env, const std::string& path, uint64 num_elements, bool finalized) { SnapshotMetadataRecord metadata; metadata.set_creation_timestamp(EnvTime::NowMicros()); - metadata.set_run_id(strings::Printf("%llu", run_id)); 
+ metadata.set_run_id( + strings::Printf("%llu", static_cast(run_id))); metadata.set_version(kFileFormatVersion); for (const auto& output_dtype : output_dtypes) { metadata.add_dtype(output_dtype); diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc index 09783161091..0cf85a58985 100644 --- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc @@ -302,9 +302,11 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { } auto result = dataset()->traceme_metadata_; result.push_back(std::make_pair( - "max_batch_results", strings::Printf("%lld", max_batch_results))); - result.push_back( - std::make_pair("parallelism", strings::Printf("%lld", parallelism))); + "max_batch_results", + strings::Printf("%lld", static_cast(max_batch_results)))); + result.push_back(std::make_pair( + "parallelism", + strings::Printf("%lld", static_cast(parallelism)))); return result; } diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util.cc b/tensorflow/core/kernels/data/experimental/snapshot_util.cc index 00269ab534a..3b051d7d572 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_util.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_util.cc @@ -50,11 +50,14 @@ namespace snapshot_util { CustomReader::kSnappyReaderOutputBufferSizeBytes; std::string HashDirectory(const std::string& path, uint64 hash) { - return io::JoinPath(path, strings::Printf("%llu", hash)); + return io::JoinPath( + path, strings::Printf("%llu", static_cast(hash))); } std::string RunDirectory(const std::string& hash_directory, uint64 run_id) { - return RunDirectory(hash_directory, strings::Printf("%llu", run_id)); + return RunDirectory( + hash_directory, + strings::Printf("%llu", static_cast(run_id))); } std::string RunDirectory(const std::string& hash_directory, @@ -63,13 +66,17 @@ std::string RunDirectory(const std::string& hash_directory, } std::string ShardDirectory(const std::string& run_directory, int64 shard_id) { - return io::JoinPath(run_directory, strings::Printf("%08llu%s", shard_id, - kShardDirectorySuffix)); + return io::JoinPath( + run_directory, + strings::Printf("%08llu%s", static_cast(shard_id), + kShardDirectorySuffix)); } std::string GetCheckpointFileName(const std::string& shard_directory, uint64 checkpoint_id) { - return io::JoinPath(shard_directory, - strings::Printf("%08llu.snapshot", checkpoint_id)); + return io::JoinPath( + shard_directory, + strings::Printf("%08llu.snapshot", + static_cast(checkpoint_id))); } Status Writer::Create(Env* env, const std::string& filename, diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 73d2a278ea4..9c3e2d67bf0 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -486,7 +486,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { // Times 2 because HBM is DDR memory; it gets two data bits per each // data lane. auto memory_bandwidth = - 2ULL * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8; + uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8; device_plane->AddStatValue( *device_plane->GetOrCreateStatMetadata( GetStatTypeStr(StatType::kDevCapMemoryBandwidth)), From ec27f797f1711adbd255e492c617d1c5ba0cb273 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 16 Jun 2020 18:03:25 -0700 Subject: [PATCH 0361/1390] fix kokoro build. PiperOrigin-RevId: 316795091 Change-Id: I5c8b121d79a0d21f0fa89ba8999f930e0d305322 --- tensorflow/core/tpu/kernels/tpu_compile_op_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h index 2c8d90643ef..5223732430a 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h @@ -153,7 +153,7 @@ class TpuCompileOpKernelCommon { int num_computations_; private: - DISALLOW_COPY_AND_ASSIGN(TpuCompileOpKernelCommon); + TF_DISALLOW_COPY_AND_ASSIGN(TpuCompileOpKernelCommon); }; } // namespace tpu From f3d3480c811480c2b966df1086828e47131eb783 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 17 Jun 2020 09:25:28 +0800 Subject: [PATCH 0362/1390] [tflite] make label_image build on linux and macOS label_image doesn't build on Linux and macOS platforms ``` bazel build --config opt //tensorflow/lite/examples/label_image:label_image ``` shows something like ``` ERROR: /home/freedom/work/tensorflow/tensorflow/lite/examples/label_image/BUILD:15:1: undeclared inclusion(s) in rule '//tensorflow/lite/examples/label_image:label_image': this rule is missing dependency declarations for the following files included by 'tensorflow/lite/examples/label_image/label_image.cc': 'external/com_google_absl/absl/strings/string_view.h' 'external/com_google_absl/absl/base/internal/throw_delegate.h' ``` Add `"@com_google_absl//absl/strings"` to deps --- tensorflow/lite/examples/label_image/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD index 01296b0b2a0..633f767c5e9 100644 --- a/tensorflow/lite/examples/label_image/BUILD +++ b/tensorflow/lite/examples/label_image/BUILD @@ -38,6 +38,7 @@ cc_binary( "//tensorflow/lite/profiling:profiler", "//tensorflow/lite/tools/evaluation:utils", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + select({ "//tensorflow:android": [ "//tensorflow/lite/delegates/gpu:delegate", From 06ff30f7ea35098cb68a231a9eb7ff3ff4be4e1e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 18:22:14 -0700 Subject: [PATCH 0363/1390] [Profiler] Enhance the memory profiler to handle very large number of snapshots. PiperOrigin-RevId: 316797391 Change-Id: Ia3c3a678241498bb480f7efdeb535d032eeb634a --- .../convert/xplane_to_memory_profile.cc | 34 ++++++++++++++----- .../convert/xplane_to_memory_profile.h | 7 ++-- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc index a0353d371d6..d039ca8da32 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc @@ -31,7 +31,6 @@ limitations under the License. 
#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/memory_profile.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" @@ -443,9 +442,23 @@ void ProcessActiveAllocations(int64 peak_bytes_profile_step_id, << memory_profile->active_allocations_size(); } +void SampleSnapshots( + int64 max_num_snapshots, + protobuf::RepeatedPtrField* snapshots) { + if (snapshots->size() <= max_num_snapshots) return; + absl::c_partial_sort( + *snapshots, snapshots->begin() + max_num_snapshots, + [](const MemoryProfileSnapshot& a, const MemoryProfileSnapshot& b) { + return a.aggregation_stats().free_memory_bytes() < + b.aggregation_stats().free_memory_bytes(); + }); + snapshots->erase(snapshots->begin() + max_num_snapshots, snapshots->end()); +} + // Post-process the memory profile to correctly update proto fields, and break // down peak memory usage for each allocator. -void ProcessMemoryProfileProto(MemoryProfile* memory_profile) { +void ProcessMemoryProfileProto(int64 max_num_snapshots, + MemoryProfile* memory_profile) { memory_profile->set_num_hosts(1); // Add sorted memory ids within memory profile data to the selection list. for (const auto& id_and_allocator_profile : @@ -460,12 +473,13 @@ void ProcessMemoryProfileProto(MemoryProfile* memory_profile) { *memory_profile->mutable_memory_profile_per_allocator()) { PerAllocatorMemoryProfile* allocator_memory_profile = &id_and_allocator_profile.second; + protobuf::RepeatedPtrField* snapshots = + allocator_memory_profile->mutable_memory_profile_snapshots(); // Sort the memory_profile_snapshots by time_offset_ps (ascending) in proto. - absl::c_sort( - *allocator_memory_profile->mutable_memory_profile_snapshots(), - [](const MemoryProfileSnapshot& a, const MemoryProfileSnapshot& b) { - return a.time_offset_ps() < b.time_offset_ps(); - }); + absl::c_sort(*snapshots, [](const MemoryProfileSnapshot& a, + const MemoryProfileSnapshot& b) { + return a.time_offset_ps() < b.time_offset_ps(); + }); UpdateStepId(memory_profile->step_count(), allocator_memory_profile); UpdateDeallocation(allocator_memory_profile); @@ -476,14 +490,16 @@ void ProcessMemoryProfileProto(MemoryProfile* memory_profile) { int64 peak_step_id = GetPeakMemoryStep(peak_bytes_profile, allocator_memory_profile); ProcessActiveAllocations(peak_step_id, allocator_memory_profile); + SampleSnapshots(max_num_snapshots, snapshots); } } } // namespace -MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane) { +MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane, + int64 max_num_snapshots) { MemoryProfile memory_profile = GenerateMemoryProfile(&host_plane); - ProcessMemoryProfileProto(&memory_profile); + ProcessMemoryProfileProto(max_num_snapshots, &memory_profile); return memory_profile; } diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.h b/tensorflow/core/profiler/convert/xplane_to_memory_profile.h index bd8a6e8df08..873ac800aa5 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.h +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.h @@ -16,14 +16,17 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_ #define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_ +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/memory_profile.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" namespace tensorflow { namespace profiler { -// Process the host threads XPlane and generate MemoryProfile result. -MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane); +// Process the host threads XPlane and generate MemoryProfile result; at most +// max_num_snapshots will be displayed on the UI. +MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane, + int64 max_num_snapshots = 1000); } // namespace profiler } // namespace tensorflow From 8d8212ea8129347603e60b053f147d6bd0d1f83a Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Tue, 16 Jun 2020 21:31:09 -0400 Subject: [PATCH 0364/1390] Update quantization_config.cc --- .../mlir/lite/quantization/quantization_config.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index cdff93502f2..08cc29dd647 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,9 +48,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (const std::string&node_min : node_mins_str.size()) { + for (int i = 0, e = node_mins_str.size(); i < e; i++) { double value; - if (!absl::SimpleAtod(node_min, &value)) { + if (!absl::SimpleAtod(node_mins_str[i], &value)) { return true; } node_mins.push_back(value); @@ -60,9 +60,9 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (const std::string&node_max : node_maxs_str.size()) { + for (int i = 0, e = node_maxs_str.size(); i < e; i++) { double value; - if (!absl::SimpleAtod(node_max, &value)) { + if (!absl::SimpleAtod(node_maxs_str[i], &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; return true; } From 9338f4da0d648cb73339c202fafbbc9376bb3fcb Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Tue, 16 Jun 2020 22:19:40 -0400 Subject: [PATCH 0365/1390] Update sqlite_query_connection.cc --- .../kernels/data/experimental/sql/sqlite_query_connection.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc index ada94be15bf..9a7eb125f95 100644 --- a/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc +++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc @@ -68,8 +68,8 @@ Status SqliteQueryConnection::GetNext(IteratorContext* ctx, Status SqliteQueryConnection::PrepareQuery() { TF_RETURN_IF_ERROR(db_->Prepare(query_, &stmt_)); - size_t column_count = stmt_.ColumnCount(); - if (column_count != output_types_.size()) { + int column_count = stmt_.ColumnCount(); + if (column_count != static_cast(output_types_.size())) { stmt_ = SqliteStatement(); return errors::InvalidArgument(tensorflow::strings::Printf( "The number of columns in query (%d) must match the number of " 
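The `sqlite_query_connection.cc` change above looks like a signed/unsigned cleanup: the column count comes from an API that reports a signed `int` (presumably `sqlite3_column_count()` under the wrapper), so the patch keeps the native `int` and casts the container size at the comparison site instead of storing the count in a `size_t`. A small standalone sketch of the idea, with placeholder names and an invented message rather than the truncated one above:

```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> output_types(3);  // placeholder for output_types_
  int column_count = 3;              // what a C API such as SQLite reports

  // Comparing the signed count against size() promotes the int to an
  // unsigned type: -Wsign-compare fires, and a negative count would wrap.
  // if (column_count != output_types.size()) { ... }

  // Pattern used in the patch: keep the API's signed int and cast the
  // container size explicitly for the comparison.
  if (column_count != static_cast<int>(output_types.size())) {
    std::printf("column count (%d) does not match output_types size (%d)\n",
                column_count, static_cast<int>(output_types.size()));
  }
  return 0;
}
```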
From 1444b6fbbd880aee31426bdbe3b2b7b66cdeac6e Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 16 Jun 2020 19:39:06 -0700 Subject: [PATCH 0366/1390] [XLA] Remove TPU-specific aliasing #ifdef which is no longer TPU-specific PiperOrigin-RevId: 316806281 Change-Id: I6acab4ed606cc123052b539a4b2b160fffe21b5b --- tensorflow/compiler/xla/tests/buffer_donation_test.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc index 9cfeefadead..18cde722a64 100644 --- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc +++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc @@ -109,9 +109,7 @@ class BufferDonationTest : public HloTestBase { << " size = " << result_root_buffer.size(); // Check for expected aliasing between input and output buffers. - // The following aliasing pattern is only ever generated by the TPU backend - // at the moment. -#if defined(XLA_TEST_BACKEND_TPU) +#ifndef XLA_TEST_BACKEND_INTERPRETER for (int i = 0; i < ShapeUtil::TupleElementCount(argument_literal.shape()); ++i) { const ShapeIndex index({i}); From 62beb0fc6651cd9628ce1f34f377a54ef97acfab Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Tue, 16 Jun 2020 19:48:23 -0700 Subject: [PATCH 0367/1390] Adding convenience functions for assigning and reading resource variables. PiperOrigin-RevId: 316807134 Change-Id: Ia37fdf8299064fe1a471eb0b34568e85164808ce --- .../saved_model/core/ops/variable_ops.cc | 30 +++++++++++++++++++ .../saved_model/core/ops/variable_ops.h | 16 ++++++++++ .../saved_model/core/ops/variable_ops_test.cc | 30 +++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index 94548e553ad..a3b3ace7be9 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -58,6 +58,36 @@ Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, return Status(); } +Status AssignVariable(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* variable_handle, + DataType dtype, AbstractTensorHandleInterface* value) { + AbstractOpPtr assign_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(assign_op->Reset("AssignVariableOp", nullptr)); + TF_RETURN_IF_ERROR(assign_op->SetAttrType("dtype", dtype)); + TF_RETURN_IF_ERROR(assign_op->AddInput(variable_handle)); + TF_RETURN_IF_ERROR(assign_op->AddInput(value)); + + int num_retvals = 0; + TF_RETURN_IF_ERROR(assign_op->Execute({}, &num_retvals)); + return Status(); +} + +Status ReadVariable(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* variable_handle, + DataType dtype, AbstractTensorHandlePtr* output) { + AbstractOpPtr read_op = AbstractOpPtr(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(read_op->Reset("ReadVariableOp", nullptr)); + TF_RETURN_IF_ERROR(read_op->SetAttrType("dtype", dtype)); + TF_RETURN_IF_ERROR(read_op->AddInput(variable_handle)); + + AbstractTensorHandleInterface* value = nullptr; + int num_retvals = 1; + TF_RETURN_IF_ERROR( + read_op->Execute(absl::MakeSpan(&value, num_retvals), &num_retvals)); + output->reset(value); + return Status(); +} + Status DestroyResource(AbstractContextInterface* ctx, AbstractTensorHandleInterface* handle) { AbstractOpPtr destroy_op = AbstractOpPtr(ctx->CreateOperation()); diff --git 
a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h index 1c4d757af8c..8a410328b9e 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -34,6 +34,22 @@ Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, DataType dtype, TensorShape shape, AbstractTensorHandlePtr* handle); +// Executes an AssignVariableOp using `ctx`, assigning the variable associated +// with `variable_handle` with `value`. `dtype` must be the datatype of the +// underlying variable for `variable_handle`. Note that it is illegal to assign +// a variable to a Tensor with a different dtype than what the variable was +// created with. +Status AssignVariable(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* variable_handle, + DataType dtype, AbstractTensorHandleInterface* value); + +// Executes a ReadVariableOp using `ctx`. This reads the underlying variable +// value of `variable_handle` and copies the value to `output`. `dtype` must be +// the dtype of the variable associated with `variable_handle`. +Status ReadVariable(AbstractContextInterface* ctx, + AbstractTensorHandleInterface* variable_handle, + DataType dtype, AbstractTensorHandlePtr* output); + // Executes DestroyResourceOp on `handle`, using `ctx`. This is equivalent to // the cleanup that occurs in a tf.Variable's EagerResourceDeleter: // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L289-L290 diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc index 7a9486f8ebd..3c57ed4d38a 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc @@ -30,6 +30,13 @@ limitations under the License. namespace tensorflow { namespace { +AbstractTensorHandlePtr CreateScalarTensorHandle(EagerContext* context, + float value) { + AbstractTensorPtr tensor(context->CreateFloatScalar(value)); + AbstractTensorHandlePtr handle(context->CreateLocalHandle(tensor.get())); + return handle; +} + class VariableOpsTest : public ::testing::Test { public: VariableOpsTest() @@ -73,5 +80,28 @@ TEST_F(VariableOpsTest, DestroyVariableSuccessful) { TF_EXPECT_OK(internal::DestroyResource(context(), handle.get())); } +// Sanity check for handle assignment and reading +TEST_F(VariableOpsTest, AssignVariableAndReadSuccessful) { + // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor + AbstractTensorHandlePtr variable; + TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( + context(), DT_FLOAT, {}, &variable)); + + // Create a Scalar float TensorHandle with value 42, and assign it to + // the variable. + AbstractTensorHandlePtr my_value = CreateScalarTensorHandle(context(), 42.0); + TF_EXPECT_OK(internal::AssignVariable(context(), variable.get(), DT_FLOAT, + my_value.get())); + + // Read back the value from the variable, and check that it is 42. 
+ AbstractTensorHandlePtr read_value_handle; + TF_EXPECT_OK(internal::ReadVariable(context(), variable.get(), DT_FLOAT, + &read_value_handle)); + Status status; + AbstractTensorPtr read_value(read_value_handle->Resolve(&status)); + TF_EXPECT_OK(status); + EXPECT_FLOAT_EQ(42.0, *static_cast(read_value->Data())); +} + } // namespace } // namespace tensorflow From 7c5ddb830f17b15708664fc4a34fe3b24f142628 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 16 Jun 2020 19:54:11 -0700 Subject: [PATCH 0368/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/4799fb63b551 PiperOrigin-RevId: 316807673 Change-Id: I246445df966079989305be15318b9b9b4cc4db3a --- tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 9e75c1b9ac5..72f3a4dfac7 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" @@ -255,7 +256,8 @@ Status DotOpEmitter::EmitLinalgMatmul() { mlir::edsc::ScopedContext scope(*builder, function.getLoc()); mlir::Value a = function.getArgument(0), b = function.getArgument(1), c = function.getArgument(2); - mlir::edsc::intrinsics::linalg_matmul(b, c, a); + mlir::edsc::intrinsics::linalg_matmul(mlir::TypeRange{}, + mlir::ValueRange{b, c, a}); mlir::edsc::intrinsics::std_ret(); }); } From d68cccb57eaa23d9af2e1e065373f084a808098b Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 16 Jun 2020 19:54:13 -0700 Subject: [PATCH 0369/1390] Removed fuse add to conv transformation. This transformation incorrect for border elements, when used Zero clamping. 
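The reasoning behind the removal: folding a preceding Add into the convolution bias assumes every kernel tap saw the added value, but with zero ("SAME") padding the padded positions contribute plain zeros, so the folded bias over-counts at the borders. The standalone program below reproduces that mismatch with an arbitrary 1-D, kernel-size-3, all-ones-weights example; every value in it is illustrative.

```
#include <cstdio>
#include <vector>

// 1-D convolution at position i: kernel size 3, stride 1, zero padding,
// all weights equal to 1, plus a scalar bias.
float Conv3At(const std::vector<float>& in, int i, float bias) {
  float sum = bias;
  for (int k = -1; k <= 1; ++k) {
    int j = i + k;
    if (j >= 0 && j < static_cast<int>(in.size())) sum += in[j];  // zeros outside
  }
  return sum;
}

int main() {
  const std::vector<float> x = {1.f, 2.f, 3.f};
  const float add_value = 10.f;

  // Unfused: apply the Add first, then convolve the shifted input.
  const std::vector<float> x_plus_a = {x[0] + add_value, x[1] + add_value,
                                       x[2] + add_value};
  // Fused: fold the Add into the bias (3 taps * weight 1 * add_value).
  const float fused_bias = 3.f * 1.f * add_value;

  for (int i = 0; i < 3; ++i) {
    float unfused = Conv3At(x_plus_a, i, /*bias=*/0.f);
    float fused = Conv3At(x, i, fused_bias);
    std::printf("i=%d unfused=%.0f fused=%.0f\n", i, unfused, fused);
  }
  // Prints: i=0 -> 23 vs 33, i=1 -> 36 vs 36, i=2 -> 25 vs 35.
  // The interior element agrees; the zero-padded border elements do not,
  // because the padded zeros never received the +10 in the fused version.
  return 0;
}
```

Fusing an Add *after* the convolution (kept by `NewMergeConvolutionWithAdd`) has no such problem, since the bias is added once per output element regardless of padding.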
PiperOrigin-RevId: 316807675 Change-Id: Iffbdcd3f5a46ecbdc8c4ad1c4f8f4393410f7be3 --- .../transformations/fuse_add_to_conv.cc | 114 ------------------ .../common/transformations/fuse_add_to_conv.h | 23 ---- .../transformations/fuse_add_to_conv_test.cc | 114 ------------------ .../general_transformations.cc | 4 +- 4 files changed, 1 insertion(+), 254 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc index b279e49e40c..adee86e4a64 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc @@ -92,66 +92,12 @@ class MergeConvolutionWithAdd : public SequenceTransformation { } }; -class MergeAddWithConvolution : public SequenceTransformation { - public: - int ExpectedSequenceLength() const final { return 2; } - - TransformResult ApplyToNodesSequence(const std::vector& sequence, - GraphFloat32* graph) final { - auto& conv_node = *sequence[1]; - auto& add_node = *sequence[0]; - if (add_node.operation.type != ToString(OperationType::ADD)) { - return {TransformStatus::SKIPPED, ""}; - } - AddAttributes add_attr = - absl::any_cast(add_node.operation.attributes); - if (!absl::holds_alternative>( - add_attr.param) && - !absl::holds_alternative(add_attr.param)) { - return {TransformStatus::DECLINED, - "This fuse applicable only for broadcast or scalar addition."}; - } - - if (conv_node.operation.type == ToString(OperationType::CONVOLUTION_2D)) { - Convolution2DAttributes* conv_attr = - absl::any_cast( - &conv_node.operation.attributes); - FuseAddWithConvolution2D(add_attr, conv_attr); - } else if (conv_node.operation.type == - ToString(OperationType::DEPTHWISE_CONVOLUTION)) { - DepthwiseConvolution2DAttributes* conv_attr = - absl::any_cast( - &conv_node.operation.attributes); - FuseAddWithDepthwiseConvolution2D(add_attr, conv_attr); - } else if (conv_node.operation.type == - ToString(OperationType::FULLY_CONNECTED)) { - FullyConnectedAttributes* conv_attr = - absl::any_cast( - &conv_node.operation.attributes); - FuseAddWithFullyConnected(add_attr, conv_attr); - } else { - return {TransformStatus::SKIPPED, ""}; - } - - absl::Status status = RemovePrecedingNode(graph, &add_node, &conv_node); - if (!status.ok()) { - return {TransformStatus::INVALID, - "Unable to remove add node after convolution: " + - std::string(status.message())}; - } - return {TransformStatus::APPLIED, ""}; - } -}; } // namespace std::unique_ptr NewMergeConvolutionWithAdd() { return absl::make_unique(); } -std::unique_ptr NewMergeAddWithConvolution() { - return absl::make_unique(); -} - void FuseConvolution2DWithAdd(const AddAttributes& add_attr, Convolution2DAttributes* attr) { FuseBiasWithAddAttributes(add_attr, attr->weights.shape.o, &attr->bias); @@ -173,65 +119,5 @@ void FuseFullyConnectedWithAdd(const AddAttributes& add_attr, FuseBiasWithAddAttributes(add_attr, attr->weights.shape.o, &attr->bias); } -void FuseAddWithConvolution2D(const AddAttributes& add_attr, - Convolution2DAttributes* attr) { - auto add = absl::get_if>(&add_attr.param); - auto add_scalar = absl::get_if(&add_attr.param); - if (attr->bias.data.empty()) { - attr->bias = MakeZeroTensor( - Linear(attr->weights.shape.o)); - } - for (int d = 0; d < attr->weights.shape.o; ++d) { - for (int s = 0; s < attr->weights.shape.i; ++s) { - const float add_value = add ? 
add->data[s] : *add_scalar; - for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) { - for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) { - const int index = attr->weights.shape.LinearIndex({{d, k_y, k_x, s}}); - attr->bias.data[d] += attr->weights.data[index] * add_value; - } - } - } - } -} - -void FuseAddWithDepthwiseConvolution2D(const AddAttributes& add_attr, - DepthwiseConvolution2DAttributes* attr) { - auto add = absl::get_if>(&add_attr.param); - auto add_scalar = absl::get_if(&add_attr.param); - if (attr->bias.data.empty()) { - attr->bias = MakeZeroTensor( - Linear(attr->weights.shape.o * attr->weights.shape.i)); - } - for (int s = 0; s < attr->weights.shape.i; ++s) { - const float add_value = add ? add->data[s] : *add_scalar; - for (int g = 0; g < attr->weights.shape.o; ++g) { - const int d = s * attr->weights.shape.o + g; - for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) { - for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) { - const int index = attr->weights.shape.LinearIndex({{g, k_y, k_x, s}}); - attr->bias.data[d] += attr->weights.data[index] * add_value; - } - } - } - } -} - -void FuseAddWithFullyConnected(const AddAttributes& add_attr, - FullyConnectedAttributes* attr) { - auto add = absl::get_if>(&add_attr.param); - auto add_scalar = absl::get_if(&add_attr.param); - if (attr->bias.data.empty()) { - attr->bias = MakeZeroTensor( - Linear(attr->weights.shape.o)); - } - for (int d = 0; d < attr->weights.shape.o; ++d) { - for (int s = 0; s < attr->weights.shape.i; ++s) { - const float add_value = add ? add->data[s] : *add_scalar; - const int index = attr->weights.shape.LinearIndex({{d, 0, 0, s}}); - attr->bias.data[d] += attr->weights.data[index] * add_value; - } - } -} - } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h index 49871a815da..85014ec177e 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h @@ -30,11 +30,6 @@ namespace gpu { // convolution. std::unique_ptr NewMergeConvolutionWithAdd(); -// Fuse Add Scalar or Add Broadcast before Convolution(Convolution2D, -// DepthWise, FullyConnected) into biases of -// convolution. -std::unique_ptr NewMergeAddWithConvolution(); - // Modify Convolution2DAttributes so that after making convolution with // modified attributes we will have the same result as convolution // with old attributes and following add operation. 
@@ -59,24 +54,6 @@ void FuseConvolutionTransposedWithAdd(const AddAttributes& add_attr, void FuseFullyConnectedWithAdd(const AddAttributes& add_attr, FullyConnectedAttributes* attr); -// Modify Convolution2DAttributes so that after making convolution with -// modified attributes we will have the same result as add operation and -// convolution with old attributes -void FuseAddWithConvolution2D(const AddAttributes& add_attr, - Convolution2DAttributes* attr); - -// Modify DepthwiseConvolution2DAttributes so that after making depth wise -// convolution with modified attributes we will have the same result as add -// operation and depth wise convolution with old attributes -void FuseAddWithDepthwiseConvolution2D(const AddAttributes& add_attr, - DepthwiseConvolution2DAttributes* attr); - -// Modify FullyConnectedAttributes so that after making fully connected -// with modified attributes we will have the same result as add operation and -// fully connected with old attributes -void FuseAddWithFullyConnected(const AddAttributes& add_attr, - FullyConnectedAttributes* attr); - } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc index 431d8167f81..53dba56ffb8 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc @@ -78,57 +78,6 @@ TEST(MergeConvolutionWithAddTest, Smoke) { graph.nodes()[0]->operation.type); } -TEST(MergeAddWithConvolutionTest, Smoke) { - GraphFloat32 graph; - auto input = graph.NewValue(); - input->tensor.shape = BHWC(1, 4, 4, 8); - - Convolution2DAttributes conv_attr; - conv_attr.padding.prepended = HW(0, 0); - conv_attr.padding.appended = HW(0, 0); - conv_attr.strides = HW(1, 1); - conv_attr.dilations = HW(1, 1); - conv_attr.weights.shape = OHWI(16, 3, 2, 8); - conv_attr.weights.data.resize(conv_attr.weights.shape.DimensionsProduct()); - conv_attr.bias.shape = Linear(16); - conv_attr.bias.data.resize(16); - - Tensor add_tensor; - add_tensor.shape = Linear(8); - add_tensor.data.resize(8); - AddAttributes add_attr; - add_attr.param = add_tensor; - - auto conv_node = graph.NewNode(); - conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D); - conv_node->operation.attributes = conv_attr; - auto add_node = graph.NewNode(); - add_node->operation.type = ToString(OperationType::ADD); - add_node->operation.attributes = add_attr; - - ASSERT_TRUE(graph.AddConsumer(add_node->id, input->id).ok()); - - Value* output; - ASSERT_TRUE(AddOutput(&graph, conv_node, &output).ok()); - output->tensor.shape = BHWC(1, 4, 4, 16); - - Value* link1; - ASSERT_TRUE(ConnectTwoNodes(&graph, add_node, conv_node, &link1).ok()); - link1->tensor.shape = BHWC(1, 4, 4, 16); - - ASSERT_EQ(2, graph.nodes().size()); - ASSERT_EQ(3, graph.values().size()); - - auto transformation = NewMergeAddWithConvolution(); - ModelTransformer transformer(&graph, nullptr); - transformer.Apply("merge_add_with_convolution", transformation.get()); - - EXPECT_EQ(1, graph.nodes().size()); - EXPECT_EQ(2, graph.values().size()); - EXPECT_EQ(ToString(OperationType::CONVOLUTION_2D), - graph.nodes()[0]->operation.type); -} - TEST(FuseAddAfterConvolution2DTest, Smoke) { Convolution2DAttributes attr; attr.weights.shape = OHWI(2, 1, 2, 2); @@ -213,69 +162,6 @@ TEST(FuseAddAfterFullyConnectedTest, Smoke) { EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), 
{1.4f, 1.9f})); } -TEST(FuseAddBeforeConvolution2DTest, Smoke) { - Convolution2DAttributes attr; - attr.weights.shape = OHWI(2, 1, 2, 2); - attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}; - attr.bias.shape = Linear(2); - attr.bias.data = {1.1f, 1.2f}; - - Tensor add_tensor; - add_tensor.shape = Linear(2); - add_tensor.data = {2.0f, 0.5f}; - AddAttributes add_attr; - add_attr.param = add_tensor; - - FuseAddWithConvolution2D(add_attr, &attr); - - EXPECT_THAT(attr.weights.data, - Pointwise(FloatNear(1e-6), - {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f})); - EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {2.2f, 4.3f})); -} - -TEST(FuseAddBeforeDepthwiseConvolution2DTest, Smoke) { - DepthwiseConvolution2DAttributes attr; - attr.weights.shape = OHWI(2, 1, 2, 2); - attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}; - attr.bias.shape = Linear(4); - attr.bias.data = {1.1f, 1.2f, 1.3f, 1.4f}; - - Tensor add_tensor; - add_tensor.shape = Linear(4); - add_tensor.data = {0.3f, 0.7f, 0.5f, 0.1f}; - AddAttributes add_attr; - add_attr.param = add_tensor; - - FuseAddWithDepthwiseConvolution2D(add_attr, &attr); - - EXPECT_THAT(attr.weights.data, - Pointwise(FloatNear(1e-6), - {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f})); - EXPECT_THAT(attr.bias.data, - Pointwise(FloatNear(1e-6), {1.22f, 1.56f, 1.72f, 2.38f})); -} - -TEST(FuseAddBeforeFullyConnectedTest, Smoke) { - FullyConnectedAttributes attr; - attr.weights.shape = OHWI(2, 1, 1, 2); - attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f}; - attr.bias.shape = Linear(2); - attr.bias.data = {1.1f, 1.2f}; - - Tensor add_tensor; - add_tensor.shape = Linear(2); - add_tensor.data = {0.5f, 2.0f}; - AddAttributes add_attr; - add_attr.param = add_tensor; - - FuseAddWithFullyConnected(add_attr, &attr); - - EXPECT_THAT(attr.weights.data, - Pointwise(FloatNear(1e-6), {0.1f, 0.2f, 0.3f, 0.4f})); - EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.55f, 2.15f})); -} - } // namespace } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc b/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc index 354fbcd040b..f9ae7f41f8f 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc @@ -54,9 +54,7 @@ bool ApplyGeneralTransformations(ModelTransformer* transformer) { transformer->Apply("merge_convolution_with_add", NewMergeConvolutionWithAdd().get()) && transformer->Apply("merge_mul_with_convolution", - NewMergeMulWithConvolution().get()) && - transformer->Apply("merge_add_with_convolution", - NewMergeAddWithConvolution().get()); + NewMergeMulWithConvolution().get()); } } // namespace gpu From 2b9a7d5e40aa85e84f7d6196d3e7527675d33038 Mon Sep 17 00:00:00 2001 From: peng Date: Wed, 17 Jun 2020 11:21:12 +0800 Subject: [PATCH 0370/1390] skip if no cuda --- tensorflow/core/grappler/optimizers/remapper_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 831aa51ee72..a947210d8a6 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -452,6 +452,9 @@ TEST_F(RemapperTest, FuseMatMulWithBias) { TEST_F(RemapperTest, FuseConv2DWithBiasAndActivationOnGPU) { using ::tensorflow::ops::Placeholder; +#if !(GOOGLE_CUDA) + 
GTEST_SKIP() << "No CUDA, skip FuseConv2DWithBiasAndActivation on GPU"; +#endif // !GOOGLE_CUDA tensorflow::Scope s = tensorflow::Scope::NewRootScope(); auto input_shape = Placeholder::Shape({8, 32, 32, 3}); From 9bf28d66871931c16ba8269873918a914bc3efe7 Mon Sep 17 00:00:00 2001 From: peng Date: Wed, 17 Jun 2020 11:24:34 +0800 Subject: [PATCH 0371/1390] delete empty lines --- tensorflow/core/grappler/optimizers/remapper.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index b738a23de23..a6ddad8dfc8 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1667,7 +1667,6 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) { return false; }; - #ifdef INTEL_MKL return is_batch_norm_candidate() || is_batch_norm_fusion_candidate() || IsConv2DWithAdd(ctx, node_index); @@ -1817,7 +1816,6 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, } #endif // !INTEL_MKL - // Remap FusedBatchNorm++ into the _FusedBatchNormEx. FusedBatchNormEx fused_batch_norm_ex; if (allow_non_differentiable_rewrites && From dd417ca4477794c1dc0bc3df5b818b420417b4bf Mon Sep 17 00:00:00 2001 From: peng Date: Wed, 17 Jun 2020 11:28:25 +0800 Subject: [PATCH 0372/1390] delete empty lines --- tensorflow/core/grappler/optimizers/remapper.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index a6ddad8dfc8..b1158813345 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1783,7 +1783,6 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, // Remove this once TF-MKL supports _FusedConv2D with these operations. #ifndef INTEL_MKL // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze. - ContractionWithSqueezeAndBiasAdd contract_with_squeeze_and_bias; if (allow_non_differentiable_rewrites && FindConv2DWithSqueezeAndBias(ctx, i, &contract_with_squeeze_and_bias)) { From 1890578d7a576cd18e2044c330b1286efaf34eee Mon Sep 17 00:00:00 2001 From: Steenu Johnson Date: Wed, 17 Jun 2020 09:18:43 +0530 Subject: [PATCH 0373/1390] Fixing build errors. 
api_test api_compatibility_test Signed-off-by: Steenu Johnson --- tensorflow/core/api_def/base_api/api_def_CSVDatasetV2.pbtxt | 4 ++++ tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 4 ++++ .../golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt | 2 +- tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_CSVDatasetV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_CSVDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_CSVDatasetV2.pbtxt new file mode 100644 index 00000000000..65b6c660947 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_CSVDatasetV2.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "CSVDatasetV2" + visibility: HIDDEN +} \ No newline at end of file diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index a5fe83e713e..96e1122f1e5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -632,6 +632,10 @@ tf_module { name: "CSVDataset" argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "CSVDatasetV2" + argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'exclude_cols\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "CTCBeamSearchDecoder" argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt index 41865c9700f..21d53a27e74 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -12,7 +12,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filenames\', \'record_defaults\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \',\', \'True\', \'\', \'None\'], " + argspec: "args=[\'self\', \'filenames\', \'record_defaults\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'exclude_cols\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \',\', \'True\', \'\', \'None\', \'None\'], " } member_method { name: "apply" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index a5fe83e713e..96e1122f1e5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -632,6 +632,10 @@ tf_module { name: "CSVDataset" argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', 
\'record_defaults\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "CSVDatasetV2" + argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'exclude_cols\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "CTCBeamSearchDecoder" argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " From 806e85998ab1cfbb8a969e67f1272f31604ea5f0 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 17 Jun 2020 00:14:26 -0700 Subject: [PATCH 0374/1390] Bump open source llvm revision to 4799fb63b5513f655ca8e85416ec8fe35df49bae PiperOrigin-RevId: 316836386 Change-Id: I26fa7c031f5057f1db5e3b26db09a02ff79f4081 --- tensorflow/workspace.bzl | 4 ++-- third_party/llvm/llvm.autogenerated.BUILD | 22 ---------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index acba1598d1f..cb1ea721fb0 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "9b72b47ed63351ee5ceff4c44ccd9a71dc7dad27" - LLVM_SHA256 = "03ce1e00901936e7259c6ee465773b7f231ca1724925460b71909868f5a61e11" + LLVM_COMMIT = "4799fb63b5513f655ca8e85416ec8fe35df49bae" + LLVM_SHA256 = "f401a61bd7f5b05bd8a3ffdfb1f32e9379cae2c8e988f3ae6772b588ad97c84a" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index d05fa841420..50b71a3686f 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -2753,27 +2753,6 @@ cc_library( ], ) -cc_library( - name = "MLPolicies", - srcs = glob([ - "lib/Analysis/ML/*.c", - "lib/Analysis/ML/*.cpp", - "lib/Analysis/ML/*.inc", - "lib/Analysis/ML/*.h", - ]), - hdrs = glob([ - "include/llvm/Analysis/ML/*.h", - "include/llvm/Analysis/ML/*.def", - "include/llvm/Analysis/ML/*.inc", - ]), - copts = llvm_copts, - deps = [ - ":Core", - ":Support", - ":config", - ], -) - cc_library( name = "MSP430AsmParser", srcs = glob([ @@ -3256,7 +3235,6 @@ cc_library( ":IPO", ":InstCombine", ":Instrumentation", - ":MLPolicies", ":Scalar", ":Support", ":Target", From af926984871a130eec2816815cfc98a362d4f5b6 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Wed, 17 Jun 2020 00:44:26 -0700 Subject: [PATCH 0375/1390] [TF:XLA] Update TF:XLA tests for matrix_triangular_solve to test V1 and V2. TF:V1 raises an error on non-square coefficient matrices TF:V2 allows non-square coefficient matrices. 
PiperOrigin-RevId: 316839892 Change-Id: I34c2567ba3579c8f0fd4bc6da57abe14bc6471b2 --- .../tests/matrix_triangular_solve_op_test.py | 12 +++++------- .../tf2xla/kernels/matrix_triangular_solve_op.cc | 8 ++++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py index b07b254c600..0202c582ef3 100644 --- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -135,18 +135,16 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): self._VerifyTriangularSolve( a.astype(np.float32), b.astype(np.float32), True, False, 1e-4) - @test_util.run_deprecated_v1 - def testNonSquareCoefficientMatrixV1(self): + def testNonSquareCoefficientMatrix(self): rng = np.random.RandomState(0) for dtype in self.float_types: a = rng.randn(3, 4).astype(dtype) b = rng.randn(4, 4).astype(dtype) - with self.assertRaises(ValueError): - linalg_ops.matrix_triangular_solve(a, b) - with self.assertRaises(ValueError): - linalg_ops.matrix_triangular_solve(a, b) + with self.test_scope(): + with self.assertRaises((ValueError, errors.InvalidArgumentError)): + linalg_ops.matrix_triangular_solve(a, b) - @test_util.run_v2_only + @test_util.run_v2_only # Different error types def testWrongDimensionsV2(self): randn = np.random.RandomState(0).randn for dtype in self.float_types: diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc index 5a719484e05..8d222d947c9 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc @@ -50,6 +50,14 @@ class MatrixTriangularSolveOp : public XlaOpKernel { return; } + auto lhs_size = lhs_shape.dims(); + OP_REQUIRES( + ctx, + lhs_shape.dim_size(lhs_size - 1) == lhs_shape.dim_size(lhs_size - 2), + errors::InvalidArgument("The coefficient matrix must be square in " + "the inner-most two dimensions: ", + lhs_shape.DebugString())); + xla::XlaOp a = ctx->Input(0); xla::XlaOp b = ctx->Input(1); std::tie(a, b) = Broadcast(a, lhs_shape, b, rhs_shape, bcast); From 02aad07710422ccb12a1d63093cb34c37e80e538 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Wed, 17 Jun 2020 00:55:02 -0700 Subject: [PATCH 0376/1390] Remove left-over declaration for buffer assignment pass that has since been removed. PiperOrigin-RevId: 316841221 Change-Id: I9c38becf498cfaaa7edb9490948890d52e49e737 --- .../compiler/mlir/xla/transforms/passes.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index f0c2d9b7372..a2af8124786 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -115,24 +115,6 @@ std::unique_ptr createLhloCopyRemovalPass(); std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); } // namespace xla_lhlo - -namespace xla { - -/// Moves alloc nodes (and their associated dealloc nodes - if any) into the -/// right positions. If there is no associated dealloc node for a given alloc -/// node, this pass will automatically insert a proper dealloc node in the right -/// place. The intended use case of this pass is to store SSA values into -/// buffers using load/store operations. 
For this purpose, you need to know -/// proper positions to place the required allocs and deallocs. -/// 1) Note that the function signatures and all types for which buffers should -/// be allocated need to be converted in advance. -/// 2) All required alloc nodes have the be inserted in advance. -/// 3) Note that the current implementation does not support loops. -/// Refer to the class mlir::xla::BufferAssignmentLegalizer for more -/// information. -std::unique_ptr> createBufferAssignmentPass(); - -} // namespace xla } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ From de907d8746c0f8909d50e8d699ff09f6248408a1 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Wed, 17 Jun 2020 01:20:35 -0700 Subject: [PATCH 0377/1390] Enable XLA:CPU fast math for min/max by default to be similar to TF's behavior. Another big change here is changing the use of this flag to use the value in the HloModule and not the global environment variable which was bad temporary behavior. PiperOrigin-RevId: 316844057 Change-Id: I995715ccc9009e9845fca77060b835fdc50fb4d2 --- tensorflow/compiler/xla/debug_options_flags.cc | 2 +- .../compiler/xla/service/cpu/elemental_ir_emitter.h | 4 ++++ .../compiler/xla/service/cpu/llvm_ir_runtime.cc | 4 +++- .../xla/service/cpu/vector_support_library.cc | 5 +++-- .../xla/service/cpu/vector_support_library.h | 8 +++++--- .../compiler/xla/service/elemental_ir_emitter.cc | 4 ++-- .../compiler/xla/service/elemental_ir_emitter.h | 2 ++ .../compiler/xla/service/gpu/elemental_ir_emitter.h | 4 ++++ tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 12 ++++-------- tensorflow/compiler/xla/service/llvm_ir/llvm_util.h | 4 ++-- tensorflow/compiler/xla/service/llvm_ir/math_ops.cc | 8 ++++++-- tensorflow/python/ops/nn_test.py | 2 ++ 12 files changed, 38 insertions(+), 21 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 81655101701..8ca6e2b294c 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -64,7 +64,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_fast_math_honor_division(true); // By default, copy TF's Eigen style min_max behavior with nans. - opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_cpu_enable_fast_min_max(true); opts.set_xla_gpu_enable_fast_min_max(true); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index 5c9f6677ab3..4c3167e16d9 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -50,6 +50,10 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { return ir_emitter_->EmitThreadLocalCall(callee, parameters, name); } + bool fast_min_max() override { + return hlo_module_config_.debug_options().xla_cpu_enable_fast_min_max(); + } + IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index f62769cc615..8d9229c1223 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -318,7 +318,9 @@ llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input, llvm::Value* is_pos_inf_mask = vsl.FCmpEQMask(input, pos_inf); // Cut off denormalized stuff. 
- llvm::Value* tmp0 = vsl.Max(min_norm_pos, input); + // Always allow fast max because we are checking for the nan above. + llvm::Value* tmp0 = + vsl.Max(min_norm_pos, input, /*enable_fast_min_max=*/true); // VectorSupportLibrary (intentionally) can't juggle more than one type at a // time so drop down to IRBuilder for this bit. diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index b15ad1e162d..0d2eab9fd42 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -80,10 +80,11 @@ llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) { return b()->CreateFSub(lhs, rhs); } -llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) { +llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs, + bool enable_fast_min_max) { AssertCorrectTypes({lhs, rhs}); if (scalar_type_->isFloatingPointTy()) { - return llvm_ir::EmitFloatMax(lhs, rhs, b_); + return llvm_ir::EmitFloatMax(lhs, rhs, b_, enable_fast_min_max); } else { LOG(FATAL) << "Max for integers is unimplemented"; } diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index cbbc4d7bf34..f1a0b0a4406 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -78,9 +78,11 @@ class VectorSupportLibrary { llvm::Value* Sub(llvm::Value* lhs, const llvm::APFloat& rhs) { return Sub(lhs, GetConstantFloat(lhs->getType(), rhs)); } - llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs); - llvm::Value* Max(const llvm::APFloat& lhs, llvm::Value* rhs) { - return Max(GetConstantFloat(rhs->getType(), lhs), rhs); + llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs, + bool enable_fast_min_max); + llvm::Value* Max(const llvm::APFloat& lhs, llvm::Value* rhs, + bool enable_fast_min_max) { + return Max(GetConstantFloat(rhs->getType(), lhs), rhs, enable_fast_min_max); } llvm::Value* Div(llvm::Value* lhs, llvm::Value* rhs); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index e4097b0c06f..4b6c30cadc4 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -1313,12 +1313,12 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value) { - return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_); + return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_, fast_min_max()); } llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value) { - return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_); + return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_, fast_min_max()); } StatusOr ElementalIrEmitter::EmitLog(PrimitiveType prim_type, diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index e39d2dd99ec..365e3f56b85 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -245,6 +245,8 @@ class ElementalIrEmitter : public IrBuilderMixin { std::vector initial_value_generators, const llvm_ir::IrArray::Index& index); + virtual bool fast_min_max() = 0; + llvm::IRBuilder<>* 
const b_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index a3056b1ddad..766a4c84df5 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -96,6 +96,10 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Value* EmitThreadId() override; + bool fast_min_max() override { + return hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max(); + } + private: // Emits IR for op, which must have opcode kPower. StatusOr EmitPowerOp(const HloInstruction* op, diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index e4ca08f972b..b01ae2efe43 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -91,10 +91,8 @@ llvm::CallInst* EmitCallToIntrinsic( } llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b) { - // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. - if (b->getFastMathFlags().noNaNs() || - GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { + llvm::IRBuilder<>* b, bool enable_fast_min_max) { + if (b->getFastMathFlags().noNaNs() || enable_fast_min_max) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -106,10 +104,8 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, } llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b) { - // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. - if (b->getFastMathFlags().noNaNs() || - GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { + llvm::IRBuilder<>* b, bool enable_fast_min_max) { + if (b->getFastMathFlags().noNaNs() || enable_fast_min_max) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 691898011ed..642965b6470 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -108,12 +108,12 @@ llvm::CallInst* EmitCallToIntrinsic( // Emit float max. Emit maxnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b); + llvm::IRBuilder<>* b, bool enable_fast_min_max); // Emit float min. Emit minnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b); + llvm::IRBuilder<>* b, bool enable_fast_min_max); // Convenience methods for emitting a GEP instruction that indexes into a buffer // (1-dimensional array), equivalent to array[index]. The type is automatically diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc index 333a2e8f612..0604cb848d2 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc @@ -31,9 +31,13 @@ llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input) { b->CreateFCmpOLT(abs_x, llvm::ConstantFP::get(type, kCanUseApprox)); // Clamp the input to [-9, 9]. 
+ // + // To simplify the code base until it's an issue, don't have a slow min/max in + // this approximation. llvm::Value* input_clamped = llvm_ir::EmitFloatMin( - llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b), - llvm::ConstantFP::get(type, 9.0), b); + llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b, + /*enable_fast_min_max=*/true), + llvm::ConstantFP::get(type, 9.0), b, /*enable_fast_min_max=*/true); static constexpr std::array numerator_coeffs{ -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 6ecd1e015d2..bfe11b63eea 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -1004,6 +1004,8 @@ class ReluTest(test_lib.TestCase): z = self.evaluate(nn_ops.relu(constant_op.constant(x))) self.assertAllEqual(y, z) + @test_util.disable_xla( + "This test relies on undefined behavior that XLA does not replicate") @test_util.run_deprecated_v1 def testNaNs(self): # Test that relu(nan) = nan for various sizes. From b0199879f09e4bd0068e627e1a43384dcff34983 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Wed, 17 Jun 2020 01:26:51 -0700 Subject: [PATCH 0378/1390] Add lowering for TanhOp that uses an approximation instead of lowering to intrinsics. The same approximation is used by the XLA compiler. PiperOrigin-RevId: 316844625 Change-Id: I1a909bd063509491a6a58ae0acc3bfb919cb34d5 --- tensorflow/compiler/mlir/xla/BUILD | 19 ++ .../tests/legalize_tanh_to_approximation.mlir | 134 ++++++++++++++ .../legalize_tanh_to_approximation.cc | 167 ++++++++++++++++++ .../compiler/mlir/xla/transforms/passes.h | 7 + .../compiler/mlir/xla/transforms/rewriters.h | 8 + 5 files changed, 335 insertions(+) create mode 100644 tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir create mode 100644 tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 43458aab2d3..d089f80d571 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -515,6 +515,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_legalize_tanh_to_approximation", + srcs = ["transforms/legalize_tanh_to_approximation.cc"], + hdrs = [ + "transforms/passes.h", + "transforms/rewriters.h", + ], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + gentbl( name = "xla_lower_complex_inc_gen", tbl_outs = [ @@ -946,6 +964,7 @@ cc_library( ":xla_hlo_fusion", ":xla_hlo_to_lhlo_with_xla", ":xla_legalize_control_flow", + ":xla_legalize_tanh_to_approximation", ":xla_legalize_tf", ":xla_legalize_tf_with_tf2xla", ":xla_legalize_to_linalg", diff --git a/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir b/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir new file mode 100644 index 00000000000..a8286c9b5a9 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir @@ -0,0 +1,134 @@ +// RUN: xla-opt -xla-legalize-tanh-to-approximation -split-input-file %s | FileCheck %s + +func @tanh_f64(%arg0 : f64) -> f64 { + %res = tanh %arg0 : f64 + return %res : f64 +} + +// CHECK-LABEL: @tanh_f64 +// CHECK: tanh + +// ----- + +func @tanh_f32(%arg0 : f32) -> f32 { + %res = tanh %arg0 : f32 + 
return %res : f32 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// CHECK: module { + +// CHECK-LABEL: func @tanh_f32( +// CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { +// CHECK: %[[VAL_1:.*]] = constant 2.000000e+01 : f32 +// CHECK: %[[VAL_2:.*]] = constant 1.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_4:.*]] = constant 9.000000e+00 : f32 +// CHECK: %[[VAL_5:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_6:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_7:.*]] = constant -8.60467184E-11 : f32 +// CHECK: %[[VAL_8:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_9:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_10:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_11:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_13:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_15:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_16:.*]] = absf %[[VAL_0]] : f32 +// CHECK: %[[VAL_17:.*]] = copysign %[[VAL_2]], %[[VAL_0]] : f32 +// CHECK: %[[VAL_18:.*]] = cmpf "ult", %[[VAL_16]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_19:.*]] = cmpf "olt", %[[VAL_16]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_20:.*]] = cmpf "ule", %[[VAL_16]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_21:.*]] = copysign %[[VAL_4]], %[[VAL_0]] : f32 +// CHECK: %[[VAL_22:.*]] = select %[[VAL_20]], %[[VAL_0]], %[[VAL_21]] : f32 +// CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_22]] : f32 +// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_23]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_25:.*]] = addf %[[VAL_24]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_26:.*]] = mulf %[[VAL_23]], %[[VAL_25]] : f32 +// CHECK: %[[VAL_27:.*]] = addf %[[VAL_26]], %[[VAL_7]] : f32 +// CHECK: %[[VAL_28:.*]] = mulf %[[VAL_23]], %[[VAL_27]] : f32 +// CHECK: %[[VAL_29:.*]] = addf %[[VAL_28]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_30:.*]] = mulf %[[VAL_23]], %[[VAL_29]] : f32 +// CHECK: %[[VAL_31:.*]] = addf %[[VAL_30]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_32:.*]] = mulf %[[VAL_23]], %[[VAL_31]] : f32 +// CHECK: %[[VAL_33:.*]] = addf %[[VAL_32]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_34:.*]] = mulf %[[VAL_23]], %[[VAL_33]] : f32 +// CHECK: %[[VAL_35:.*]] = addf %[[VAL_34]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_36:.*]] = mulf %[[VAL_22]], %[[VAL_35]] : f32 +// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_23]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_38:.*]] = addf %[[VAL_37]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_39:.*]] = mulf %[[VAL_23]], %[[VAL_38]] : f32 +// CHECK: %[[VAL_40:.*]] = addf %[[VAL_39]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_41:.*]] = mulf %[[VAL_23]], %[[VAL_40]] : f32 +// CHECK: %[[VAL_42:.*]] = addf %[[VAL_41]], %[[VAL_15]] : f32 +// CHECK: %[[VAL_43:.*]] = divf %[[VAL_36]], %[[VAL_42]] : f32 +// CHECK: %[[VAL_44:.*]] = select %[[VAL_19]], %[[VAL_0]], %[[VAL_43]] : f32 +// CHECK: %[[VAL_45:.*]] = select %[[VAL_18]], %[[VAL_44]], %[[VAL_17]] : f32 +// CHECK: return %[[VAL_45]] : f32 +// CHECK: } +// CHECK: } + +// ----- + +func @tanh_f16(%arg0 : f16) -> f16 { + %res = tanh %arg0 : f16 + return %res : f16 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// CHECK: module { + +// CHECK-LABEL: func @tanh_f16( +// CHECK-SAME: %[[VAL_0:.*]]: f16) -> f16 { +// CHECK: %[[VAL_1:.*]] = constant 2.000000e+01 : f32 +// CHECK: %[[VAL_2:.*]] = constant 1.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_4:.*]] = 
constant 9.000000e+00 : f32 +// CHECK: %[[VAL_5:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_6:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_7:.*]] = constant -8.60467184E-11 : f32 +// CHECK: %[[VAL_8:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_9:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_10:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_11:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_13:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_15:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_16:.*]] = fpext %[[VAL_0]] : f16 to f32 +// CHECK: %[[VAL_17:.*]] = absf %[[VAL_16]] : f32 +// CHECK: %[[VAL_18:.*]] = copysign %[[VAL_2]], %[[VAL_16]] : f32 +// CHECK: %[[VAL_19:.*]] = cmpf "ult", %[[VAL_17]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_20:.*]] = cmpf "olt", %[[VAL_17]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_21:.*]] = cmpf "ule", %[[VAL_17]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_22:.*]] = copysign %[[VAL_4]], %[[VAL_16]] : f32 +// CHECK: %[[VAL_23:.*]] = select %[[VAL_21]], %[[VAL_16]], %[[VAL_22]] : f32 +// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_23]], %[[VAL_23]] : f32 +// CHECK: %[[VAL_25:.*]] = mulf %[[VAL_24]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_26:.*]] = addf %[[VAL_25]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_27:.*]] = mulf %[[VAL_24]], %[[VAL_26]] : f32 +// CHECK: %[[VAL_28:.*]] = addf %[[VAL_27]], %[[VAL_7]] : f32 +// CHECK: %[[VAL_29:.*]] = mulf %[[VAL_24]], %[[VAL_28]] : f32 +// CHECK: %[[VAL_30:.*]] = addf %[[VAL_29]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_31:.*]] = mulf %[[VAL_24]], %[[VAL_30]] : f32 +// CHECK: %[[VAL_32:.*]] = addf %[[VAL_31]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_33:.*]] = mulf %[[VAL_24]], %[[VAL_32]] : f32 +// CHECK: %[[VAL_34:.*]] = addf %[[VAL_33]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_24]], %[[VAL_34]] : f32 +// CHECK: %[[VAL_36:.*]] = addf %[[VAL_35]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_23]], %[[VAL_36]] : f32 +// CHECK: %[[VAL_38:.*]] = mulf %[[VAL_24]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_39:.*]] = addf %[[VAL_38]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_40:.*]] = mulf %[[VAL_24]], %[[VAL_39]] : f32 +// CHECK: %[[VAL_41:.*]] = addf %[[VAL_40]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_42:.*]] = mulf %[[VAL_24]], %[[VAL_41]] : f32 +// CHECK: %[[VAL_43:.*]] = addf %[[VAL_42]], %[[VAL_15]] : f32 +// CHECK: %[[VAL_44:.*]] = divf %[[VAL_37]], %[[VAL_43]] : f32 +// CHECK: %[[VAL_45:.*]] = select %[[VAL_20]], %[[VAL_16]], %[[VAL_44]] : f32 +// CHECK: %[[VAL_46:.*]] = select %[[VAL_19]], %[[VAL_45]], %[[VAL_18]] : f32 +// CHECK: %[[VAL_47:.*]] = fptrunc %[[VAL_46]] : f32 to f16 +// CHECK: return %[[VAL_47]] : f16 +// CHECK: } +// CHECK: } + diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc new file mode 100644 index 00000000000..9696db377da --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for lowering the tanh standard ops to an +// approximation. + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla { +namespace { + +/// Emits the fast tanh approximation that is also used by XLA. +static Value EmitTanhApproximation(Value input, Value abs_value, Location loc, + PatternRewriter &rewriter) { + // For small values of x, we can approximate tanh(x)=x. For extremely small + // values of x (|x| < 1e-37), the other approximation would evaluate + // tanh(x) = 0. + constexpr float kCanUseApprox = 0.0004; + Value can_use_approx = + rewriter.create(loc, rewriter.getF32FloatAttr(kCanUseApprox)); + Value return_input = rewriter.create(loc, CmpFPredicate::OLT, + abs_value, can_use_approx); + + // Clamp the input to [-9, 9]. + Value plus_nine = + rewriter.create(loc, rewriter.getF32FloatAttr(9.0)); + Value smaller_than_nine = + rewriter.create(loc, CmpFPredicate::ULE, abs_value, plus_nine); + Value input_clamped = rewriter.create( + loc, smaller_than_nine, input, + rewriter.create(loc, plus_nine, input)); + + static constexpr std::array numerator_coeffs{ + -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, + 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, + 4.89352455891786e-03f}; + + static constexpr std::array denominator_coeffs{ + 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, + 4.89352518554385e-03f}; + + Value input_squared = + rewriter.create(loc, input_clamped, input_clamped); + Value numerator = rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[0])); + for (int i = 1; i < numerator_coeffs.size(); i++) { + numerator = rewriter.create( + loc, rewriter.create(loc, input_squared, numerator), + rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[i]))); + } + + numerator = rewriter.create(loc, input_clamped, numerator); + + Value denominator = rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[0])); + for (int i = 1; i < denominator_coeffs.size(); i++) { + denominator = rewriter.create( + loc, rewriter.create(loc, input_squared, denominator), + rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[i]))); + } + + Value approx = rewriter.create(loc, numerator, denominator); + + return rewriter.create(loc, return_input, input, approx); +} + +class ApproximateTanhLowering : public OpRewritePattern { + public: + explicit ApproximateTanhLowering(MLIRContext *ctx) + : OpRewritePattern(ctx, 100) {} + + LogicalResult matchAndRewrite(TanhOp tanhOp, + PatternRewriter &rewriter) const override { + Type operand_type = tanhOp.getType(); + + if (operand_type.isF64()) { + // Similar to XLA, do not rewrite f64 as precision might matter. 
+ return failure(); + } + + Location loc = tanhOp.getLoc(); + Value input = tanhOp.operand(); + if (operand_type.isF16()) { + input = rewriter.create(loc, input, rewriter.getF32Type()); + } + + // If we still do not have f32, fail. + if (!input.getType().isF32()) { + return failure(); + } + + // For |operand| > 20.0, we just return -1/1. + constexpr double kMaxValue = 20.0; + Value max_value = + rewriter.create(loc, rewriter.getF32FloatAttr(kMaxValue)); + Value abs_value = rewriter.create(loc, input); + + Value one = rewriter.create(loc, rewriter.getF32FloatAttr(1.0)); + Value one_with_sign = rewriter.create(loc, one, input); + + Value smaller_than_twenty = + rewriter.create(loc, CmpFPredicate::ULT, abs_value, max_value); + + // Otherwise, we use the approximation. + Value approx = EmitTanhApproximation(input, abs_value, loc, rewriter); + + Value result = rewriter.create(loc, smaller_than_twenty, approx, + one_with_sign); + + // Truncate back if needed. + if (operand_type.isF16()) { + result = rewriter.create(loc, result, rewriter.getF16Type()); + } + + rewriter.replaceOp(tanhOp, {result}); + return success(); + } +}; + +struct LegalizeTanhToApproximation + : public PassWrapper { + /// Perform the lowering of standard dialect operations to approximations. + void runOnFunction() override { + OwningRewritePatternList patterns; + PopulateTanhToApproximationPatterns(&getContext(), &patterns); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + +} // anonymous namespace + +std::unique_ptr> +createLegalizeTanhToApproximationPass() { + return std::make_unique(); +} + +void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert(context); +} + +static PassRegistration legalize_pass( + "xla-legalize-tanh-to-approximation", + "Legalize tanh from standard dialect to an approximation"); + +} // namespace xla +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index a2af8124786..3db0bc3b474 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -115,6 +115,13 @@ std::unique_ptr createLhloCopyRemovalPass(); std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); } // namespace xla_lhlo + +namespace xla { + +/// Lowers the standard TanhOp to an approximation that does not use intrinsics. +std::unique_ptr> createLegalizeTanhToApproximationPass(); + +} // namespace xla } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 59347198fe4..7303b87be75 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -91,6 +91,14 @@ void PopulateLegalizeChloToHloPatterns(MLIRContext *context, } // namespace xla_chlo +namespace xla { + +// Populates a pattern that translates the standard TanhOp to an approximation +// that does not use intrinsics. +void PopulateTanhToApproximationPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +} // namespace xla } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_REWRITERS_H_ From d5435d42c5856477cafc70103c71e4039861d990 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 17 Jun 2020 02:01:56 -0700 Subject: [PATCH 0379/1390] compat: Update forward compatibility horizon to 2020-06-17 PiperOrigin-RevId: 316848134 Change-Id: Ied10fd4074e6cc4e0a737cdf4e6ed6ab48632c94 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 314acfdd38f..8a42b3dfdd3 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 16) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 17) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 25e5d3cb1208ac9368a35ca97756a8a89110e2cf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 02:02:02 -0700 Subject: [PATCH 0380/1390] Update GraphDef version to 435. PiperOrigin-RevId: 316848144 Change-Id: I871d54f7b4f40d8ed9ab1d947302934bb03b463a --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 8e3c66edfc2..3e4e3888d87 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 434 // Updated: 2020/6/16 +#define TF_GRAPH_DEF_VERSION 435 // Updated: 2020/6/17 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From eaf10e9a279e270d8133a41cfc9690de10423a1d Mon Sep 17 00:00:00 2001 From: Lutz Roeder Date: Sun, 14 Jun 2020 23:02:59 -0700 Subject: [PATCH 0381/1390] Fix Keras documentation --- tensorflow/python/keras/layers/core.py | 8 ++++++-- tensorflow/python/keras/layers/wrappers.py | 9 +++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index abfb025db30..e64a1c27bcf 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -825,10 +825,14 @@ class Lambda(Layer): returned as output mask regardless of what the input is. arguments: Optional dictionary of keyword arguments to be passed to the function. - Input shape: Arbitrary. Use the keyword argument input_shape (tuple of + + Input shape: + Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - Output shape: Specified by `output_shape` argument + + Output shape: + Specified by `output_shape` argument """ @trackable.no_automatic_dependency_tracking diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py index 8fe3b3b20bb..23fef467cfe 100644 --- a/tensorflow/python/keras/layers/wrappers.py +++ b/tensorflow/python/keras/layers/wrappers.py @@ -341,10 +341,11 @@ class Bidirectional(Wrapper): combined. One of {'sum', 'mul', 'concat', 'ave', None}. If None, the outputs will not be combined, they will be returned as a list. 
Default value is 'concat'. - backward_layer: Optional `keras.layers.RNN`, or keras.layers.Layer` instance - to be used to handle backwards input processing. If `backward_layer` is - not provided, the layer instance passed as the `layer` argument will be - used to generate the backward layer automatically. + backward_layer: Optional `keras.layers.RNN`, or `keras.layers.Layer` + instance to be used to handle backwards input processing. + If `backward_layer` is not provided, the layer instance passed as the + `layer` argument will be used to generate the backward layer + automatically. Note that the provided `backward_layer` layer should have properties matching those of the `layer` argument, in particular it should have the same values for `stateful`, `return_states`, `return_sequence`, etc. From 2aa2332364e4b27bb4fa433f0c6bd1709fab7f34 Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Wed, 17 Jun 2020 09:13:57 +0000 Subject: [PATCH 0382/1390] Fixed build error --- tensorflow/compiler/xla/service/cpu/BUILD | 2 +- tensorflow/compiler/xla/service/cpu/tests/BUILD | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7be4d3e724a..8ea3672d6a8 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -1080,7 +1080,7 @@ tf_cc_test( deps = [ ":cpu_compiler", ":cpu_transfer_manager", - ":test_header_helper", + ":test_header_helper", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 9036c5c9024..510e7b18fba 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -42,7 +42,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -138,7 +138,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -218,7 +218,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -232,7 +232,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -246,7 +246,7 @@ tf_cc_test( 
"//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/service/cpu:test_header_helper", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", From 41970246764299cd059fee0cbb08b5ba904b1520 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Wed, 17 Jun 2020 02:06:23 -0700 Subject: [PATCH 0383/1390] Publish TpuCompileOp.h header. PiperOrigin-RevId: 316848726 Change-Id: I36ef7ae0f8357cf144d057851cd3267309632fe5 --- tensorflow/core/tpu/kernels/BUILD | 6 ++ tensorflow/core/tpu/kernels/tpu_compile_op.h | 75 ++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 tensorflow/core/tpu/kernels/tpu_compile_op.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index e7be7d2b062..318d60b22df 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -365,3 +365,9 @@ tf_proto_library_cc( srcs = ["tpu_compilation_cache.proto"], cc_api_version = 2, ) + +cc_library( + name = "tpu_compile_op_hdrs", + hdrs = ["tpu_compile_op.h"], + deps = ["//tensorflow/core:framework"], +) diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op.h b/tensorflow/core/tpu/kernels/tpu_compile_op.h new file mode 100644 index 00000000000..8a1963dde5c --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_compile_op.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ + +#include + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace tpu { +// Forward declaration. +class TpuCompileOpKernelImpl; +} // namespace tpu + +// The TPUCompile operator compiles a Tensorflow function into a +// TPU executable to be run by TPUExecute. +// +class TpuCompileOp : public OpKernel { + public: + explicit TpuCompileOp(OpKernelConstruction* ctx); + ~TpuCompileOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + private: + std::unique_ptr impl_; + + DISALLOW_COPY_AND_ASSIGN(TpuCompileOp); +}; + +// The TPUCompile operator compiles a MLIR module into a +// TPU executable to be run by TPUExecute. 
+// +class TpuCompileMlirOp : public OpKernel { + public: + explicit TpuCompileMlirOp(OpKernelConstruction* ctx); + ~TpuCompileMlirOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + private: + std::unique_ptr impl_; + + DISALLOW_COPY_AND_ASSIGN(TpuCompileMlirOp); +}; + +class TpuCompileSucceededAssertOp : public OpKernel { + public: + explicit TpuCompileSucceededAssertOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + ~TpuCompileSucceededAssertOp() override = default; + + void Compute(OpKernelContext* ctx) override; + + private: + DISALLOW_COPY_AND_ASSIGN(TpuCompileSucceededAssertOp); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_OP_H_ From 70a617943c2f54bae01ffe695a56ba8e14ab4b65 Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Wed, 17 Jun 2020 02:14:20 -0700 Subject: [PATCH 0384/1390] TFL: Selective registration for C++ target. Usage: - Create a tflite_custom_cc_library rule in the BUILD file with the targeted model. - Call tflite::CreateOpResolver to get the slimmed op resolver. PiperOrigin-RevId: 316849510 Change-Id: I3e7d75da6a9f2876b3fbefe1962e5ae09ebadb33 --- tensorflow/lite/build_def.bzl | 46 +++++++++++++++++++ tensorflow/lite/java/src/main/native/BUILD | 13 ++++++ .../lite/java/src/main/native/op_resolver.h | 26 +++++++++++ .../java/src/main/native/selected_ops_jni.cc | 36 +++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 tensorflow/lite/java/src/main/native/op_resolver.h create mode 100644 tensorflow/lite/java/src/main/native/selected_ops_jni.cc diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 285824a613f..5e487395355 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -732,3 +732,49 @@ def tflite_experimental_runtime_linkopts(if_eager = [], if_non_eager = [], if_no ] + if_non_eager, if_none = [] + if_none, ) + +def tflite_custom_cc_library(name, models = [], srcs = [], deps = [], visibility = ["//visibility:private"]): + """Generates a tflite cc library, stripping off unused operators. + + This library includes the TfLite runtime as well as all operators needed for the given models. + Op resolver can be retrieved using tflite::CreateOpResolver method. + + Args: + name: Str, name of the target. + models: List of models. This TFLite build will only include + operators used in these models. If the list is empty, all builtin + operators are included. + srcs: List of files implementing custom operators if any. + deps: Additional dependencies to build all the custom operators. + visibility: Visibility setting for the generated target. Default to private. + """ + real_srcs = [] + real_srcs.extend(srcs) + real_deps = [] + real_deps.extend(deps) + + if models: + gen_selected_ops( + name = "%s_registration" % name, + model = models[0], + ) + real_srcs.append(":%s_registration" % name) + real_deps.append("//tensorflow/lite/java/src/main/native:selected_ops_jni") + else: + # Support all operators if `models` not specified. 
+ real_deps.append("//tensorflow/lite/java/src/main/native") + + native.cc_library( + name = name, + srcs = real_srcs, + copts = tflite_copts(), + linkopts = [ + "-lm", + "-ldl", + ], + deps = depset([ + "//tensorflow/lite:framework", + "//tensorflow/lite/kernels:builtin_ops", + ] + real_deps), + visibility = visibility, + ) diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD index 0d3535b29af..fdbbc9dc72c 100644 --- a/tensorflow/lite/java/src/main/native/BUILD +++ b/tensorflow/lite/java/src/main/native/BUILD @@ -45,14 +45,27 @@ cc_library( srcs = [ "builtin_ops_jni.cc", ], + hdrs = ["op_resolver.h"], copts = tflite_copts(), deps = [ ":native_framework_only", + "//tensorflow/lite:framework", "//tensorflow/lite/kernels:builtin_ops", ], alwayslink = 1, ) +# TODO(b/153652701): Generate this target to give CreateOpResolver a custom namespace. +cc_library( + name = "selected_ops_jni", + srcs = ["selected_ops_jni.cc"], + hdrs = ["op_resolver.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite:framework", + ], +) + exports_files( [ "exported_symbols.lds", diff --git a/tensorflow/lite/java/src/main/native/op_resolver.h b/tensorflow/lite/java/src/main/native/op_resolver.h new file mode 100644 index 00000000000..ba9c1bfb487 --- /dev/null +++ b/tensorflow/lite/java/src/main/native/op_resolver.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_OP_RESOLVER_H_ +#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_OP_RESOLVER_H_ + +#include "tensorflow/lite/op_resolver.h" + +namespace tflite { + +std::unique_ptr CreateOpResolver(); + +} + +#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_OP_RESOLVER_H_ diff --git a/tensorflow/lite/java/src/main/native/selected_ops_jni.cc b/tensorflow/lite/java/src/main/native/selected_ops_jni.cc new file mode 100644 index 00000000000..d8eb233f90a --- /dev/null +++ b/tensorflow/lite/java/src/main/native/selected_ops_jni.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/java/src/main/native/op_resolver.h" +#include "tensorflow/lite/mutable_op_resolver.h" + +// This method is generated by `gen_selected_ops`. 
+// TODO(b/153652701): Instead of relying on a global method, make +// `gen_selected_ops` generating a header file with custom namespace. +void RegisterSelectedOps(::tflite::MutableOpResolver* resolver); + +namespace tflite { +// This interface is the unified entry point for creating op resolver +// regardless if selective registration is being used. C++ client will call +// this method directly and Java client will call this method indirectly via +// JNI code in interpreter_jni.cc. +std::unique_ptr CreateOpResolver() { + std::unique_ptr resolver = + std::unique_ptr(new MutableOpResolver()); + RegisterSelectedOps(resolver.get()); + return std::move(resolver); +} + +} // namespace tflite From 17cacd3a2124616876096c94699931f6ffb64e2c Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Wed, 17 Jun 2020 02:32:08 -0700 Subject: [PATCH 0385/1390] Add support for unrolling to the build rules. PiperOrigin-RevId: 316851326 Change-Id: Icef09025b154f6d88c94884a7970f93022bcd160 --- .../core/kernels/cubin_headers/build_defs.bzl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl index f0f4a944e74..5880cbe8add 100644 --- a/tensorflow/core/kernels/cubin_headers/build_defs.bzl +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -15,9 +15,11 @@ def _gen_kernel_image_hdr_impl(ctx): name = ctx.attr.name tile_sizes = ctx.attr.tile_size.replace("x", ",") - same_shape = [] + cmd_args = [] if ctx.attr.same_shape: - same_shape.append("--same_shape=%s" % ctx.attr.same_shape) + cmd_args.append("--same_shape=%s" % ctx.attr.same_shape) + if ctx.attr.unroll_factors: + cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors) cubins = [] images = [] @@ -30,7 +32,7 @@ def _gen_kernel_image_hdr_impl(ctx): inputs = [ctx.file.mlir_op], outputs = [cubin], executable = ctx.executable._tool, - arguments = same_shape + [ + arguments = cmd_args + [ "--tile_sizes=%s" % tile_sizes, "--arch=%s" % arch.split("_")[1], "--input=%s" % ctx.file.mlir_op.path, @@ -74,6 +76,7 @@ _gen_kernel_image_hdr_rule = rule( "mlir_op": attr.label(mandatory = True, allow_single_file = True), "tile_size": attr.string(mandatory = True), "same_shape": attr.string(), + "unroll_factors": attr.string(), "out": attr.output(mandatory = True), "symbol": attr.string(mandatory = True), "gpu_archs": attr.string_list(mandatory = True), @@ -88,7 +91,7 @@ _gen_kernel_image_hdr_rule = rule( }, ) -def _gen_kernel_image_hdr(name, mlir_op, tile_size, tags = [], same_shape = None): +def _gen_kernel_image_hdr(name, mlir_op, tile_size, tags = [], same_shape = None, unroll_factors = None): """Generates a C header with fatbin data from a Tensorflow op.""" if cuda_gpu_architectures(): _gen_kernel_image_hdr_rule( @@ -96,6 +99,7 @@ def _gen_kernel_image_hdr(name, mlir_op, tile_size, tags = [], same_shape = None mlir_op = mlir_op, tile_size = tile_size, same_shape = same_shape, + unroll_factors = unroll_factors, out = "%s.h" % name, symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), gpu_archs = cuda_gpu_architectures(), @@ -131,13 +135,14 @@ def _gen_mlir_op(name, type): out = "{name}_{type}.mlir".format(name = name, type = type), ) -def gen_kernel_library(name, types, tile_size, tags = [], same_shape = None): +def gen_kernel_library(name, types, tile_size, tags = [], same_shape = None, unroll_factors = None): """ Generate a library with kernels for a specific tensorflow op. 
Args: name: The name of the tensorflow op. types: The types ("f16", "f32", "f64") for which a kernel should be generated. tile_size: The tiling specification, e.g. "16x16". + unroll_factors: The unrolling specification, e.g. "4,4" tags: The tags which should be added to the library. same_shape: The information about which shapes are the same, e.g. "0,1". """ @@ -154,6 +159,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], same_shape = None): tile_size = tile_size, tags = tags, same_shape = same_shape, + unroll_factors = unroll_factors, ) native.cc_library( From 80e9a8d07680ffd1b138ab95306c1827c4e39865 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Wed, 17 Jun 2020 02:54:52 -0700 Subject: [PATCH 0386/1390] Add tflite_ios_static_framework rule for hiding symbols Exposed C++ symbols might cause collisions with other libraries. PiperOrigin-RevId: 316853905 Change-Id: Ib7fd3f14fa40bc5c3e2ece0bf244cfb6e0623770 --- tensorflow/lite/experimental/ios/BUILD.apple | 24 ++-- .../ios/hide_symbols_with_whitelist.sh | 135 ++++++++++++++++++ tensorflow/lite/experimental/ios/ios.bzl | 58 ++++++++ .../ios/whitelist_TensorFlowLiteC.txt | 1 + .../ios/whitelist_TensorFlowLiteCCoreML.txt | 2 + .../ios/whitelist_TensorFlowLiteCMetal.txt | 2 + 6 files changed, 212 insertions(+), 10 deletions(-) create mode 100755 tensorflow/lite/experimental/ios/hide_symbols_with_whitelist.sh create mode 100644 tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteC.txt create mode 100644 tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCCoreML.txt create mode 100644 tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCMetal.txt diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index aa41b9e2d62..1a85b604f9b 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -1,7 +1,7 @@ # TensorFlow Lite for iOS load("@bazel_skylib//rules:build_test.bzl", "build_test") -load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION") +load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION", "tflite_ios_static_framework") load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework") package( @@ -11,8 +11,15 @@ package( licenses = ["notice"], # Apache 2.0 ) +sh_binary( + name = "hide_symbols_with_whitelist", + srcs = [ + "hide_symbols_with_whitelist.sh", + ], +) + # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework -ios_static_framework( +tflite_ios_static_framework( name = "TensorFlowLiteC_framework", hdrs = [ "//tensorflow/lite/c:c_api.h", @@ -20,6 +27,7 @@ ios_static_framework( ], bundle_name = "TensorFlowLiteC", minimum_os_version = TFL_MINIMUM_OS_VERSION, + whitelist_symbols_file = ":whitelist_TensorFlowLiteC.txt", deps = [ ":tensorflow_lite_c", ], @@ -60,16 +68,14 @@ genrule( # TensorFlowLiteC framework above in a composable way. 
# # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteCCoreMl_framework -ios_static_framework( +tflite_ios_static_framework( name = "TensorFlowLiteCCoreML_framework", hdrs = [ ":coreml_delegate.h", ], - avoid_deps = [ - ":tensorflow_lite_c", - ], bundle_name = "TensorFlowLiteCCoreML", minimum_os_version = TFL_MINIMUM_OS_VERSION, + whitelist_symbols_file = ":whitelist_TensorFlowLiteCCoreML.txt", deps = [ "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", ], @@ -81,16 +87,14 @@ ios_static_framework( # TensorFlowLiteC framework above in a composable way. # # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteCMetal_framework -ios_static_framework( +tflite_ios_static_framework( name = "TensorFlowLiteCMetal_framework", hdrs = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", ], - avoid_deps = [ - ":tensorflow_lite_c", - ], bundle_name = "TensorFlowLiteCMetal", minimum_os_version = TFL_MINIMUM_OS_VERSION, + whitelist_symbols_file = ":whitelist_TensorFlowLiteCMetal.txt", deps = [ "//tensorflow/lite/delegates/gpu:metal_delegate", ], diff --git a/tensorflow/lite/experimental/ios/hide_symbols_with_whitelist.sh b/tensorflow/lite/experimental/ios/hide_symbols_with_whitelist.sh new file mode 100755 index 00000000000..2fa0fc53c33 --- /dev/null +++ b/tensorflow/lite/experimental/ios/hide_symbols_with_whitelist.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# A script to merge Mach-O object files into a single object file and hide +# their internal symbols. Only whitelisted symbols will be visible in the +# symbol table after this script. + +# To run this script, you must set several variables: +# INPUT_FRAMEWORK: a zip file containing the iOS static framework. +# BUNDLE_NAME: the pod/bundle name of the iOS static framework. +# WHITELIST_FILE_PATH: contains the whitelisted symbols. +# OUTPUT: the output zip file. + +# Halt on any error or any unknown variable. +set -ue + +LD_DEBUGGABLE_FLAGS="-x" +# Uncomment the below to get debuggable output. This can only be done for one +# library at a time. +# LD_DEBUGGABLE_FLAGS="-d" + +# Exits if C++ symbols are found in the whitelist list. +if grep -q "^__Z" "${WHITELIST_FILE_PATH}" +then + echo "ERROR: Failed in symbol hiding. This rule does not permit hiding of" \ + "C++ symbols due to possible serious problems mixing symbol hiding," \ + "shared libraries and the C++ runtime." \ + "More info can be found in go/ios-symbols-hiding." \ + "Please recheck the whitelist list and remove C++ symbols:" + echo "$(grep "^__Z" "${WHITELIST_FILE_PATH}")" + exit 1 # terminate and indicate error +fi +# Unzips the framework zip file into a temp workspace. +framework=$(mktemp -t framework -d) +unzip "${INPUT_FRAMEWORK}" -d "${framework}"/ + +# Executable file in the framework. 
+executable_file="${BUNDLE_NAME}.framework/${BUNDLE_NAME}" + +# Extracts architectures from the framework binary. +archs_str=$(xcrun lipo -info "${framework}/${executable_file}" | +sed -En -e 's/^(Non-|Architectures in the )fat file: .+( is architecture| are): (.*)$/\3/p') + +IFS=' ' read -r -a archs <<< "${archs_str}" + +merge_cmd=(xcrun lipo) + +# Merges object files and hide symbols for each architecture. +for arch in "${archs[@]}" +do + archdir=$(mktemp -t "${arch}" -d) + arch_file="${archdir}/${arch}" + + # Handles the binary differently if they are fat or thin. + if [[ "${#archs[@]}" -gt 1 ]]; then + xcrun lipo "${framework}/${executable_file}" -thin "${arch}" -output "${arch_file}" + else + mv "${framework}/${executable_file}" "${arch_file}" + fi + if [[ "$arch" == "armv7" ]]; then + # Check that there are no thread local variables in the input, as they get broken. + # See b/124533863. + thread_locals=$(xcrun nm -m -g "${arch_file}" | awk '/__DATA,__thread_vars/ { print $5 }' | c++filt) + if [[ -n "${thread_locals}" ]]; then + echo + echo "WARNING: This symbol hiding script breaks thread local variables on 32-bit arm, you had:" + echo "${thread_locals}" + echo + echo "Your build will crash if these variables are actually used at runtime." + echo + fi + fi + xcrun ar -x "${arch_file}" + mv *.o "${archdir}"/ + + objects_file_list=$(mktemp) + # Hides the symbols except the whitelisted ones. + find "${archdir}" -name "*.o" >> "${objects_file_list}" + + # Checks whether bitcode is enabled in the framework. + all_objects_have_bitcode=true + for object_file in $(cat "$objects_file_list"); do + if otool -arch "${arch}" -l "${object_file}" | grep -q __LLVM; then + : # Do nothing + else + echo "The ${arch} in ${object_file} is NOT bitcode-enabled." + all_objects_have_bitcode=false + break + fi + done + if [[ "$all_objects_have_bitcode" = "true" ]]; then + echo "The ${arch} in ${executable_file} is fully bitcode-enabled." + xcrun ld -r -bitcode_bundle -exported_symbols_list \ + "${WHITELIST_FILE_PATH}" \ + $LD_DEBUGGABLE_FLAGS \ + -filelist "${objects_file_list}" -o "${arch_file}_processed.o" + else + echo "The ${arch} in ${executable_file} is NOT fully bitcode-enabled." + xcrun ld -r -exported_symbols_list \ + "${WHITELIST_FILE_PATH}" \ + $LD_DEBUGGABLE_FLAGS \ + -filelist "${objects_file_list}" -o "${arch_file}_processed.o" + fi + + output_object="${framework}/${arch}" + + mv "${arch_file}_processed.o" "${output_object}" + rm -rf "${archdir}" + rm "${objects_file_list}" + merge_cmd+=(-arch "${arch}" "${output_object}") +done + +# Repackages the processed object files. +unzip "${INPUT_FRAMEWORK}" +merge_cmd+=(-create -output "${BUNDLE_NAME}") +"${merge_cmd[@]}" + +chmod +x "${BUNDLE_NAME}" +rm "${executable_file}" +mv "${BUNDLE_NAME}" "${executable_file}" +( TZ=UTC find "${BUNDLE_NAME}.framework/" -exec touch -h -t 198001010000 {} \+ ) +zip --compression-method store --symlinks --recurse-paths --quiet "${OUTPUT}" "${BUNDLE_NAME}.framework/" diff --git a/tensorflow/lite/experimental/ios/ios.bzl b/tensorflow/lite/experimental/ios/ios.bzl index 976c6b09a97..3181b587e72 100644 --- a/tensorflow/lite/experimental/ios/ios.bzl +++ b/tensorflow/lite/experimental/ios/ios.bzl @@ -1,5 +1,8 @@ """TensorFlow Lite Build Configurations for iOS""" +# Placeholder for Google-internal load statements. +load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework") + TFL_MINIMUM_OS_VERSION = "9.0" # Default tags for filtering iOS targets. Targets are restricted to Apple platforms. 
@@ -13,3 +16,58 @@ TFL_DISABLED_SANITIZER_TAGS = [ "nomsan", "notsan", ] + +# iOS static framework with symbol whitelist. Exported C++ symbbols might cause +# symbol collision with other libraries. List of symbols to whitelist can be +# generated by running `nm -m -g FRAMEWORK_LIBRARY | grep _TfLite` for framework +# built with `ios_static_framework` rule. +def tflite_ios_static_framework( + name, + bundle_name, + whitelist_symbols_file, + exclude_resources = True, + **kwargs): + """TFLite variant of ios_static_framework with symbol hiding. + + Args: + name: The name of the target. + bundle_name: The name to give to the framework bundle, without the + ".framework" extension. If omitted, the target's name will be used. + whitelist_symbols_file: a file including a list of whitelisted symbols, + one symbol per line. + exclude_resources: Indicates whether resources should be excluded from the + bundle. This can be used to avoid unnecessarily bundling resources if + the static framework is being distributed in a different fashion, such + as a Cocoapod. + **kwargs: Pass-through arguments. + """ + + preprocessed_name = "Preprocessed_" + name + ios_static_framework( + name = preprocessed_name, + bundle_name = bundle_name, + exclude_resources = exclude_resources, + **kwargs + ) + + framework_target = ":{}.zip".format(preprocessed_name) + + srcs = [ + framework_target, + whitelist_symbols_file, + ] + cmd = ("INPUT_FRAMEWORK=\"$(location " + framework_target + ")\" " + + "BUNDLE_NAME=\"" + bundle_name + "\" " + + "WHITELIST_FILE_PATH=\"$(location " + whitelist_symbols_file + ")\" " + + "OUTPUT=\"$(OUTS)\" " + + "\"$(location //tensorflow/lite/experimental/ios:hide_symbols_with_whitelist)\"") + + native.genrule( + name = name, + srcs = srcs, + outs = [name + ".zip"], + cmd = cmd, + tools = [ + "//tensorflow/lite/experimental/ios:hide_symbols_with_whitelist", + ], + ) diff --git a/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteC.txt b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteC.txt new file mode 100644 index 00000000000..e8ae288ea8f --- /dev/null +++ b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteC.txt @@ -0,0 +1 @@ +_TfLite* diff --git a/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCCoreML.txt b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCCoreML.txt new file mode 100644 index 00000000000..817b4a7f2ec --- /dev/null +++ b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCCoreML.txt @@ -0,0 +1,2 @@ +_TfLiteCoreMlDelegateCreate +_TfLiteCoreMlDelegateDelete diff --git a/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCMetal.txt b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCMetal.txt new file mode 100644 index 00000000000..b66b059eef0 --- /dev/null +++ b/tensorflow/lite/experimental/ios/whitelist_TensorFlowLiteCMetal.txt @@ -0,0 +1,2 @@ +_TFLGpuDelegateCreate +_TFLGpuDelegateDelete From 6128ffea4615c5de807beb515bb528422ba88782 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Wed, 17 Jun 2020 02:55:28 -0700 Subject: [PATCH 0387/1390] Properly configure the block and grid dimensions when launching generated kernels. 
PiperOrigin-RevId: 316853965 Change-Id: I1fb68ef970f9d4b9bf1ae66fa84a2183aa3e8186 --- tensorflow/core/kernels/cubin_headers/BUILD | 1 + .../mlir_generated_cwise_op_gpu_tanh.cu.cc | 69 ++++++++++++++----- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD index b8ba164fbc3..ec8b44050db 100644 --- a/tensorflow/core/kernels/cubin_headers/BUILD +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -37,4 +37,5 @@ gen_kernel_library( "f32", "f64", ], + unroll_factors = "4", ) diff --git a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc index 40dd7c7e49e..76d1a46aedd 100644 --- a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc @@ -45,9 +45,40 @@ Status CreateKernel(absl::string_view kernel_name, uint64_t num_args, return stream_exec->GetKernel(loader_spec, kernel_base.get()); } -class MlirGenerateTanhOp : public OpKernel { +struct LaunchConfig { + se::BlockDim blockDim; + se::ThreadDim threadDim; +} + +LaunchConfig +GetLaunchConfiguration(std::vector tile_sizes, + std::vector unrolling_factors, + std::vector shape) { + LaunchConfig result; + // Ensure the vectors are length 3 and pad with ones. + tile_sizes.resize(3, 1); + unrolling_factors.resize(3, 1); + shape.resize(3, 1); + // We know that the kernel was generated by mapping the three outer-most + // dimensions to x,y,z dimensions. So we only need to compute those. + for (int i = 0; i < 3; ++i) { + // The number of threads is given by the tiling size. + result.threadDim[i] = tile_sizes[i]; + // Compute the number of grids. We use ceildiv here as we have to allocate + // an extra thread/block if the division is not even. The kernel contains + // code to handle the boundaries. + int number_of_threads = + (shape[i] + unrolling_factors[i] - 1) / unrolling_factors[i]; + int number_of_grids = + (number_of_threads + tile_sizes[i] - 1) / tile_sizes[i]; + result.blockDim[i] = number_of_grids; + } + return result; +} + +class MlirGeneratedTanhOp : public OpKernel { public: - explicit MlirGenerateTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + explicit MlirGeneratedTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} void Compute(OpKernelContext* ctx) override { auto* stream = ctx->op_device_context()->stream(); @@ -88,11 +119,13 @@ class MlirGenerateTanhOp : public OpKernel { args.add_argument(inp.NumElements()); args.add_argument(1); - // TODO(b/158649746): Choose block size and thread dim according to the - // number of input elements. For now, this supports at most 1024 elements. + // This has to be aligned with the configuration that was used when building + // the kernels. See the corresponding build rules in `cubin_headers/BUILD`. 
+ LaunchCondig config = + GetLaunchConfiguration({256}, {4}, {inp.getNumElements()}); OP_REQUIRES_OK( - ctx, stream->parent()->Launch(stream, se::ThreadDim(inp.NumElements()), - se::BlockDim(1), *kernel, args)); + ctx, stream->parent()->Launch(stream, config.threadDim, config.blockDim, + *kernel, args)); } protected: @@ -103,26 +136,26 @@ class MlirGenerateTanhOp : public OpKernel { std::mutex mu_; }; -class MlirGenerateTanhF16Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF16Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF16Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF16Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF16Kernel; } }; -class MlirGenerateTanhF32Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF32Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF32Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF32Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF32Kernel; } }; -class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF64Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF64Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF64Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF64Kernel; } }; @@ -130,11 +163,11 @@ class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF16Op); + MlirGeneratedTanhF16Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF32Op); + MlirGeneratedTanhF32Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF64Op); + MlirGeneratedTanhF64Op); } // namespace tensorflow From 13b840aa7299c77e744a2eed5a366cbf98988fb0 Mon Sep 17 00:00:00 2001 From: Agoniii <815244047@qq.com> Date: Wed, 17 Jun 2020 19:10:45 +0800 Subject: [PATCH 0388/1390] modify Log to gry --- .../optimizers/auto_mixed_precision_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 951279d37cd..3fa8260739c 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -287,10 +287,10 @@ TEST_F(AutoMixedPrecisionTest, Simple) { Output clr2 = ops::Relu(s.WithOpName("clr2"), gry1); Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr2, clr2); Output clr3 = ops::Relu(s.WithOpName("clr3"), wht1); - Output blk2 = ops::Log(s.WithOpName("blk2"), clr3); - Output clr4 = ops::Relu(s.WithOpName("clr4"), blk2); - Output blk3 = ops::SparseMatMul(s.WithOpName("blk3"), clr4, clr4); - Output clr5 = ops::Relu(s.WithOpName("clr5"), blk3); + Output gry2 = ops::Log(s.WithOpName("gry2"), clr3); + Output clr4 = ops::Relu(s.WithOpName("clr4"), gry2); + Output blk2 = ops::SparseMatMul(s.WithOpName("blk2"), clr4, clr4); + Output clr5 = ops::Relu(s.WithOpName("clr5"), blk2); Output fetch = ops::Identity(s.WithOpName("fetch"), clr5); GrapplerItem item; @@ -313,10 +313,10 @@ TEST_F(AutoMixedPrecisionTest, Simple) { EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_HALF); EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF); 
EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_HALF); - EXPECT_EQ(output_view.GetNode("blk2")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("gry2")->attr().at("T").type(), DT_FLOAT); EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_FLOAT); - EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Ta").type(), DT_FLOAT); - EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Tb").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("blk2")->attr().at("Ta").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("blk2")->attr().at("Tb").type(), DT_FLOAT); EXPECT_EQ(output_view.GetNode("clr5")->attr().at("T").type(), DT_FLOAT); auto tensors = EvaluateNodes(output, item.fetch); From 286cd7fc6839bb2fc999fd16fb1801f6b30656b8 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 17 Jun 2020 12:22:53 +0100 Subject: [PATCH 0389/1390] Addressed reviewer's comments. Change-Id: I4b849e60540879ca89483ede675c63631bc9417b --- tensorflow/lite/python/lite.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 06796ba820b..cb2f1853619 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -299,32 +299,21 @@ class QuantizationMode(object): inference_input_type = input_ty if input_ty else constants.FLOAT inference_output_type = output_ty if output_ty else constants.FLOAT - if self.post_training_int8_no_float(): + + if self.post_training_int8_no_float() \ + or self.post_training_int16x8_no_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, - "activations_type": constants.INT8, + "activations_type": self.activations_type(), "allow_float": False } - elif self.post_training_int8_allow_float(): + elif self.post_training_int8_allow_float() \ + or self.post_training_int16x8_allow_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, - "activations_type": constants.INT8, - "allow_float": True - } - elif self.post_training_int16x8_no_float(): - return True, { - "inference_input_type": inference_input_type, - "inference_output_type": inference_output_type, - "activations_type": constants.INT16, - "allow_float": False - } - elif self.post_training_int16x8_allow_float(): - return True, { - "inference_input_type": inference_input_type, - "inference_output_type": inference_output_type, - "activations_type": constants.INT16, + "activations_type": self.activations_type(), "allow_float": True } else: From 9eafb72689fb2e99cd06cb09ec84de96e2f2a509 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 17 Jun 2020 05:06:09 -0700 Subject: [PATCH 0390/1390] Properly configure the block and grid dimensions when launching generated kernels. 
PiperOrigin-RevId: 316867392 Change-Id: I975a9d1e29e954760532a82985dd016707aa6d02 --- tensorflow/core/kernels/cubin_headers/BUILD | 1 - .../mlir_generated_cwise_op_gpu_tanh.cu.cc | 69 +++++-------------- 2 files changed, 18 insertions(+), 52 deletions(-) diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD index ec8b44050db..b8ba164fbc3 100644 --- a/tensorflow/core/kernels/cubin_headers/BUILD +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -37,5 +37,4 @@ gen_kernel_library( "f32", "f64", ], - unroll_factors = "4", ) diff --git a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc index 76d1a46aedd..40dd7c7e49e 100644 --- a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc @@ -45,40 +45,9 @@ Status CreateKernel(absl::string_view kernel_name, uint64_t num_args, return stream_exec->GetKernel(loader_spec, kernel_base.get()); } -struct LaunchConfig { - se::BlockDim blockDim; - se::ThreadDim threadDim; -} - -LaunchConfig -GetLaunchConfiguration(std::vector tile_sizes, - std::vector unrolling_factors, - std::vector shape) { - LaunchConfig result; - // Ensure the vectors are length 3 and pad with ones. - tile_sizes.resize(3, 1); - unrolling_factors.resize(3, 1); - shape.resize(3, 1); - // We know that the kernel was generated by mapping the three outer-most - // dimensions to x,y,z dimensions. So we only need to compute those. - for (int i = 0; i < 3; ++i) { - // The number of threads is given by the tiling size. - result.threadDim[i] = tile_sizes[i]; - // Compute the number of grids. We use ceildiv here as we have to allocate - // an extra thread/block if the division is not even. The kernel contains - // code to handle the boundaries. - int number_of_threads = - (shape[i] + unrolling_factors[i] - 1) / unrolling_factors[i]; - int number_of_grids = - (number_of_threads + tile_sizes[i] - 1) / tile_sizes[i]; - result.blockDim[i] = number_of_grids; - } - return result; -} - -class MlirGeneratedTanhOp : public OpKernel { +class MlirGenerateTanhOp : public OpKernel { public: - explicit MlirGeneratedTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + explicit MlirGenerateTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} void Compute(OpKernelContext* ctx) override { auto* stream = ctx->op_device_context()->stream(); @@ -119,13 +88,11 @@ class MlirGeneratedTanhOp : public OpKernel { args.add_argument(inp.NumElements()); args.add_argument(1); - // This has to be aligned with the configuration that was used when building - // the kernels. See the corresponding build rules in `cubin_headers/BUILD`. - LaunchCondig config = - GetLaunchConfiguration({256}, {4}, {inp.getNumElements()}); + // TODO(b/158649746): Choose block size and thread dim according to the + // number of input elements. For now, this supports at most 1024 elements. 
OP_REQUIRES_OK( - ctx, stream->parent()->Launch(stream, config.threadDim, config.blockDim, - *kernel, args)); + ctx, stream->parent()->Launch(stream, se::ThreadDim(inp.NumElements()), + se::BlockDim(1), *kernel, args)); } protected: @@ -136,26 +103,26 @@ class MlirGeneratedTanhOp : public OpKernel { std::mutex mu_; }; -class MlirGeneratedTanhF16Op : public MlirGeneratedTanhOp { +class MlirGenerateTanhF16Op : public MlirGenerateTanhOp { public: - explicit MlirGeneratedTanhF16Op(OpKernelConstruction* ctx) - : MlirGeneratedTanhOp(ctx) { + explicit MlirGenerateTanhF16Op(OpKernelConstruction* ctx) + : MlirGenerateTanhOp(ctx) { cubin_data_ = kTanhF16Kernel; } }; -class MlirGeneratedTanhF32Op : public MlirGeneratedTanhOp { +class MlirGenerateTanhF32Op : public MlirGenerateTanhOp { public: - explicit MlirGeneratedTanhF32Op(OpKernelConstruction* ctx) - : MlirGeneratedTanhOp(ctx) { + explicit MlirGenerateTanhF32Op(OpKernelConstruction* ctx) + : MlirGenerateTanhOp(ctx) { cubin_data_ = kTanhF32Kernel; } }; -class MlirGeneratedTanhF64Op : public MlirGeneratedTanhOp { +class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { public: - explicit MlirGeneratedTanhF64Op(OpKernelConstruction* ctx) - : MlirGeneratedTanhOp(ctx) { + explicit MlirGenerateTanhF64Op(OpKernelConstruction* ctx) + : MlirGenerateTanhOp(ctx) { cubin_data_ = kTanhF64Kernel; } }; @@ -163,11 +130,11 @@ class MlirGeneratedTanhF64Op : public MlirGeneratedTanhOp { REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGeneratedTanhF16Op); + MlirGenerateTanhF16Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGeneratedTanhF32Op); + MlirGenerateTanhF32Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGeneratedTanhF64Op); + MlirGenerateTanhF64Op); } // namespace tensorflow From 401dad16ea5a4b0fc13e373c347404b98f2b030a Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Wed, 17 Jun 2020 06:43:11 -0700 Subject: [PATCH 0391/1390] Add default values in MicroInterpreter constructors. The MicroInterpreter uses a few values to check state - there is a scenario where these values are not always defaulted to internal states. This can cause an exception when the interpreter tries to run. To ensure things work properly, default values. I also updated the MicroInterpreter test to use the new RecordingMicroAllocator. Two new tests have been added: 1.) Ensure that the interpreter fails to allocate with too small an arena at Invoke() (insured by recording allocation APIs) 2.) Ensure that the interpreter does not allocate anything at construction time - only at Invoke() (or manually with AllocateTensors()). This will give us better coverage when we add more tenant use cases. 
PiperOrigin-RevId: 316877994 Change-Id: I0582080a1fb649276076371be991a13392324801 --- tensorflow/lite/micro/BUILD | 1 + tensorflow/lite/micro/micro_interpreter.cc | 4 + .../lite/micro/micro_interpreter_test.cc | 111 ++++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 32d7271734e..f63d9778634 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -268,6 +268,7 @@ tflite_micro_cc_test( ":micro_framework", ":micro_utils", ":op_resolvers", + ":recording_allocators", ":test_helpers", "//tensorflow/lite/core/api", "//tensorflow/lite/micro/testing:micro_test", diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index c20eb1f0984..6b17a5ffe84 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -83,6 +83,8 @@ MicroInterpreter::MicroInterpreter(const Model* model, error_reporter_(error_reporter), allocator_(*MicroAllocator::Create(tensor_arena, tensor_arena_size, error_reporter)), + tensors_allocated_(false), + initialization_status_(kTfLiteError), context_helper_(error_reporter_, &allocator_) { Init(profiler); } @@ -96,6 +98,8 @@ MicroInterpreter::MicroInterpreter(const Model* model, op_resolver_(*op_resolver), error_reporter_(error_reporter), allocator_(*allocator), + tensors_allocated_(false), + initialization_status_(kTfLiteError), context_helper_(error_reporter_, &allocator_) { Init(profiler); } diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index 079e23d33eb..93d095d3c68 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_optional_debug_tools.h" #include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/micro/recording_micro_allocator.h" #include "tensorflow/lite/micro/test_helpers.h" #include "tensorflow/lite/micro/testing/micro_test.h" @@ -244,6 +245,7 @@ TF_LITE_MICRO_TEST(TestIncompleteInitialization) { tflite::testing::MockOpResolver mock_resolver; constexpr size_t allocator_buffer_size = 2048; uint8_t allocator_buffer[allocator_buffer_size]; + tflite::MicroInterpreter interpreter(model, mock_resolver, allocator_buffer, allocator_buffer_size, micro_test::reporter); @@ -276,4 +278,113 @@ TF_LITE_MICRO_TEST(InterpreterWithProfilerShouldProfileOps) { #endif } +TF_LITE_MICRO_TEST(TestIncompleteInitializationAllocationsWithSmallArena) { + const tflite::Model* model = tflite::testing::GetComplexMockModel(); + TF_LITE_MICRO_EXPECT_NE(nullptr, model); + + tflite::testing::MockOpResolver mock_resolver; + // 1kb is too small for the ComplexMockModel: + constexpr size_t allocator_buffer_size = 1048; + uint8_t allocator_buffer[allocator_buffer_size]; + + tflite::RecordingMicroAllocator* allocator = + tflite::RecordingMicroAllocator::Create( + allocator_buffer, allocator_buffer_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_NE(nullptr, allocator); + + tflite::MicroInterpreter interpreter(model, &mock_resolver, allocator, + micro_test::reporter); + + // Interpreter fails because arena is too small: + TF_LITE_MICRO_EXPECT_EQ(interpreter.Invoke(), kTfLiteError); + + // Ensure allocations are zero (ignore tail since some internal structs are + // initialized with this space): + TF_LITE_MICRO_EXPECT_EQ( + 0, allocator->GetSimpleMemoryAllocator()->GetHeadUsedBytes()); + TF_LITE_MICRO_EXPECT_EQ( + 0, allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorArray) + .used_bytes); + TF_LITE_MICRO_EXPECT_EQ( + 0, allocator + ->GetRecordedAllocation(tflite::RecordedAllocationType:: + kTfLiteTensorArrayQuantizationData) + .used_bytes); + TF_LITE_MICRO_EXPECT_EQ( + 0, + allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData) + .used_bytes); + TF_LITE_MICRO_EXPECT_EQ( + 0, + allocator->GetRecordedAllocation(tflite::RecordedAllocationType::kOpData) + .used_bytes); +} + +TF_LITE_MICRO_TEST(TestInterpreterDoesNotAllocateUntilInvoke) { + const tflite::Model* model = tflite::testing::GetComplexMockModel(); + TF_LITE_MICRO_EXPECT_NE(nullptr, model); + + tflite::testing::MockOpResolver mock_resolver; + constexpr size_t allocator_buffer_size = 1024 * 4; + uint8_t allocator_buffer[allocator_buffer_size]; + + tflite::RecordingMicroAllocator* allocator = + tflite::RecordingMicroAllocator::Create( + allocator_buffer, allocator_buffer_size, micro_test::reporter); + TF_LITE_MICRO_EXPECT_NE(nullptr, allocator); + + tflite::MicroInterpreter interpreter(model, &mock_resolver, allocator, + micro_test::reporter); + + // Ensure allocations are zero (ignore tail since some internal structs are + // initialized with this space): + TF_LITE_MICRO_EXPECT_EQ( + 0, allocator->GetSimpleMemoryAllocator()->GetHeadUsedBytes()); + TF_LITE_MICRO_EXPECT_EQ( + 0, allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorArray) + .used_bytes); + TF_LITE_MICRO_EXPECT_EQ( + 0, + allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData) + .used_bytes); + TF_LITE_MICRO_EXPECT_EQ( + 0, + 
allocator->GetRecordedAllocation(tflite::RecordedAllocationType::kOpData) + .used_bytes); + + TF_LITE_MICRO_EXPECT_EQ(interpreter.Invoke(), kTfLiteOk); + allocator->PrintAllocations(); + + // Allocation sizes vary based on platform - check that allocations are now + // non-zero: + TF_LITE_MICRO_EXPECT_GT( + allocator->GetSimpleMemoryAllocator()->GetHeadUsedBytes(), 0); + TF_LITE_MICRO_EXPECT_GT( + allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorArray) + .used_bytes, + 0); + TF_LITE_MICRO_EXPECT_GT( + + allocator + ->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData) + .used_bytes, + 0); + TF_LITE_MICRO_EXPECT_GT( + + allocator->GetRecordedAllocation(tflite::RecordedAllocationType::kOpData) + .used_bytes, + 0); +} + TF_LITE_MICRO_TESTS_END From f44b07ed4e5f2b338d3b3d9f17875b86b8636a92 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 17 Jun 2020 07:20:46 -0700 Subject: [PATCH 0392/1390] Fix formatting Some of these indents are triggering markdown's (horrible) "4-space indent is a code block" feature PiperOrigin-RevId: 316883013 Change-Id: If2b53a6788d3179b868a62fb6b4caeeb08caa4bf --- tensorflow/python/ops/map_fn.py | 94 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 96810805c18..516f427ad08 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -108,31 +108,29 @@ def map_fn(fn, `fn_output_signature` can be specified using any of the following: - * A `tf.DType` or `tf.TensorSpec` (to describe a `tf.Tensor`) - * A `tf.RaggedTensorSpec` (to describe a `tf.RaggedTensor`) - * A `tf.SparseTensorSpec` (to describe a `tf.sparse.SparseTensor`) - * A (possibly nested) tuple, list, or dict containing the above types. + * A `tf.DType` or `tf.TensorSpec` (to describe a `tf.Tensor`) + * A `tf.RaggedTensorSpec` (to describe a `tf.RaggedTensor`) + * A `tf.SparseTensorSpec` (to describe a `tf.sparse.SparseTensor`) + * A (possibly nested) tuple, list, or dict containing the above types. #### RaggedTensors `map_fn` supports `tf.RaggedTensor` inputs and outputs. In particular: - * If `elems` is a `RaggedTensor`, then `fn` will be called with each - row of that ragged tensor. + * If `elems` is a `RaggedTensor`, then `fn` will be called with each + row of that ragged tensor. + * If `elems` has only one ragged dimension, then the values passed to + `fn` will be `tf.Tensor`s. + * If `elems` has multiple ragged dimensions, then the values passed to + `fn` will be `tf.RaggedTensor`s with one fewer ragged dimension. - * If `elems` has only one ragged dimension, then the values passed to - `fn` will be `tf.Tensor`s. - * If `elems` has multiple ragged dimensions, then the values passed to - `fn` will be `tf.RaggedTensor`s with one fewer ragged dimension. - - * If the result of `map_fn` should be a `RaggedTensor`, then use a - `tf.RaggedTensorSpec` to specify `fn_output_signature`. - - * If `fn` returns `tf.Tensor`s with varying sizes, then use a - `tf.RaggedTensorSpec` with `ragged_rank=0` to combine them into a - single ragged tensor (which will have ragged_rank=1). - * If `fn` returns `tf.RaggedTensor`s, then use a `tf.RaggedTensorSpec` - with the same `ragged_rank`. + * If the result of `map_fn` should be a `RaggedTensor`, then use a + `tf.RaggedTensorSpec` to specify `fn_output_signature`. 
+ * If `fn` returns `tf.Tensor`s with varying sizes, then use a + `tf.RaggedTensorSpec` with `ragged_rank=0` to combine them into a + single ragged tensor (which will have ragged_rank=1). + * If `fn` returns `tf.RaggedTensor`s, then use a `tf.RaggedTensorSpec` + with the same `ragged_rank`. >>> # Example: RaggedTensor input >>> rt = tf.ragged.constant([[1, 2, 3], [], [4, 5], [6]]) @@ -150,10 +148,10 @@ def map_fn(fn, *rows* of a `RaggedTensor`. If you wish to map a function over the individual values, then you should use: - * `tf.ragged.map_flat_values(fn, rt)` - (if fn is expressible as TensorFlow ops) - * `rt.with_flat_values(map_fn(fn, rt.flat_values))` - (otherwise) + * `tf.ragged.map_flat_values(fn, rt)` + (if fn is expressible as TensorFlow ops) + * `rt.with_flat_values(map_fn(fn, rt.flat_values))` + (otherwise) E.g.: @@ -165,14 +163,14 @@ def map_fn(fn, `map_fn` supports `tf.sparse.SparseTensor` inputs and outputs. In particular: - * If `elems` is a `SparseTensor`, then `fn` will be called with each row - of that sparse tensor. In particular, the value passed to `fn` will be a - `tf.sparse.SparseTensor` with one fewer dimension than `elems`. + * If `elems` is a `SparseTensor`, then `fn` will be called with each row + of that sparse tensor. In particular, the value passed to `fn` will be a + `tf.sparse.SparseTensor` with one fewer dimension than `elems`. - * If the result of `map_fn` should be a `SparseTensor`, then use a - `tf.SparseTensorSpec` to specify `fn_output_signature`. The individual - `SparseTensor`s returned by `fn` will be stacked into a single - `SparseTensor` with one more dimension. + * If the result of `map_fn` should be a `SparseTensor`, then use a + `tf.SparseTensorSpec` to specify `fn_output_signature`. The individual + `SparseTensor`s returned by `fn` will be stacked into a single + `SparseTensor` with one more dimension. >>> # Example: SparseTensor input >>> st = tf.sparse.SparseTensor([[0, 0], [2, 0], [2, 1]], [2, 3, 4], [4, 4]) @@ -195,15 +193,15 @@ def map_fn(fn, *rows* of a `SparseTensor`. If you wish to map a function over the nonzero values, then you should use: - * If the function is expressible as TensorFlow ops, use: - ```python - tf.sparse.SparseTensor(st.indices, fn(st.values), st.dense_shape) - ``` - * Otherwise, use: - ```python - tf.sparse.SparseTensor(st.indices, tf.map_fn(fn, st.values), - st.dense_shape) - ``` + * If the function is expressible as TensorFlow ops, use: + ```python + tf.sparse.SparseTensor(st.indices, fn(st.values), st.dense_shape) + ``` + * Otherwise, use: + ```python + tf.sparse.SparseTensor(st.indices, tf.map_fn(fn, st.values), + st.dense_shape) + ``` #### `map_fn` vs. vectorized operations @@ -215,14 +213,14 @@ def map_fn(fn, `map_fn` should typically only be used if one of the following is true: - * It is difficult or expensive to express the desired transform with - vectorized operations. - * `fn` creates large intermediate values, so an equivalent vectorized - transform would take too much memory. - * Processing elements in parallel is more efficient than an equivalent - vectorized transform. - * Efficiency of the transform is not critical, and using `map_fn` is - more readable. + * It is difficult or expensive to express the desired transform with + vectorized operations. + * `fn` creates large intermediate values, so an equivalent vectorized + transform would take too much memory. + * Processing elements in parallel is more efficient than an equivalent + vectorized transform. 
+ * Efficiency of the transform is not critical, and using `map_fn` is + more readable. E.g., the example given above that maps `fn=lambda t: tf.range(t, t + 3)` across `elems` could be rewritten more efficiently using vectorized ops: @@ -255,7 +253,7 @@ def map_fn(fn, [2, 3, 4]], dtype=int32)> - Note that if you use the `tf.function` decorator, any non-TensorFlow Python + Note: if you use the `tf.function` decorator, any non-TensorFlow Python code that you may have written in your function won't get executed. See `tf.function` for more details. The recommendation would be to debug without `tf.function` but switch to it to get performance benefits of running `map_fn` From 3ebc53c394e34dfd780e6bf59ad3c96bd9b3fa79 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Thu, 11 Jun 2020 15:59:21 -0500 Subject: [PATCH 0393/1390] Relax stub include version checking. Remove upper bound on version check for latest inc files. --- tensorflow/stream_executor/cuda/cublas_stub.cc | 10 ++++------ tensorflow/stream_executor/cuda/cuda_stub.cc | 10 ++++------ tensorflow/stream_executor/cuda/cudart_stub.cc | 10 ++++------ tensorflow/stream_executor/cuda/cusolver_stub.cc | 10 ++++------ tensorflow/stream_executor/cuda/cusparse_stub.cc | 10 ++++------ 5 files changed, 20 insertions(+), 30 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cublas_stub.cc b/tensorflow/stream_executor/cuda/cublas_stub.cc index 1cbfd51316c..76f3d9b134e 100644 --- a/tensorflow/stream_executor/cuda/cublas_stub.cc +++ b/tensorflow/stream_executor/cuda/cublas_stub.cc @@ -63,14 +63,12 @@ typedef enum {} cublasMath_t; #if CUDA_VERSION < 10000 #include "tensorflow/stream_executor/cuda/cublas_9_0.inc" -#elif CUDA_VERSION == 10000 +#elif CUDA_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cublas_10_0.inc" -#elif CUDA_VERSION == 10010 +#elif CUDA_VERSION < 10020 #include "tensorflow/stream_executor/cuda/cublas_10_1.inc" -#elif CUDA_VERSION == 10020 +#elif CUDA_VERSION < 11000 #include "tensorflow/stream_executor/cuda/cublas_10_2.inc" -#elif CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 0 -#include "tensorflow/stream_executor/cuda/cublas_11_0.inc" #else -#error "We have no wrapper for this version." +#include "tensorflow/stream_executor/cuda/cublas_11_0.inc" #endif diff --git a/tensorflow/stream_executor/cuda/cuda_stub.cc b/tensorflow/stream_executor/cuda/cuda_stub.cc index ce02be89c22..58c898a54ee 100644 --- a/tensorflow/stream_executor/cuda/cuda_stub.cc +++ b/tensorflow/stream_executor/cuda/cuda_stub.cc @@ -95,14 +95,12 @@ typedef void(CUDA_CB* CUhostFn)(void* userData); #if CUDA_VERSION < 10000 #include "tensorflow/stream_executor/cuda/cuda_9_0.inc" -#elif CUDA_VERSION == 10000 +#elif CUDA_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cuda_10_0.inc" -#elif CUDA_VERSION <= 10010 +#elif CUDA_VERSION < 10020 #include "tensorflow/stream_executor/cuda/cuda_10_1.inc" -#elif CUDA_VERSION <= 10020 +#elif CUDA_VERSION < 11000 #include "tensorflow/stream_executor/cuda/cuda_10_2.inc" -#elif CUDA_VERSION <= 11000 -#include "tensorflow/stream_executor/cuda/cuda_11_0.inc" #else -#error "We have no wrapper for this version." 
+#include "tensorflow/stream_executor/cuda/cuda_11_0.inc" #endif diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index 3b9e0f2937b..2ab9d142e3c 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -53,16 +53,14 @@ cudaError_t GetSymbolNotFoundError() { // A bunch of new symbols were introduced in version 10 #if CUDART_VERSION < 10000 #include "tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc" -#elif CUDART_VERSION == 10000 +#elif CUDART_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc" -#elif CUDART_VERSION == 10010 +#elif CUDART_VERSION < 10020 #include "tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc" -#elif CUDART_VERSION == 10020 +#elif CUDART_VERSION < 11000 #include "tensorflow/stream_executor/cuda/cuda_runtime_10_2.inc" -#elif CUDART_VERSION == 11000 -#include "tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc" #else -#error "We have no wrapper for this version." +#include "tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc" #endif #undef __dv #undef __CUDA_DEPRECATED diff --git a/tensorflow/stream_executor/cuda/cusolver_stub.cc b/tensorflow/stream_executor/cuda/cusolver_stub.cc index a4b9cc37f9b..edf87c3dc0b 100644 --- a/tensorflow/stream_executor/cuda/cusolver_stub.cc +++ b/tensorflow/stream_executor/cuda/cusolver_stub.cc @@ -53,14 +53,12 @@ cusolverStatus_t GetSymbolNotFoundError() { #if CUDA_VERSION < 10000 #include "tensorflow/stream_executor/cuda/cusolver_dense_9_0.inc" -#elif CUDA_VERSION == 10000 +#elif CUDA_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cusolver_dense_10_0.inc" -#elif CUDA_VERSION == 10010 +#elif CUDA_VERSION < 10020 #include "tensorflow/stream_executor/cuda/cusolver_dense_10_1.inc" -#elif CUDA_VERSION == 10020 +#elif CUDA_VERSION < 11000 #include "tensorflow/stream_executor/cuda/cusolver_dense_10_2.inc" -#elif CUDA_VERSION == 11000 -#include "tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc" #else -#error "We don't have a wrapper for this version." +#include "tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc" #endif diff --git a/tensorflow/stream_executor/cuda/cusparse_stub.cc b/tensorflow/stream_executor/cuda/cusparse_stub.cc index ae56402fbc3..caed4d1008e 100644 --- a/tensorflow/stream_executor/cuda/cusparse_stub.cc +++ b/tensorflow/stream_executor/cuda/cusparse_stub.cc @@ -53,14 +53,12 @@ cusparseStatus_t GetSymbolNotFoundError() { #if CUDA_VERSION < 10000 #include "tensorflow/stream_executor/cuda/cusparse_9_0.inc" -#elif CUDA_VERSION == 10000 +#elif CUDA_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cusparse_10_0.inc" -#elif CUDA_VERSION == 10010 +#elif CUDA_VERSION < 10020 #include "tensorflow/stream_executor/cuda/cusparse_10_1.inc" -#elif CUDA_VERSION == 10020 +#elif CUDA_VERSION < 11000 #include "tensorflow/stream_executor/cuda/cusparse_10_2.inc" -#elif CUSPARSE_VER_MAJOR == 11 && CUSPARSE_VER_MINOR == 0 -#include "tensorflow/stream_executor/cuda/cusparse_11_0.inc" #else -#error "We don't have a wrapper for this version." +#include "tensorflow/stream_executor/cuda/cusparse_11_0.inc" #endif From 3d4ca5a00a58aae4273c682f05dcb6665def8d02 Mon Sep 17 00:00:00 2001 From: Michael Banfield Date: Wed, 17 Jun 2020 08:02:15 -0700 Subject: [PATCH 0394/1390] Support fetching configured runtime version without using Cloud TPU API. 
PiperOrigin-RevId: 316888815 Change-Id: I9f840076122e220c80b7b301f2290a6d4f595f1a --- tensorflow/python/tpu/client/client.py | 20 +++++++++++++++++++- tensorflow/python/tpu/client/client_test.py | 16 ++++++++++++++++ tensorflow/python/tpu/client/version.py | 2 +- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tpu/client/client.py b/tensorflow/python/tpu/client/client.py index 972f02437d8..c834a57c153 100644 --- a/tensorflow/python/tpu/client/client.py +++ b/tensorflow/python/tpu/client/client.py @@ -20,6 +20,7 @@ from __future__ import division from __future__ import print_function import datetime +import json import logging import os import time @@ -48,6 +49,7 @@ _DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL' _GCE_METADATA_ENDPOINT = 'http://metadata.google.internal' _DEFAULT_ENDPOINT_PORT = '8470' _OOM_EVENT_COOL_TIME_SEC = 90 +_VERSION_SWITCHER_ENDPOINT = 'http://{}:8475/requestversion' def _utcnow(): @@ -277,6 +279,22 @@ class Client(object): def runtime_version(self): """Return runtime version of the TPU.""" + + if not self._use_api: + # Fallback on getting version directly from TPU. + url = _VERSION_SWITCHER_ENDPOINT.format( + self.network_endpoints()[0]['ipAddress']) + try: + req = request.Request(url) + resp = request.urlopen(req) + version_details = json.loads(resp.read()) + return version_details.get('currentVersion') + except HTTPError as e: + status_code = e.code + if status_code == 404: + return None + else: + raise e return self._get_tpu_property('tensorflowVersion') def accelerator_type(self): @@ -350,7 +368,7 @@ class Client(object): be sent. """ ip_address = worker['ipAddress'] - url = 'http://{}:8475/requestversion/{}?restartType={}'.format( + url = (_VERSION_SWITCHER_ENDPOINT + '/{}?restartType={}').format( ip_address, version, restart_type) req = request.Request(url, data=b'') try: diff --git a/tensorflow/python/tpu/client/client_test.py b/tensorflow/python/tpu/client/client_test.py index 9d7f29ad476..f53f09cd3d5 100644 --- a/tensorflow/python/tpu/client/client_test.py +++ b/tensorflow/python/tpu/client/client_test.py @@ -630,6 +630,22 @@ class CloudTpuClientTest(test.TestCase): 'http://5.6.7.8:8475/requestversion/1.15?restartType=ifNeeded' ], sorted(paths)) + @mock.patch.object(request, 'urlopen') + def testGetTpuVersion(self, urlopen): + c = client.Client( + tpu='grpc://1.2.3.4:8470') + resp = mock.Mock() + resp.read.side_effect = ['{}', '{"currentVersion": "someVersion"}'] + urlopen.return_value = resp + self.assertIsNone(c.runtime_version(), 'Missing key should be handled.') + self.assertEqual( + 'someVersion', c.runtime_version(), 'Should return configured version.') + paths = [call[0][0].full_url for call in urlopen.call_args_list] + self.assertCountEqual([ + 'http://1.2.3.4:8475/requestversion', + 'http://1.2.3.4:8475/requestversion', + ], sorted(paths)) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/tpu/client/version.py b/tensorflow/python/tpu/client/version.py index 001059a91da..a91586640fc 100644 --- a/tensorflow/python/tpu/client/version.py +++ b/tensorflow/python/tpu/client/version.py @@ -18,4 +18,4 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -__version__ = "0.9" +__version__ = "0.10" From 23be4f5d44f28734620fa508d9807d9aca2ce074 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 17 Jun 2020 08:32:24 -0700 Subject: [PATCH 0395/1390] - Fix PropagatePotentiallyWrittenWithinUnhandledOp() to mark resource uses within 
regions as potentially written when a resource operand is seen. This also fixes the case when multiple resources are used as operands in the same unhandled op - Add test case to demonstrate the issue PiperOrigin-RevId: 316893534 Change-Id: I9e688a90155efd990eb5ef835c23933825bcbdd0 --- ...f_saved_model_optimize_global_tensors.mlir | 88 ++++++++++++++++++- .../transforms/optimize_global_tensors.cc | 22 ++--- 2 files changed, 95 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir index 9d8911d306d..0c68cf0cf64 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir @@ -1,7 +1,7 @@ // RUN: tf-opt -tf-saved-model-optimize-global-tensors -split-input-file %s | FileCheck %s //===----------------------------------------------------------------------===// -// Freezing. +// Immutability. //===----------------------------------------------------------------------===// module attributes {tf_saved_model.semantics} { @@ -142,3 +142,89 @@ module attributes {tf_saved_model.semantics} { // Test running the pass on a module that does not have // tf_saved_model.semantics. module {} + +// ----- + +// Test use as an input in unhandled op +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled_op"(%arg0) : (tensor>>) -> () + return + } +} + + +// ----- + +// Test use as a region capture in an unhandled op +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled"() ({ + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + "tf.unhandled_terminator"() : () -> () + }) : () -> () + return + } +} + +// ----- + +// Test use as region capture as well as input in an unhandled op +// to the unhandled op. +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "u", type = tensor, value = dense<22.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}, %arg1: tensor>> {tf_saved_model.bound_input = @u}) + attributes {tf_saved_model.exported_names = ["f"]} { + %0 = "tf.unhandled"(%arg0) ({ + %val = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + "tf.unhandled_terminator"() : () -> () + }) : (tensor>>) -> (tensor>>) + return + } +} + +// ----- + +// Test multiple global tensors uses as operands for an unhandled op. 
+module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "u", type = tensor, value = dense<22.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}, %arg1: tensor>> {tf_saved_model.bound_input = @u}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled"(%arg0, %arg1) : (tensor>>, tensor>>) -> () + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index cd8f988fd5f..07cc6203cbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -56,14 +56,14 @@ struct GlobalTensorUse { using GlobalTensorUsesMap = std::map>; -static bool IsResourceType(Type type) { +bool IsResourceType(Type type) { if (auto tensor_type = type.dyn_cast()) { return tensor_type.getElementType().isa(); } return false; } -static bool IsResource(Value value) { return IsResourceType(value.getType()); } +bool IsResource(Value value) { return IsResourceType(value.getType()); } class ResourceAnalyzer { public: @@ -129,30 +129,24 @@ class ResourceAnalyzer { // this errs on the side of being conservative. We should improve // this by using either a property or a trait that clearly // identifies ops with resource mutating behavior. - if (PropagatePotentiallyWrittenWithinUnhandledOp(op)) { - return; - } + PropagatePotentiallyWrittenWithinUnhandledOp(op); }); return success(); } // If an op is not one of the handled ones, we assume all resource usages // within its purview are mutating in nature. - bool PropagatePotentiallyWrittenWithinUnhandledOp(Operation* op) { + void PropagatePotentiallyWrittenWithinUnhandledOp(Operation* op) { for (auto operand : op->getOperands()) { if (IsResource(operand)) { SetPotentiallyWritten(operand); - return true; } } - bool uses_resources = false; visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand* operand) { if (IsResource(operand->get())) { SetPotentiallyWritten(operand->get()); - uses_resources = true; } }); - return uses_resources; } // Given a funcOp associated with the callee and operands from the @@ -212,7 +206,7 @@ bool IsImmutable(GlobalTensorOp global_tensor, return true; } -static GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) { +GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) { GlobalTensorUsesMap global_tensor_uses; SymbolTable symbol_table(module); @@ -293,13 +287,13 @@ void OptimizeGlobalTensorsPass::runOnOperation() { EraseUnusedGlobalTensors(module, global_tensor_uses); } -} // namespace - // For "opt" to pick up this pass. -static PassRegistration pass( +PassRegistration pass( "tf-saved-model-optimize-global-tensors", "Optimize tf_saved_model.global_tensor's."); +} // namespace + std::unique_ptr> CreateOptimizeGlobalTensorsPass() { return std::make_unique(); } From fc296acdc1d454596d9e0e531656858f3b0acca6 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 17 Jun 2020 09:00:01 -0700 Subject: [PATCH 0396/1390] Make axis handling for Normalization more robust. 
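A short sketch of the more robust axis handling (hedged example; it imports the internal module touched by this change rather than the public Keras symbol):

    import numpy as np
    from tensorflow.python.keras.layers.preprocessing import normalization

    data = np.array([[[0., 1., 2.], [0., 2., 6.]],
                     [[2., 3., 4.], [3., 6., 10.]]])  # shape [2, 2, 3]

    # axis=(1, 2), (2, 1), (1, -1) and (-1, 1) now all keep the same two axes,
    # so they produce identical normalization statistics.
    layer = normalization.Normalization(axis=(1, -1))
    layer.adapt(data)
    print(layer(data))

    # The batch axis is always reduced: axis=0 is rejected at construction
    # time, and out-of-range values such as axis=3 are rejected in build().
    # normalization.Normalization(axis=0)   # raises ValueError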
PiperOrigin-RevId: 316898233 Change-Id: I6888216ed21c4d2a482772fb2a314160750185b6 --- .../layers/preprocessing/normalization.py | 67 ++++++++++++------- .../preprocessing/normalization_test.py | 43 ++++++++++++ 2 files changed, 86 insertions(+), 24 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index 09564cbb064..ba2f7eaae89 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -53,10 +53,13 @@ class Normalization(CombinerPreprocessingLayer): Attributes: axis: Integer or tuple of integers, the axis or axes that should be - normalized (typically the features axis). We will normalize each element - in the specified axis. If set to 'None', the layer will perform scalar - normalization (diving the input by a single scalar value). 0 (the batch - axis) is not allowed. + "kept". These axes are not be summed over when calculating the + normalization statistics. By default the last axis, the `features` axis + is kept and any `space` or `time` axes are summed. Each element in the + the axes that are kept is normalized independently. If `axis` is set to + 'None', the layer will perform scalar normalization (diving the input + by a single scalar value). The `batch` axis, 0, is always summed over + (`axis=0` is not allowed). Examples: @@ -78,10 +81,18 @@ class Normalization(CombinerPreprocessingLayer): # time, the dtype value will change to reflect it. dtype = dtype or K.floatx() + # Standardize `axis` to a tuple. + if axis is None: + axis = () + elif isinstance(axis, int): + axis = (axis,) + else: + axis = tuple(axis) + super(Normalization, self).__init__( combiner=_NormalizingCombiner(axis), dtype=dtype, **kwargs) - if axis == 0: + if 0 in axis: raise ValueError('The argument \'axis\' may not be 0.') self.axis = axis @@ -90,18 +101,27 @@ class Normalization(CombinerPreprocessingLayer): input_shape = tensor_shape.TensorShape(input_shape).as_list() if len(input_shape) == 1: input_shape = input_shape + [1] + + ndim = len(input_shape) + + # Sort `self.axis` to avoid transposing `mean_and_var_shape`. + # Negative axes are not sortable until you know the number of dimensions. + original_axis = self.axis + self.axis = tuple(sorted(self.axis, + key=lambda a: a if a >= 0 else ndim + a)) + + if any(a < 1-ndim for a in self.axis) or any(a >= ndim for a in self.axis): + raise ValueError('All `axis` values must be in ' + 'the range [1-ndim, ndim-1].\n' + 'Got:\n' + ' ndim: {}\n' + ' axis: {}'.format(ndim, original_axis)) + self._broadcast_shape = [1 for _ in range(len(input_shape))] - if isinstance(self.axis, (tuple, list)): - mean_and_var_shape = [] - for i in self.axis: - mean_and_var_shape.append(input_shape[i]) - self._broadcast_shape[i] = input_shape[i] - else: - if self.axis is None: - mean_and_var_shape = () - else: - mean_and_var_shape = input_shape[self.axis] - self._broadcast_shape[self.axis] = input_shape[self.axis] + mean_and_var_shape = [] + for i in self.axis: + mean_and_var_shape.append(input_shape[i]) + self._broadcast_shape[i] = input_shape[i] # count is not used in this class's call() method, but is used to re-create # the accumulator during multiple calls to 'adapt'. @@ -179,11 +199,13 @@ class _NormalizingCombiner(Combiner): if values.ndim == 1: values = np.expand_dims(values, 1) + # `np.delete` ignores negative indexes, so use a mask to delete items. 
+ axis_mask = np.ones([values.ndim], dtype=bool) + axis_mask[np.array(self.axis, dtype=np.int32)] = False + # This is the shape of all reduced axes (not specified in 'axis'). - if self.axis is None: - reduction_counts = values.shape - else: - reduction_counts = np.delete(values.shape, self.axis) + + reduction_counts = np.array(values.shape)[axis_mask] # We get the number of elements that will be reduced by multiplying all # values of 'shape' corresponding to the reduced axes. count = np.prod(reduction_counts, dtype=np.int64) @@ -191,10 +213,7 @@ class _NormalizingCombiner(Combiner): # We want to reduce across dimensions except those specified in 'axis' # when using np.mean or np.variance; create the tuple of axes to reduce # over here. - if self.axis is None: - reduction_axes = None - else: - reduction_axes = tuple(np.delete(range(values.ndim), self.axis)) + reduction_axes = tuple(np.arange(values.ndim)[axis_mask]) mean = np.mean(values, axis=reduction_axes, dtype=np.float64) variance = np.var(values, axis=reduction_axes, dtype=np.float64) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index 75ef9370899..f5f68d9c51a 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -275,6 +275,49 @@ class NormalizationTest(keras_parameterized.TestCase, if context.executing_eagerly(): self.assertAllClose(output.numpy(), [[-1], [1], [-1], [1]]) + @parameterized.parameters( + {"axis": 0}, + {"axis": (-1, 0)}, + ) + def test_zeros_fail_init(self, axis): + cls = get_layer_class() + with self.assertRaisesRegex(ValueError, + "The argument 'axis' may not be 0."): + cls(axis=axis) + + @parameterized.parameters( + # Out of bounds + {"axis": 3}, + {"axis": -3}, + # In a tuple + {"axis": (1, 3)}, + {"axis": (1, -3)}, + ) + def test_bad_axis_fail_build(self, axis): + cls = get_layer_class() + layer = cls(axis=axis) + with self.assertRaisesRegex(ValueError, + r"in the range \[1-ndim, ndim-1\]."): + layer.build([None, 2, 3]) + + @parameterized.parameters( + # Results should be identical no matter how the axes are specified (3d). + {"axis": (1, 2)}, + {"axis": (2, 1)}, + {"axis": (1, -1)}, + {"axis": (-1, 1)}, + ) + def test_axis_permutations(self, axis): + cls = get_layer_class() + layer = cls(axis=axis) + # data.shape = [2, 2, 3] + data = np.array([[[0., 1., 2.], [0., 2., 6.]], + [[2., 3., 4.], [3., 6., 10.]]]) + expect = np.array([[[-1., -1., -1.], [-1., -1., -1.]], + [[1., 1., 1.], [1., 1., 1.]]]) + layer.adapt(data) + self.assertAllClose(expect, layer(data)) + if __name__ == "__main__": test.main() From cb264418f6be7a8ce4fcfc6ee19a60404e428162 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 17 Jun 2020 09:19:04 -0700 Subject: [PATCH 0397/1390] Add tf.strings.format() and tf.print() to support RaggedTensors. 
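Example of the new behavior (a small sketch; the values are arbitrary):

    import tensorflow as tf

    rt = tf.ragged.constant([[1, 2], [3]])

    # tf.strings.format now renders RaggedTensor inputs instead of raising.
    print(tf.strings.format("Hi {}", [rt]).numpy())   # b'Hi [[1, 2], [3]]'

    # tf.print dispatches to the same ragged-aware formatting.
    tf.print(rt)                                      # [[1, 2], [3]]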
PiperOrigin-RevId: 316901841 Change-Id: I5fa78acc118557fcf43d1b805149172f8547c0e1 --- tensorflow/python/ops/ragged/BUILD | 28 ++- .../python/ops/ragged/ragged_dispatch.py | 11 +- .../python/ops/ragged/ragged_dispatch_test.py | 22 +- .../python/ops/ragged/ragged_print_op_test.py | 195 ++++++++++++++++++ .../python/ops/ragged/ragged_string_ops.py | 117 ++++++++++- 5 files changed, 359 insertions(+), 14 deletions(-) create mode 100644 tensorflow/python/ops/ragged/ragged_print_op_test.py diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD index b2a02b82454..95e5602a246 100644 --- a/tensorflow/python/ops/ragged/BUILD +++ b/tensorflow/python/ops/ragged/BUILD @@ -1264,9 +1264,35 @@ py_test( srcs_version = "PY2AND3", deps = [ ":ragged_array_ops", - "//tensorflow/python:constant_op", + ":ragged_factory_ops", + ":ragged_tensor", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", "//tensorflow/python:framework_test_lib", "//tensorflow/python:platform_test", + "//tensorflow/python:tensor_shape", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +py_test( + name = "ragged_print_op_test", + srcs = ["ragged_print_op_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":ragged", # fixdeps: keep + ":ragged_factory_ops", + ":ragged_string_ops", + ":ragged_tensor", + "//tensorflow/python:constant_op", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:logging_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:sparse_ops", + "//tensorflow/python/eager:def_function", "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py index f13bed07ba0..5c9388b8677 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_bitwise_ops +from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops @@ -510,6 +511,7 @@ _RAGGED_DISPATCH_OPS = [ ['data', 'segment_ids']), (math_ops.unsorted_segment_sqrt_n, ragged_math_ops.segment_sqrt_n, ['data', 'segment_ids']), + (string_ops.string_format, ragged_string_ops.string_format, ['[inputs]']), (string_ops.reduce_join_v2, ragged_string_ops.reduce_join, ['inputs']), (math_ops.reduce_sum, ragged_math_ops.reduce_sum, ['input_tensor']), (math_ops.reduce_prod, ragged_math_ops.reduce_prod, ['input_tensor']), @@ -549,7 +551,7 @@ def register_dispatchers(): RaggedDispatcher(original_op, ragged_op, args).register(original_op) -def _ragged_op_signature(op, ragged_args): +def _ragged_op_signature(op, ragged_args, ragged_varargs=False): """Returns a signature for the given op, marking ragged args in bold.""" op_name = tf_export.get_canonical_name_for_symbol(op) argspec = tf_inspect.getfullargspec(op) @@ -566,7 +568,10 @@ def _ragged_op_signature(op, ragged_args): # Add varargs and keyword args if argspec.varargs: - arg_names.append('*' + argspec.varargs) + if ragged_varargs: + arg_names.append('***' + argspec.varargs + '**') + else: + arg_names.append('*' + argspec.varargs) if argspec.varkw: arg_names.append('**' + argspec.varkw) @@ -597,6 +602,8 @@ def 
ragged_op_list(tf_version=1): arginfos = _get_arg_infos(op, ragged_args) ragged_args = [arginfo.position for arginfo in arginfos] lines.append(_ragged_op_signature(op, ragged_args)) + lines.append( + _ragged_op_signature(logging_ops.print_v2, [], ragged_varargs=True)) return ('\n\n### Additional ops that support `RaggedTensor`\n\n' 'Arguments that accept `RaggedTensor`s are marked in **bold**.\n\n' + '\n'.join(sorted(lines)) + 'n') diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py index 60d9f6c8713..193e329e18a 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py @@ -142,8 +142,7 @@ BINARY_INT_OPS = [ # pylint: disable=g-complex-comprehension @test_util.run_all_in_graph_and_eager_modes -class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, - parameterized.TestCase): +class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase): def assertSameShape(self, x, y): """Checks that x and y have the same shape (including ragged shapes).""" @@ -763,7 +762,12 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 'tensor': ragged_factory_ops.constant_value([[1, 2, 3], [4, 5]]), 'axis': [0, -1] }, - expected=ragged_factory_ops.constant_value([[5, 4], [3, 2, 1]])) + expected=ragged_factory_ops.constant_value([[5, 4], [3, 2, 1]])), + dict( + op=string_ops.string_format, + kwargs={'template': 'Hi {}', + 'inputs': [ragged_factory_ops.constant_value([[1, 2], [3]])]}, + expected='Hi [[1, 2], [3]]'), ]) def testRaggedDispatch(self, op, expected, args=(), result_is_list=False, kwargs=None): @@ -819,14 +823,14 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 'math.unsorted_segment_mean', 'math.unsorted_segment_min', 'math.unsorted_segment_prod', 'math.unsorted_segment_sqrt_n', 'math.unsorted_segment_sum', 'one_hot', 'ones_like', 'rank', 'realdiv', - 'reduce_all', 'size', 'squeeze', 'stack', 'strings.as_string', + 'math.reduce_all', 'size', 'squeeze', 'stack', 'strings.as_string', 'strings.join', 'strings.length', 'strings.reduce_join', 'strings.regex_full_match', 'strings.regex_replace', 'strings.strip', 'strings.substr', 'strings.to_hash_bucket_fast', 'strings.to_hash_bucket_strong', 'strings.to_hash_bucket', 'strings.to_number', 'strings.unicode_script', 'tile', 'truncatediv', 'truncatemod', 'zeros_like', 'dynamic_partition', 'reverse', - 'nn.dropout', + 'nn.dropout', 'strings.format', 'print' ] # Ops that should be listed as supported in v1 only. @@ -838,15 +842,15 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, v1_ragged_ops = ragged_dispatch.ragged_op_list(tf_version=1) for element in supported_ops + supported_ops_v1: - self.assertIn(element, v1_ragged_ops) + self.assertIn('`tf.' + element + '`', v1_ragged_ops) for element in supported_ops_v2: - self.assertNotIn(element, v1_ragged_ops) + self.assertNotIn('`tf.' + element + '`', v1_ragged_ops) v2_ragged_ops = ragged_dispatch.ragged_op_list(tf_version=2) for element in supported_ops + supported_ops_v2: - self.assertIn(element, v2_ragged_ops) + self.assertIn('`tf.' + element + '`', v2_ragged_ops) for element in supported_ops_v1: - self.assertNotIn(element, v2_ragged_ops) + self.assertNotIn('`tf.' 
+ element + '`', v2_ragged_ops) if __name__ == '__main__': diff --git a/tensorflow/python/ops/ragged/ragged_print_op_test.py b/tensorflow/python/ops/ragged/ragged_print_op_test.py new file mode 100644 index 00000000000..2b612d463d0 --- /dev/null +++ b/tensorflow/python/ops/ragged/ragged_print_op_test.py @@ -0,0 +1,195 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tf.print with ragged tensors. + +Note: ragged support for tf.print is implemented by RaggedPrintV2Dispatcher in +ragged_dispatch.py. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path +import tempfile +from absl.testing import parameterized +from tensorflow.python.eager import def_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import test_util +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.ops.ragged import ragged_string_ops +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import googletest + + +@test_util.run_all_in_graph_and_eager_modes +class RaggedPrintV2Test(test_util.TensorFlowTestCase, parameterized.TestCase): + + # pylint: disable=g-long-lambda + @parameterized.named_parameters([ + dict( + testcase_name='2d_int_values', + inputs=lambda: [ragged_factory_ops.constant([[1, 2], [3]])], + expected='[[1, 2], [3]]\n'), + dict( + testcase_name='3d_int_values', + inputs=lambda: [ragged_factory_ops.constant([[[1, 2], [3]], [[4]]])], + expected='[[[1, 2], [3]], [[4]]]\n'), + dict( + testcase_name='2d_str_values', + inputs=lambda: [ragged_factory_ops.constant([['a', 'b'], ['c']])], + expected="[['a', 'b'], ['c']]\n"), + dict( + testcase_name='2d_str_values_with_escaping', + inputs=lambda: [ragged_factory_ops.constant([["a'b"], ['c"d']])], + expected="[['a\\'b'], ['c\"d']]\n"), + dict( + testcase_name='two_ragged_values', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2], [3]]), + ragged_factory_ops.constant([[5], [], [6, 7, 8]]) + ], + expected='[[1, 2], [3]] [[5], [], [6, 7, 8]]\n'), + dict( + testcase_name='ragged_value_and_non_tensor_values', + inputs=lambda: + ['a', 5, True, + ragged_factory_ops.constant([[1, 2], [3]]), 'c'], + expected='a 5 True [[1, 2], [3]] c\n'), + dict( + testcase_name='ragged_value_and_dense_value', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2], [3]]), + constant_op.constant([[1, 2], [3, 4]]) + ], + expected='[[1, 2], [3]] [[1 2]\n [3 4]]\n'), + dict( + testcase_name='ragged_value_and_sparse_value', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2], [3]]), + sparse_ops.from_dense([[1]]) + ], + expected=( + '[[1, 2], [3]] ' + "'SparseTensor(indices=[[0 0]], values=[1], shape=[1 1])'\n")), + dict( + 
testcase_name='summarize_default', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2, 3, 4, 5, 6, 7, 8, 9], [10], [ + ], [], [], [], [11, 12]]) + ], + expected=('[[1, 2, 3, ..., 7, 8, 9], [10], [], ' + '..., ' + '[], [], [11, 12]]\n')), + dict( + testcase_name='summarize_2', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2, 3, 4, 5, 6, 7, 8, 9], [10], [ + ], [], [], [], [11, 12]]) + ], + summarize=2, + expected='[[1, 2, ..., 8, 9], [10], ..., [], [11, 12]]\n'), + dict( + testcase_name='summarize_neg1', + inputs=lambda: [ + ragged_factory_ops.constant([[1, 2, 3, 4, 5, 6, 7, 8, 9], [10], [ + ], [], [], [], [11, 12]]) + ], + summarize=-1, + expected=('[[1, 2, 3, 4, 5, 6, 7, 8, 9], [10], ' + '[], [], [], [], [11, 12]]\n')), + ]) + def testRaggedPrint(self, inputs, expected, summarize=None): + if callable(inputs): + inputs = inputs() + with tempfile.TemporaryDirectory() as tmpdirname: + path = os.path.join(tmpdirname, 'print_output') + kwargs = {'output_stream': 'file://{}'.format(path)} + if summarize is not None: + kwargs.update(summarize=summarize) + self.evaluate(logging_ops.print_v2(*inputs, **kwargs)) + actual = open(path, 'r').read() + self.assertEqual(repr(actual), repr(expected)) + + +@test_util.run_all_in_graph_and_eager_modes +class RaggedToStringTest(test_util.TensorFlowTestCase, parameterized.TestCase): + + @parameterized.named_parameters([ + ('2d_int', [[1, 2], [], [3, 4, 5]], '[[1, 2], [], [3, 4, 5]]'), + ('2d_str', [['a'], ['b'], ['c', 'd']], "[['a'], ['b'], ['c', 'd']]"), + ('3d_int', [[[1, 2], []], [[3, 4, 5]]], '[[[1, 2], []], [[3, 4, 5]]]'), + ('escape', [["a'b"], [r'c\d']], r"[['a\'b'], ['c\\d']]"), + dict(testcase_name='2d_empty', rt=[], ragged_rank=1, expected='[]'), + dict(testcase_name='3d_empty', rt=[], ragged_rank=2, expected='[]'), + dict( + testcase_name='3d_rrank1', + rt=[[[1, 2], [3, 4]], [], [[5, 6]]], + ragged_rank=1, + expected='[[[1, 2], [3, 4]], [], [[5, 6]]]'), + dict( + testcase_name='2d_empty_row', rt=[[]], ragged_rank=1, + expected='[[]]'), + dict( + testcase_name='3d_empty_row', rt=[[]], ragged_rank=2, + expected='[[]]'), + dict( + testcase_name='summarize_1', + rt=[[1, 2, 3, 4, 5], [], [6], [7], [8, 9]], + summarize=1, + expected='[[1, ..., 5], ..., [8, 9]]'), + dict( + testcase_name='summarize_2', + rt=[[1, 2, 3, 4, 5], [], [6], [7], [8, 9]], + summarize=2, + expected='[[1, 2, ..., 4, 5], [], ..., [7], [8, 9]]'), + ]) + def testRaggedToString(self, rt, expected, summarize=None, ragged_rank=None): + rt = ragged_factory_ops.constant(rt, ragged_rank=ragged_rank) + actual = ragged_string_ops.ragged_tensor_to_string(rt, summarize=summarize) + self.assertAllEqual(actual, expected) + + @parameterized.named_parameters([ + ('maxelts_BadType', [[1]], "Expected summarize .*, got 'foo'", 'foo'), + ('maxelts_0', [[1]], 'Expected summarize to be .*, got 0', 0), + ('maxelts_Neg2', [[1]], 'Expected summarize to be .*, got -2', -2), + ]) + def testRaggedToStringErrors(self, + rt, + error, + summarize=None, + exception=ValueError): + rt = ragged_factory_ops.constant(rt) + with self.assertRaisesRegex(exception, error): + self.evaluate( + ragged_string_ops.ragged_tensor_to_string(rt, summarize=summarize)) + + def testRaggedToStringUnknownRank(self): + + @def_function.function( + input_signature=[ragged_tensor.RaggedTensorSpec(ragged_rank=1)]) + def f(rt): + return ragged_string_ops.ragged_tensor_to_string(rt) + + with self.assertRaisesRegex( + ValueError, 'RaggedTensor to_string requires ' + 'that rt.shape.rank is not None'): + 
f(ragged_factory_ops.constant([[1, 2], [3]])) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py index 0d9c4d506f3..0ac23c298ba 100755 --- a/tensorflow/python/ops/ragged/ragged_string_ops.py +++ b/tensorflow/python/ops/ragged/ragged_string_ops.py @@ -18,10 +18,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_string_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops.ragged import ragged_array_ops @@ -30,9 +33,14 @@ from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat as util_compat from tensorflow.python.util import deprecation from tensorflow.python.util import dispatch +from tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import tf_export +map_fn_lib = LazyLoader("map_fn_lib", globals(), + "tensorflow.python.ops.map_fn") + + @tf_export("strings.bytes_split") @dispatch.add_dispatch_support def string_bytes_split(input, name=None): # pylint: disable=redefined-builtin @@ -640,7 +648,7 @@ def strings_split_v1(input=None, sep=None, maxsplit=-1, # pylint: disable=redef input, dtype=dtypes.string, name="input") if input.shape.rank == 0: - input = gen_array_ops.expand_dims(input, 0) + input = array_ops.expand_dims(input, 0) if result_type == "SparseTensor": if input.shape.rank == 1: @@ -813,3 +821,108 @@ def ngrams(data, values=output, row_splits=output_splits, validate=False) return array_ops.reshape(output.flat_values, dense_shape) if to_tensor else output + + +def string_format(template, inputs, placeholder="{}", summarize=3, name=None): + """Version of tf.strings.format that handles RaggedTensors.""" + if tensor_util.is_tensor(inputs) or ragged_tensor.is_ragged(inputs): + inputs = [inputs] + + split_template = template.split(placeholder) + if len(inputs) != len(split_template) - 1: + raise ValueError("num placeholders in template and num inputs must match" + ": {} vs {}".format(len(split_template) - 1, len(inputs))) + + with ops.name_scope(name, "StringFormat", [inputs]): + output_pieces = [constant_op.constant(split_template[0])] + for i, input in enumerate(inputs): + if ragged_tensor.is_ragged(input): + output_pieces.append(ragged_tensor_to_string(input, summarize)) + else: + output_pieces.append(string_ops.string_format( + "{}", [input], summarize=summarize)) + output_pieces.append(constant_op.constant(split_template[i + 1])) + if len(output_pieces) == 1: + return output_pieces[0] + else: + return string_ops.reduce_join(output_pieces) + + +def ragged_tensor_to_string(rt, summarize=None): + """Returns a scalar string tensor with the contents of a RaggedTensor. + + Requires that `rt.shape.rank` is not `None`. + + Note: this converts the entire `RaggedTensor` into a single string scalar. + If you want to convert individual elements, use `tf.strings.as_string(rt)`. 
+ + >>> rt1 = tf.ragged.constant([[1, 2, 3], [4, 5]]) + >>> ragged_tensor_to_string(rt1).numpy() + b'[[1, 2, 3], [4, 5]]' + + >>> rt2 = tf.ragged.constant([[['a'], ['b', 'c']], [['d', 'e', 'f'], []]]) + >>> ragged_tensor_to_string(rt2).numpy() + b"[[['a'], ['b', 'c']], [['d', 'e', 'f'], []]]" + + >>> rt3 = tf.ragged.constant([[1], [2, 3, 4, 5, 6], [], [], [7], [8, 9]]) + >>> ragged_tensor_to_string(rt3, summarize=2).numpy() + b'[[1], [2, 3, ..., 5, 6], ..., [7], [8, 9]]' + + Args: + rt: The RaggedTensor that should be converted to a string. + summarize: If specified, then only the first and last `summarize` elements + within each dimension are included in the string. If `-1` or `None`, then + all elements are included. + """ + if (summarize is not None and summarize != -1 and + not (isinstance(summarize, int) and summarize > 0)): + raise ValueError("Expected summarize to be -1 or a positive int, got %r" % + summarize) + with ops.name_scope(None, "AsString", [rt]): + rt = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt) + if rt.shape.rank is None: + raise ValueError("RaggedTensor to_string requires that rt.shape.rank " + "is not None.") + # Convert all elements of `rt` to strings. + if rt.dtype == dtypes.string: + escaped = string_ops.regex_replace(rt.flat_values, r"(['\\])", r"\\\1") + str_t = rt.with_flat_values("'" + escaped + "'") + else: + str_t = rt.with_flat_values(string_ops.as_string(rt.flat_values)) + + return _ragged_tensor_to_string(str_t, summarize) + + +def _ragged_tensor_to_string(string_tensor, summarize): + """Returns a scalar string tensor with the contents of `string_tensor`. + + Args: + string_tensor: A potentially ragged tensor with dtype=string. + summarize: Include only the first and last `summarize` elements of each + dimension. If `-1` or `None`, then include all elements. + + Returns: + A scalar string Tensor. 
+ """ + if string_tensor.shape.rank == 1: + pieces = string_tensor + else: + pieces = map_fn_lib.map_fn( + lambda s: _ragged_tensor_to_string(s, summarize), + string_tensor, + fn_output_signature=tensor_spec.TensorSpec(None, dtypes.string)) + if summarize not in (-1, None): + pieces = control_flow_ops.cond( + _nrows(string_tensor) <= 2 * summarize, + lambda: pieces, + lambda: array_ops.concat( # pylint: disable=g-long-lambda + [pieces[:summarize], ["..."], pieces[-summarize:]], + axis=0)) + return "[" + string_ops.reduce_join(pieces, separator=", ") + "]" + + +def _nrows(tensor, out_type=dtypes.int32): + if isinstance(tensor, ragged_tensor.RaggedTensor): + return tensor.nrows(out_type=out_type) + else: + return array_ops.shape(tensor, out_type=out_type)[0] From fa6074832921268ffe81dfcca6b2133d19721b29 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 17 Jun 2020 09:27:58 -0700 Subject: [PATCH 0398/1390] Recognize "armeabi" cpu as Linux/ARM in XNNPACK backend PiperOrigin-RevId: 316903436 Change-Id: I22ea414026df1f78da63df6ff6c872c65e9bcf5c --- tensorflow/workspace.bzl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index cb1ea721fb0..354c4c353b9 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "bd5fd63a09222cd092f0c058b576cf044fb4074f2c4ce8a6fc32fc43d155f9c7", - strip_prefix = "XNNPACK-ae046f5a5127084bfe41090afdf1c1d4c9874b77", + sha256 = "714d650828b1409e88ccb2a62b36a47827bdcddd875bfcfd3b321fe1b7b1c106", + strip_prefix = "XNNPACK-b8e7b076a0c2e7356a69b8478fcd76498d357b45", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/ae046f5a5127084bfe41090afdf1c1d4c9874b77.zip", - "https://github.com/google/XNNPACK/archive/ae046f5a5127084bfe41090afdf1c1d4c9874b77.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/b8e7b076a0c2e7356a69b8478fcd76498d357b45.zip", + "https://github.com/google/XNNPACK/archive/b8e7b076a0c2e7356a69b8478fcd76498d357b45.zip", ], ) @@ -184,11 +184,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "pthreadpool", - sha256 = "c4d4a16053ec0e5125dbd8ae1d6d2ba99601d6fdcf8601a0d51d02a048c40348", - strip_prefix = "pthreadpool-e1642461b3b0217d23d6664d839a060f54e4e652", + sha256 = "03312bd7d8d9e379d685258963ee8820767158b5946cdd00336ff17dae851001", + strip_prefix = "pthreadpool-029c88620802e1361ccf41d1970bd5b07fd6b7bb", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/e1642461b3b0217d23d6664d839a060f54e4e652.zip", - "https://github.com/Maratyszcza/pthreadpool/archive/e1642461b3b0217d23d6664d839a060f54e4e652.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/029c88620802e1361ccf41d1970bd5b07fd6b7bb.zip", + "https://github.com/Maratyszcza/pthreadpool/archive/029c88620802e1361ccf41d1970bd5b07fd6b7bb.zip", ], ) From 225bdf60f3c4f51ab5568a53d31b0799369a1b89 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 09:30:39 -0700 Subject: [PATCH 0399/1390] Added new way of scalar reading in ParseMultiplyScalar. 
PiperOrigin-RevId: 316903891 Change-Id: I6a160c0a1a83cff9ee6b1df9c3f59ea0477d6c30 --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index dc671a47691..01f94c94888 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -1271,7 +1271,7 @@ class MulOperationParser : public TFLiteOperationParser { GraphFloat32* graph, ObjectReader* reader) { RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); MultiplyAttributes attr; - if (constant_dims->size <= 0) { + if (constant_dims->size <= 0 || NumElements(constant_dims) == 1) { Tensor tensor; RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); attr.param = tensor.data[0]; From 51ccd6911b8bab58df2e8be4f31ced43b04cff96 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 09:31:04 -0700 Subject: [PATCH 0400/1390] Choosing better setting in convolution for Intel. PiperOrigin-RevId: 316903965 Change-Id: I9ff6c2a5026059011b5ccf7beddb8111b419ff8d --- tensorflow/lite/delegates/gpu/cl/cl_device.cc | 6 +++++ tensorflow/lite/delegates/gpu/cl/cl_device.h | 1 + .../delegates/gpu/cl/kernels/conv_powervr.cc | 23 +++++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.cc b/tensorflow/lite/delegates/gpu/cl/cl_device.cc index aea81d5e659..64e07428515 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.cc +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.cc @@ -495,6 +495,12 @@ std::string CLDevice::GetPlatformVersion() const { return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION); } +bool CLDevice::IsCL20OrHigher() const { + return info_.cl_version != OpenCLVersion::CL_1_0 && + info_.cl_version != OpenCLVersion::CL_1_1 && + info_.cl_version != OpenCLVersion::CL_1_2; +} + bool CLDevice::IsAdreno() const { return info_.vendor == Vendor::QUALCOMM; } bool CLDevice::IsAdreno3xx() const { diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.h b/tensorflow/lite/delegates/gpu/cl/cl_device.h index 1df16aa3bad..ae6a1d11af6 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.h @@ -178,6 +178,7 @@ class CLDevice { bool SupportsExtension(const std::string& extension) const; bool SupportsFP32RTN() const; bool SupportsFP16RTN() const; + bool IsCL20OrHigher() const; bool IsAdreno() const; bool IsAdreno3xx() const; bool IsAdreno4xx() const; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index 363a0157420..bd694e7cc4f 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -859,12 +859,27 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; } else if (device.IsIntel()) { + if (different_weights_for_height) { + conv_params.work_group_size = int3(16, 1, 1); + conv_params.work_group_launch_order = int3(0, 1, 2); + conv_params.fixed_work_group_size = true; + } else { + conv_params.linear_hw = true; + conv_params.work_group_size = int3(16, 1, 1); + conv_params.work_group_launch_order = int3(0, 1, 2); + conv_params.fixed_work_group_size = true; + } conv_params.block_size = int3(1, 1, 4); - 
conv_params.work_group_size = int3(8, 2, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); - conv_params.fixed_work_group_size = true; conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; + if (definition.precision != CalculationsPrecision::F32_F16 && + device.SupportsExtension("cl_khr_subgroups") && + device.SupportsExtension("cl_intel_required_subgroup_size") && + device.IsCL20OrHigher()) { + conv_params.weights_upload_type = + WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST; + } else { + conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; + } if (dst_depth % 4 == 0 || dst_depth >= 8) { conv_params.block_size.z = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { From 8b7af2c77091b4e4286aa3ef9efda324646fbf99 Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Wed, 17 Jun 2020 23:41:36 +0700 Subject: [PATCH 0401/1390] Use char** instead of string --- tensorflow/c/env.cc | 10 ++-------- tensorflow/c/env.h | 5 +---- tensorflow/core/platform/path.cc | 2 -- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 43879a18359..6d8528bc42f 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -147,14 +147,8 @@ TF_StringStream* TF_GetLocalTempDirectories() { return list; } -void TF_GetTempFileName(const char* extension, std::string* name, - TF_Status* status) { - *name = ::tensorflow::io::GetTempFilename(extension); - if (name->length() == 0) { - TF_SetStatus(status, TF_INTERNAL, "Can not get temp file name"); - } else { - TF_SetStatus(status, TF_OK, ""); - } +void TF_GetTempFileName(const char* extension, char** name) { + *name = strdup(::tensorflow::io::GetTempFilename(extension).c_str()); } TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) { diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index 273a3b5e142..29ec417a75e 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -20,8 +20,6 @@ limitations under the License. #include #include -#include - #include "tensorflow/c/c_api.h" #include "tensorflow/c/tf_file_statistics.h" @@ -157,8 +155,7 @@ TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void); // Creates a temporary file name with an extension. // The caller is responsible for freeing the returned pointer. TF_CAPI_EXPORT extern void TF_GetTempFileName(const char* extension, - std::string* name, - TF_Status* status); + char** name); // Returns the number of nanoseconds since the Unix epoch. TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void); diff --git a/tensorflow/core/platform/path.cc b/tensorflow/core/platform/path.cc index f9442ccba0f..1e88328aace 100644 --- a/tensorflow/core/platform/path.cc +++ b/tensorflow/core/platform/path.cc @@ -327,8 +327,6 @@ string GetTempFilename(const string& extension) { } LOG(FATAL) << "No temp directory found."; #endif - // Return an empty string to indicate that we can not create temp file name. - return ""; } bool GetTestUndeclaredOutputsDir(string* dir) { From b780ee931b1d4c14aae7ac534c937a88204d52c2 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 17 Jun 2020 09:33:05 -0700 Subject: [PATCH 0402/1390] Replace the mechanism used to register & look up Python types from c code in tensorflow/python/util.h with one that supports non-type symbols as well. 
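A hedged sketch of the new entry point. The import below mirrors existing RegisterType call sites and may differ by build, and the name "example_value" is purely illustrative:

    from tensorflow.python import _pywrap_utils

    def _example_value(structure):
      return [structure]

    # RegisterPyObject accepts arbitrary Python values, not just types; C++
    # code can later fetch the registered value by name via
    # tensorflow::GetRegisteredPyObject("example_value").
    _pywrap_utils.RegisterPyObject("example_value", _example_value)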
PiperOrigin-RevId: 316904361 Change-Id: I4ec98c861742efddcebd140ff9e1a6ff567cc94c --- tensorflow/python/util/util.cc | 56 +++++++++++++------ tensorflow/python/util/util.h | 15 ++++- tensorflow/python/util/util_wrapper.cc | 4 ++ .../tools/def_file_filter/symbols_pybind.txt | 1 + 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index 1d0dd695d74..cf8581443e7 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -29,15 +29,25 @@ limitations under the License. namespace tensorflow { namespace swig { -std::unordered_map* PythonTypesMap() { +namespace { +string PyObjectToString(PyObject* o); +} // namespace + +std::unordered_map* RegisteredPyObjectMap() { static auto* m = new std::unordered_map(); return m; } -PyObject* GetRegisteredType(const string& key) { - auto* m = PythonTypesMap(); - auto it = m->find(key); - if (it == m->end()) return nullptr; +PyObject* GetRegisteredPyObject(const string& name) { + const auto* m = RegisteredPyObjectMap(); + auto it = m->find(name); + if (it == m->end()) { + PyErr_SetString(PyExc_TypeError, + tensorflow::strings::StrCat("No object with name ", name, + " has been registered.") + .c_str()); + return nullptr; + } return it->second; } @@ -49,26 +59,35 @@ PyObject* RegisterType(PyObject* type_name, PyObject* type) { .c_str()); return nullptr; } + return RegisterPyObject(type_name, type); +} +PyObject* RegisterPyObject(PyObject* name, PyObject* value) { string key; - if (PyBytes_Check(type_name)) { - key = PyBytes_AsString(type_name); - } + if (PyBytes_Check(name)) { + key = PyBytes_AsString(name); #if PY_MAJOR_VERSION >= 3 - if (PyUnicode_Check(type_name)) { - key = PyUnicode_AsUTF8(type_name); - } + } else if (PyUnicode_Check(name)) { + key = PyUnicode_AsUTF8(name); #endif - - if (PythonTypesMap()->find(key) != PythonTypesMap()->end()) { + } else { PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat( - "Type already registered for ", key) + "Expected name to be a str, got", + PyObjectToString(name)) .c_str()); return nullptr; } - Py_INCREF(type); - PythonTypesMap()->emplace(key, type); + auto* m = RegisteredPyObjectMap(); + if (m->find(key) != m->end()) { + PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat( + "Value already registered for ", key) + .c_str()); + return nullptr; + } + + Py_INCREF(value); + m->emplace(key, value); Py_RETURN_NONE; } @@ -196,7 +215,7 @@ class CachedTypeCheck { // Returns 0 otherwise. // Returns -1 if an error occurred (e.g., if 'type_name' is not registered.) int IsInstanceOfRegisteredType(PyObject* obj, const char* type_name) { - PyObject* type_obj = GetRegisteredType(type_name); + PyObject* type_obj = GetRegisteredPyObject(type_name); if (TF_PREDICT_FALSE(type_obj == nullptr)) { PyErr_SetString(PyExc_RuntimeError, tensorflow::strings::StrCat( @@ -513,7 +532,8 @@ class AttrsValueIterator : public ValueIterator { }; bool IsSparseTensorValueType(PyObject* o) { - PyObject* sparse_tensor_value_type = GetRegisteredType("SparseTensorValue"); + PyObject* sparse_tensor_value_type = + GetRegisteredPyObject("SparseTensorValue"); if (TF_PREDICT_FALSE(sparse_tensor_value_type == nullptr)) { return false; } diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h index 23438b43c53..fc0b864416e 100644 --- a/tensorflow/python/util/util.h +++ b/tensorflow/python/util/util.h @@ -19,6 +19,8 @@ limitations under the License. 
#include +#include + namespace tensorflow { namespace swig { @@ -270,11 +272,20 @@ PyObject* FlattenForData(PyObject* nested); PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2, bool check_types); -// RegisterType is used to pass PyTypeObject (which is defined in python) for an -// arbitrary identifier `type_name` into C++. +// Registers a Python object so it can be looked up from c++. The set of +// valid names, and the expected values for those names, are listed in +// the documentation for `RegisteredPyObjects`. Returns PyNone. +PyObject* RegisterPyObject(PyObject* name, PyObject* value); + +// Variant of RegisterPyObject that requires the object's value to be a type. PyObject* RegisterType(PyObject* type_name, PyObject* type); } // namespace swig + +// Returns a borrowed reference to an object that was registered with +// RegisterPyObject. (Do not call PY_DECREF on the result). +PyObject* GetRegisteredPyObject(const std::string& name); + } // namespace tensorflow #endif // TENSORFLOW_PYTHON_UTIL_UTIL_H_ diff --git a/tensorflow/python/util/util_wrapper.cc b/tensorflow/python/util/util_wrapper.cc index dd74306413c..63c70d785cc 100644 --- a/tensorflow/python/util/util_wrapper.cc +++ b/tensorflow/python/util/util_wrapper.cc @@ -30,6 +30,10 @@ PYBIND11_MODULE(_pywrap_utils, m) { return tensorflow::PyoOrThrow( tensorflow::swig::RegisterType(type_name.ptr(), type.ptr())); }); + m.def("RegisterPyObject", [](const py::handle& name, const py::handle& type) { + return tensorflow::PyoOrThrow( + tensorflow::swig::RegisterPyObject(name.ptr(), type.ptr())); + }); m.def( "IsTensor", [](const py::handle& o) { diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt index e72ef973ff2..07f5906aa08 100644 --- a/tensorflow/tools/def_file_filter/symbols_pybind.txt +++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt @@ -17,6 +17,7 @@ tensorflow::swig::Flatten tensorflow::swig::IsSequenceForData tensorflow::swig::FlattenForData tensorflow::swig::AssertSameStructureForData +tensorflow::swig::RegisterPyObject tensorflow::swig::RegisterType tensorflow::swig::IsEagerTensorSlow From d8e0beacd934a3a4e7ee3c14240a9e398a929a27 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 09:33:37 -0700 Subject: [PATCH 0403/1390] Reshape&Reshapex4 converted to new style. 
PiperOrigin-RevId: 316904479 Change-Id: I7c1fb0ca5a31fc1f82545d70cfcdcfb7d63bcd6a --- .../lite/delegates/gpu/cl/kernels/reshape.cc | 185 ++++++++---------- .../delegates/gpu/cl/kernels/reshapex4.cc | 127 ++++++------ 2 files changed, 140 insertions(+), 172 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc index e1589e9d682..a99fff0a1da 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc @@ -25,92 +25,24 @@ namespace gpu { namespace cl { namespace { -std::string GetReshapeBatchedCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); +std::string GetReshapeBatchedCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int src_channels, \n"; - c += " int dst_channels \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z || B >= " - "dst_size.w) return;\n"; - c += " FLT temps[4];\n"; - c += " temps[0] = (FLT)(0.0f);\n"; - c += " temps[1] = (FLT)(0.0f);\n"; - c += " temps[2] = (FLT)(0.0f);\n"; - c += " temps[3] = (FLT)(0.0f);\n"; - c += " int base = ((B * dst_size.y + Y)* dst_size.x + X)* dst_channels + Z " - "* 4;\n"; - c += " for (int i = 0; i < 4; ++i) {\n"; - c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < dst_channels) {;\n"; - c += " int p = base + i;\n"; - c += " int src_c = p % src_channels;\n"; - c += " p = p / src_channels;\n"; - c += " int src_x = p % src_size.x;\n"; - c += " p = p / src_size.x;\n"; - c += " int src_y = p % src_size.y;\n"; - c += " int src_b = p / src_size.y;\n"; - c += " int src_z = src_c / 4;\n"; - c += " int src_sub_ch = src_c % 4;\n"; - c += " FLT4 t =" + - src_tensor.ReadWHSB("src_x", "src_y", "src_z", "src_b") + ";\n"; - c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; - c += " temps[i] = t_ar[src_sub_ch];\n"; - c += " }\n"; - c += " }\n"; - c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; - const LinkingContext context{"result", "X * dst_size.w + B", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", "B"); - c += "}\n"; - return c; -} - -std::string GetReshapeCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", 
"src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int src_channels, \n"; - c += " int dst_channels \n"; - c += ") {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) { \n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; c += " return; \n"; c += " } \n"; c += " FLT temps[4];\n"; @@ -118,25 +50,73 @@ std::string GetReshapeCode( c += " temps[1] = (FLT)(0.0f);\n"; c += " temps[2] = (FLT)(0.0f);\n"; c += " temps[3] = (FLT)(0.0f);\n"; + c += " int base = ((B * args.dst_tensor.Height() + Y) * " + "args.dst_tensor.Width() + X) * args.dst_tensor.Channels() + Z * 4;\n"; c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < dst_channels) {;\n"; - c += " int p = dst_channel + dst_channels * (X + dst_size.x * Y);\n"; - c += " int src_c = p % src_channels;\n"; - c += " p = p / src_channels;\n"; - c += " int src_x = p % src_size.x;\n"; - c += " int src_y = p / src_size.x;\n"; + c += " if (dst_channel < args.dst_tensor.Channels()) {;\n"; + c += " int p = base + i;\n"; + c += " int src_c = p % args.src_tensor.Channels();\n"; + c += " p = p / args.src_tensor.Channels();\n"; + c += " int src_x = p % args.src_tensor.Width();\n"; + c += " p = p / args.src_tensor.Width();\n"; + c += " int src_y = p % args.src_tensor.Height();\n"; + c += " int src_b = p / args.src_tensor.Height();\n"; c += " int src_z = src_c / 4;\n"; c += " int src_sub_ch = src_c % 4;\n"; - c += " FLT4 t =" + src_tensor.ReadWHS("src_x", "src_y", "src_z") + ";\n"; + c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z, src_b);\n"; c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; c += " temps[i] = t_ar[src_sub_ch];\n"; c += " }\n"; c += " }\n"; c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; - const LinkingContext context{"result", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X", "Y", "Z"); + c += " args.dst_tensor.Write(result, X, Y, Z, B);\n"; + c += "}\n"; + return c; +} + +std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + + std::string c = GetCommonDefines(op_def.precision); + c += "__kernel void main_function(\n"; + c += "$0) {\n"; + c += " int X = get_global_id(0);\n"; + c += " int Y = get_global_id(1);\n"; + c += " int Z = get_global_id(2);\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; + c += " FLT temps[4];\n"; + c += " temps[0] = (FLT)(0.0f);\n"; + c += " temps[1] = (FLT)(0.0f);\n"; + c += " temps[2] = (FLT)(0.0f);\n"; + c += " temps[3] = (FLT)(0.0f);\n"; + c += " int base = (Y * args.dst_tensor.Width() + X) * " + 
"args.dst_tensor.Channels() + Z * 4;\n"; + c += " for (int i = 0; i < 4; ++i) {\n"; + c += " int dst_channel = Z * 4 + i;\n"; + c += " if (dst_channel < args.dst_tensor.Channels()) {;\n"; + c += " int p = base + i;\n"; + c += " int src_c = p % args.src_tensor.Channels();\n"; + c += " p = p / args.src_tensor.Channels();\n"; + c += " int src_x = p % args.src_tensor.Width();\n"; + c += " int src_y = p / args.src_tensor.Width();\n"; + c += " int src_z = src_c / 4;\n"; + c += " int src_sub_ch = src_c % 4;\n"; + c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z);\n"; + c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; + c += " temps[i] = t_ar[src_sub_ch];\n"; + c += " }\n"; + c += " }\n"; + c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; + c += " args.dst_tensor.Write(result, X, Y, Z);\n"; c += "}\n"; return c; } @@ -157,24 +137,25 @@ Reshape& Reshape::operator=(Reshape&& operation) { } absl::Status Reshape::Compile(const CreationContext& creation_context) { - const auto code = definition_.IsBatchSupported() - ? GetReshapeBatchedCode(definition_, linked_operations_) - : GetReshapeCode(definition_, linked_operations_); + std::string code = definition_.IsBatchSupported() + ? GetReshapeBatchedCode(definition_, &args_) + : GetReshapeCode(definition_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Reshape::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Channels())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Reshape::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc index de6813e741f..0847fce5836 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc @@ -25,82 +25,66 @@ namespace gpu { namespace cl { namespace { -std::string GetReshapeBatchedCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); +std::string GetReshapeBatchedCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); std::string c = 
GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z || B >= " - "dst_size.w) return;\n"; - c += " int dst_bhwc4 = ((B * dst_size.y + Y) * dst_size.x + X) * dst_size.z " - "+ Z;\n"; - c += " int src_z = dst_bhwc4 % src_size.z;\n"; - c += " dst_bhwc4 = dst_bhwc4 / src_size.z;\n"; - c += " int src_x = dst_bhwc4 % src_size.x;\n"; - c += " dst_bhwc4 = dst_bhwc4 / src_size.x;\n"; - c += " int src_y = dst_bhwc4 % src_size.y;\n"; - c += " int src_b = dst_bhwc4 / src_size.y;\n"; - c += " FLT4 result =" + - src_tensor.ReadWHSB("src_x", "src_y", "src_z", "src_b") + ";\n"; - const LinkingContext context{"result", "X * dst_size.w + B", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", "B"); + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; + c += " int dst_bhwc4 = ((B * args.dst_tensor.Height() + Y) * " + "args.dst_tensor.Width() + X) * args.dst_tensor.Slices() + Z;\n"; + c += " int src_z = dst_bhwc4 % args.src_tensor.Slices();\n"; + c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Slices();\n"; + c += " int src_x = dst_bhwc4 % args.src_tensor.Width();\n"; + c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Width();\n"; + c += " int src_y = dst_bhwc4 % args.src_tensor.Height();\n"; + c += " int src_b = dst_bhwc4 / args.src_tensor.Height();\n"; + c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z, src_b);\n"; + c += " args.dst_tensor.Write(result, X, Y, Z, B);\n"; c += "}\n"; return c; } -std::string GetReshapeCode( - const OperationDef& op_def, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { + args->AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int X = get_global_id(0);\n"; c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; - c += " int dst_hwc4 = (Y * dst_size.x + X) * dst_size.z + Z;\n"; - c += " int src_z = dst_hwc4 % src_size.z;\n"; - c += " dst_hwc4 = dst_hwc4 / 
src_size.z;\n"; - c += " int src_x = dst_hwc4 % src_size.x;\n"; - c += " int src_y = dst_hwc4 / src_size.x;\n"; - c += - " FLT4 result =" + src_tensor.ReadWHS("src_x", "src_y", "src_z") + ";\n"; - const LinkingContext context{"result", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X", "Y", "Z"); + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; + c += " int dst_hwc4 = (Y * args.dst_tensor.Width() + X) * " + "args.dst_tensor.Slices() + Z;\n"; + c += " int src_z = dst_hwc4 % args.src_tensor.Slices();\n"; + c += " dst_hwc4 = dst_hwc4 / args.src_tensor.Slices();\n"; + c += " int src_x = dst_hwc4 % args.src_tensor.Width();\n"; + c += " int src_y = dst_hwc4 / args.src_tensor.Width();\n"; + c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z);\n"; + c += " args.dst_tensor.Write(result, X, Y, Z);\n"; c += "}\n"; return c; } @@ -121,22 +105,25 @@ Reshapex4& Reshapex4::operator=(Reshapex4&& operation) { } absl::Status Reshapex4::Compile(const CreationContext& creation_context) { - const auto code = definition_.IsBatchSupported() - ? GetReshapeBatchedCode(definition_, linked_operations_) - : GetReshapeCode(definition_, linked_operations_); + std::string code = definition_.IsBatchSupported() + ? GetReshapeBatchedCode(definition_, &args_) + : GetReshapeCode(definition_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Reshapex4::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Reshapex4::GetGridSize() const { From c870b9f9203c989c7e2940c7f60f2e288e42138c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 17 Jun 2020 09:50:14 -0700 Subject: [PATCH 0404/1390] Qualify uses of std::string PiperOrigin-RevId: 316907769 Change-Id: I01c3ca5f0fb1b1b2284ae5969da968cec7493583 --- .../lite/toco/allocate_transient_arrays.cc | 35 ++-- tensorflow/lite/toco/args.cc | 26 +-- tensorflow/lite/toco/args.h | 66 +++---- tensorflow/lite/toco/dump_graphviz.cc | 103 +++++----- tensorflow/lite/toco/dump_graphviz.h | 4 +- tensorflow/lite/toco/export_tensorflow.cc | 183 +++++++++--------- tensorflow/lite/toco/export_tensorflow.h | 3 +- tensorflow/lite/toco/format_port.h | 10 +- tensorflow/lite/toco/import_tensorflow.cc | 142 +++++++------- tensorflow/lite/toco/import_tensorflow.h | 2 +- .../lite/toco/import_tensorflow_test.cc | 6 +- tensorflow/lite/toco/model.h | 38 ++-- tensorflow/lite/toco/model_cmdline_flags.cc | 30 +-- tensorflow/lite/toco/model_cmdline_flags.h | 2 +- .../lite/toco/model_cmdline_flags_test.cc | 6 +- tensorflow/lite/toco/tensorflow_util.cc | 6 +- tensorflow/lite/toco/tensorflow_util.h | 2 +- tensorflow/lite/toco/toco.cc | 2 +- tensorflow/lite/toco/toco_cmdline_flags.cc | 6 +- tensorflow/lite/toco/toco_cmdline_flags.h | 3 +- .../lite/toco/toco_cmdline_flags_test.cc | 4 +- tensorflow/lite/toco/toco_convert.cc | 14 +- tensorflow/lite/toco/toco_convert.h | 4 +- tensorflow/lite/toco/toco_convert_test.cc | 24 +-- tensorflow/lite/toco/toco_port.cc | 16 +- tensorflow/lite/toco/toco_port.h | 18 +- tensorflow/lite/toco/toco_tooling.cc | 9 +- tensorflow/lite/toco/toco_tooling.h | 7 +- tensorflow/lite/toco/tooling_util.cc | 163 ++++++++-------- tensorflow/lite/toco/tooling_util.h | 74 +++---- 30 files changed, 523 insertions(+), 485 deletions(-) diff --git a/tensorflow/lite/toco/allocate_transient_arrays.cc b/tensorflow/lite/toco/allocate_transient_arrays.cc index 3ec53c9c2d6..76279f6e62d 100644 --- a/tensorflow/lite/toco/allocate_transient_arrays.cc +++ b/tensorflow/lite/toco/allocate_transient_arrays.cc @@ -55,8 +55,8 @@ bool EndsAt(const ArrayLifespan& lifespan, std::size_t op_index) { // Helper function for ComputeArrayLifespans: updates one ArrayLifespan for // one array for one op. void UpdateArrayLifespan( - const string& array_name, std::size_t op_index, - std::unordered_map* array_lifespans) { + const std::string& array_name, std::size_t op_index, + std::unordered_map* array_lifespans) { if (array_lifespans->count(array_name)) { auto& lifespan = array_lifespans->at(array_name); if (!lifespan.persistent) { @@ -74,7 +74,7 @@ void UpdateArrayLifespan( // Computes the ArrayLifespan for each array. void ComputeArrayLifespans( const Model& model, - std::unordered_map* array_lifespans) { + std::unordered_map* array_lifespans) { CHECK(array_lifespans->empty()); for (const auto& rnn_state : model.flags.rnn_states()) { ArrayLifespan lifespan; @@ -159,7 +159,8 @@ class Allocator { // Returns the required transient allocation size (in bytes) for a given array, // or 0 if it's not a transient array. -std::size_t TransientArraySize(const Model& model, const string& array_name, +std::size_t TransientArraySize(const Model& model, + const std::string& array_name, std::size_t transient_data_alignment) { if (!IsAllocatableTransientArray(model, array_name)) { return 0; @@ -191,7 +192,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name, // Allocates an array: call this for every array just before the first // op where it is used. 
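// For illustration only: a minimal, self-contained sketch of the lifespan
// bookkeeping that ComputeArrayLifespans performs above. Walk the ops in
// execution order and record, per array name, the first and last op index
// that touches it. The names Lifespan and ComputeLifespans, and the reduction
// of an op to "the array names it touches", are assumptions made for this
// sketch, not the toco API.
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct Lifespan {
  std::size_t first_op = 0;  // first op index that reads or writes the array
  std::size_t last_op = 0;   // last op index that reads or writes the array
  bool persistent = false;   // e.g. RNN state: live for the whole graph
};

using OpArrays = std::vector<std::string>;  // arrays touched by one op

std::unordered_map<std::string, Lifespan> ComputeLifespans(
    const std::vector<OpArrays>& ops) {
  std::unordered_map<std::string, Lifespan> lifespans;
  for (std::size_t op_index = 0; op_index < ops.size(); ++op_index) {
    for (const std::string& array : ops[op_index]) {
      auto it = lifespans.find(array);
      if (it == lifespans.end()) {
        Lifespan lifespan;
        lifespan.first_op = op_index;
        lifespan.last_op = op_index;
        lifespans.emplace(array, lifespan);
      } else if (!it->second.persistent) {
        it->second.last_op = op_index;  // first_op keeps the earliest use
      }
    }
  }
  return lifespans;
}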
-void AllocateTransientArray(const Model& model, const string& array_name, +void AllocateTransientArray(const Model& model, const std::string& array_name, Allocator* allocator, std::size_t transient_data_alignment) { if (!IsAllocatableTransientArray(model, array_name)) { @@ -206,7 +207,7 @@ void AllocateTransientArray(const Model& model, const string& array_name, // Deallocates an array: call this for every array just after the last // op where it is used. -void DeallocateTransientArray(const Model& model, const string& array_name, +void DeallocateTransientArray(const Model& model, const std::string& array_name, Allocator* allocator) { if (!IsAllocatableTransientArray(model, array_name)) { return; @@ -216,7 +217,7 @@ void DeallocateTransientArray(const Model& model, const string& array_name, allocator->Deallocate(*array->alloc); } -void PushBackIfNotFound(const string& s, std::vector* v) { +void PushBackIfNotFound(const std::string& s, std::vector* v) { if (std::find(v->begin(), v->end(), s) == v->end()) { v->push_back(s); } @@ -227,7 +228,7 @@ void PushBackIfNotFound(const string& s, std::vector* v) { void AllocateTransientArrays(Model* model, std::size_t transient_data_alignment) { // Precompute the lifespans for all arrays. - std::unordered_map array_lifespans; + std::unordered_map array_lifespans; ComputeArrayLifespans(*model, &array_lifespans); // In case of variable batch, our convention will be to compute the @@ -250,7 +251,7 @@ void AllocateTransientArrays(Model* model, // Construct a sorted map of array names, so that other layout engines can // match exactly. - std::map ordered_arrays_map; + std::map ordered_arrays_map; for (const auto& pair : model->GetArrayMap()) { ordered_arrays_map[pair.first] = pair.second.get(); } @@ -258,7 +259,7 @@ void AllocateTransientArrays(Model* model, // Allocate persistent arrays (like RNN states). For them, 'transient' // is a misnormer, should read 'workspace'. for (const auto& array_pair : ordered_arrays_map) { - const string& array_name = array_pair.first; + const std::string& array_name = array_pair.first; auto it = array_lifespans.find(array_name); if (it != array_lifespans.end() && it->second.persistent) { AllocateTransientArray(*model, array_name, &allocator, @@ -270,7 +271,7 @@ void AllocateTransientArrays(Model* model, op_index++) { const auto& op = model->operators[op_index]; // Allocate those arrays whose lifespan starts exactly here. - std::vector arrays_to_allocate; + std::vector arrays_to_allocate; for (const auto& input : op->inputs) { if (StartsAt(array_lifespans[input], op_index)) { PushBackIfNotFound(input, &arrays_to_allocate); @@ -281,13 +282,13 @@ void AllocateTransientArrays(Model* model, PushBackIfNotFound(output, &arrays_to_allocate); } } - for (const string& array : arrays_to_allocate) { + for (const std::string& array : arrays_to_allocate) { AllocateTransientArray(*model, array, &allocator, transient_data_alignment); } // Deallocate those arrays whose lifespan ends exactly here. 
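// A rough sketch of how such lifespans feed the sizing pass further down in
// this file: at every op, add up the sizes of the arrays whose lifespan
// covers that op and keep the maximum over all ops; that maximum is the peak
// transient memory the allocator must provide. Lifespan and the byte-size map
// here are stand-ins invented for this sketch, not toco types.
#include <algorithm>
#include <cstddef>
#include <string>
#include <unordered_map>

struct Lifespan {
  std::size_t first_op = 0;
  std::size_t last_op = 0;
};

std::size_t PeakTransientBytes(
    const std::unordered_map<std::string, Lifespan>& lifespans,
    const std::unordered_map<std::string, std::size_t>& bytes_per_array,
    std::size_t num_ops) {
  std::size_t peak = 0;
  for (std::size_t op = 0; op < num_ops; ++op) {
    std::size_t live_bytes = 0;
    for (const auto& entry : lifespans) {
      const Lifespan& lifespan = entry.second;
      if (lifespan.first_op <= op && op <= lifespan.last_op) {
        auto it = bytes_per_array.find(entry.first);
        if (it != bytes_per_array.end()) live_bytes += it->second;
      }
    }
    peak = std::max(peak, live_bytes);
  }
  return peak;
}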
- std::vector arrays_to_deallocate; + std::vector arrays_to_deallocate; for (const auto& input : op->inputs) { if (EndsAt(array_lifespans[input], op_index)) { PushBackIfNotFound(input, &arrays_to_deallocate); @@ -298,7 +299,7 @@ void AllocateTransientArrays(Model* model, PushBackIfNotFound(output, &arrays_to_deallocate); } } - for (const string& array : arrays_to_deallocate) { + for (const std::string& array : arrays_to_deallocate) { DeallocateTransientArray(*model, array, &allocator); } } @@ -309,7 +310,7 @@ void AllocateTransientArrays(Model* model, std::size_t optimal_transient_alloc_size = 0; std::size_t persistent_alloc_size = 0; for (const auto& array_pair : ordered_arrays_map) { - const string& array_name = array_pair.first; + const std::string& array_name = array_pair.first; auto it = array_lifespans.find(array_name); if (it != array_lifespans.end() && it->second.persistent) { persistent_alloc_size += @@ -320,7 +321,7 @@ void AllocateTransientArrays(Model* model, // for each operator, compute the sum of the sizes of the array that must // be live during the execution of this operator, plus the size of // persistent arrays that must be live at all times. - std::vector non_persistent_edges; + std::vector non_persistent_edges; for (const auto& input : op->inputs) { if (!array_lifespans[input].persistent) { PushBackIfNotFound(input, &non_persistent_edges); @@ -332,7 +333,7 @@ void AllocateTransientArrays(Model* model, } } std::size_t size = persistent_alloc_size; - for (const string& edge : non_persistent_edges) { + for (const std::string& edge : non_persistent_edges) { size += TransientArraySize(*model, edge, transient_data_alignment); } // The optimal total size is the maximum of all operator-specific sizes. diff --git a/tensorflow/lite/toco/args.cc b/tensorflow/lite/toco/args.cc index ce67de900d7..c30b98ce516 100644 --- a/tensorflow/lite/toco/args.cc +++ b/tensorflow/lite/toco/args.cc @@ -94,14 +94,16 @@ bool SplitStructuredLine(absl::string_view line, char delimiter, } inline bool TryStripPrefixString(absl::string_view str, - absl::string_view prefix, string* result) { + absl::string_view prefix, + std::string* result) { bool res = absl::ConsumePrefix(&str, prefix); result->assign(str.begin(), str.end()); return res; } inline bool TryStripSuffixString(absl::string_view str, - absl::string_view suffix, string* result) { + absl::string_view suffix, + std::string* result) { bool res = absl::ConsumeSuffix(&str, suffix); result->assign(str.begin(), str.end()); return res; @@ -109,7 +111,7 @@ inline bool TryStripSuffixString(absl::string_view str, } // namespace -bool Arg::Parse(string text) { +bool Arg::Parse(std::string text) { parsed_value_.elements.clear(); specified_ = true; // strings::Split("") produces {""}, but we need {} on empty input. @@ -125,7 +127,7 @@ bool Arg::Parse(string text) { return true; } -bool Arg::Parse(string text) { +bool Arg::Parse(std::string text) { parsed_value_.elements.clear(); specified_ = true; @@ -138,24 +140,24 @@ bool Arg::Parse(string text) { // TODO(aselle): Change argument parsing when absl supports structuredline. 
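// A simplified, standard-library-only sketch of the structured flag parsing
// used in args.cc below: split a string such as
// "{name:input,shape:1x224x224x3},{name:other}" at top-level commas (commas
// inside braces do not split), then break each group into key:value pairs.
// Whitespace trimming and error reporting done by the real parser are
// omitted, and both function names are invented for the sketch.
#include <map>
#include <string>
#include <vector>

std::vector<std::string> SplitTopLevel(const std::string& text) {
  std::vector<std::string> groups;
  std::string current;
  int depth = 0;
  for (char ch : text) {
    if (ch == '{') ++depth;
    if (ch == '}') --depth;
    if (ch == ',' && depth == 0) {
      groups.push_back(current);
      current.clear();
    } else {
      current.push_back(ch);
    }
  }
  if (!current.empty()) groups.push_back(current);
  return groups;
}

std::map<std::string, std::string> ParseGroup(std::string group) {
  std::map<std::string, std::string> fields;
  if (!group.empty() && group.front() == '{') group.erase(group.begin());
  if (!group.empty() && group.back() == '}') group.pop_back();
  std::size_t start = 0;
  while (start <= group.size()) {
    std::size_t end = group.find(',', start);
    if (end == std::string::npos) end = group.size();
    const std::string field = group.substr(start, end - start);
    const std::size_t colon = field.find(':');
    if (colon != std::string::npos) {
      fields[field.substr(0, colon)] = field.substr(colon + 1);
    }
    start = end + 1;
  }
  return fields;
}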
SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector); for (const absl::string_view& outer_member_stringpiece : outer_vector) { - string outer_member(outer_member_stringpiece); + std::string outer_member(outer_member_stringpiece); if (outer_member.empty()) { continue; } - string outer_member_copy = outer_member; + std::string outer_member_copy = outer_member; absl::StripAsciiWhitespace(&outer_member); if (!TryStripPrefixString(outer_member, "{", &outer_member)) return false; if (!TryStripSuffixString(outer_member, "}", &outer_member)) return false; - const std::vector inner_fields_vector = + const std::vector inner_fields_vector = absl::StrSplit(outer_member, ','); - std::unordered_map element; - for (const string& member_field : inner_fields_vector) { - std::vector outer_member_key_value = + std::unordered_map element; + for (const std::string& member_field : inner_fields_vector) { + std::vector outer_member_key_value = absl::StrSplit(member_field, ':'); if (outer_member_key_value.size() != 2) return false; - string& key = outer_member_key_value[0]; - string& value = outer_member_key_value[1]; + std::string& key = outer_member_key_value[0]; + std::string& value = outer_member_key_value[1]; absl::StripAsciiWhitespace(&key); absl::StripAsciiWhitespace(&value); if (element.count(key) != 0) return false; diff --git a/tensorflow/lite/toco/args.h b/tensorflow/lite/toco/args.h index 20fa5ecc20c..e1fe209062e 100644 --- a/tensorflow/lite/toco/args.h +++ b/tensorflow/lite/toco/args.h @@ -35,7 +35,7 @@ struct IntList { std::vector elements; }; struct StringMapList { - std::vector> elements; + std::vector> elements; }; // command_line_flags.h don't track whether or not a flag is specified. Arg @@ -82,13 +82,13 @@ template <> class Arg final { public: // Provide default_value() to arg list - string default_value() const { return ""; } + std::string default_value() const { return ""; } // Return true if the command line argument was specified on the command line. bool specified() const { return specified_; } // Bind the parse member function so tensorflow::Flags can call it. - bool Parse(string text); + bool Parse(std::string text); - std::function bind() { + std::function bind() { return std::bind(&Arg::Parse, this, std::placeholders::_1); } @@ -103,14 +103,14 @@ template <> class Arg final { public: // Provide default_value() to StringMapList - string default_value() const { return ""; } + std::string default_value() const { return ""; } // Return true if the command line argument was specified on the command line. bool specified() const { return specified_; } // Bind the parse member function so tensorflow::Flags can call it. - bool Parse(string text); + bool Parse(std::string text); - std::function bind() { + std::function bind() { return std::bind(&Arg::Parse, this, std::placeholders::_1); } @@ -123,18 +123,18 @@ class Arg final { // Flags that describe a model. See model_cmdline_flags.cc for details. 
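// The Arg<T> wrappers declared in args.h below follow a small pattern:
// remember whether the flag was ever given, hold a default value, and expose
// a bind() callable that the flag-parsing machinery can invoke with the raw
// text. A stripped-down sketch of that pattern (not the actual toco class,
// which also specializes Parse for the structured list types above):
#include <functional>
#include <sstream>
#include <string>
#include <utility>

template <typename T>
class FlagArg {
 public:
  explicit FlagArg(T default_value = T()) : value_(default_value) {}

  bool specified() const { return specified_; }
  const T& value() const { return value_; }

  // Parse the textual flag value into T and remember that the flag was given.
  bool Parse(std::string text) {
    std::istringstream stream(text);
    stream >> value_;
    specified_ = true;
    return !stream.fail();
  }

  // What the command-line machinery stores and later calls with the raw text.
  std::function<bool(std::string)> bind() {
    return [this](std::string text) { return Parse(std::move(text)); };
  }

 private:
  T value_;
  bool specified_ = false;
};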
struct ParsedModelFlags { - Arg input_array; - Arg input_arrays; - Arg output_array; - Arg output_arrays; - Arg input_shapes; + Arg input_array; + Arg input_arrays; + Arg output_array; + Arg output_arrays; + Arg input_shapes; Arg batch_size = Arg(1); Arg mean_value = Arg(0.f); - Arg mean_values; + Arg mean_values; Arg std_value = Arg(1.f); - Arg std_values; - Arg input_data_type; - Arg input_data_types; + Arg std_values; + Arg input_data_type; + Arg input_data_types; Arg variable_batch = Arg(false); Arg input_shape; Arg rnn_states; @@ -142,44 +142,44 @@ struct ParsedModelFlags { Arg change_concat_input_ranges = Arg(true); // Debugging output options. // TODO(benoitjacob): these shouldn't be ModelFlags. - Arg graphviz_first_array; - Arg graphviz_last_array; - Arg dump_graphviz; + Arg graphviz_first_array; + Arg graphviz_last_array; + Arg dump_graphviz; Arg dump_graphviz_video = Arg(false); - Arg conversion_summary_dir; + Arg conversion_summary_dir; Arg allow_nonexistent_arrays = Arg(false); Arg allow_nonascii_arrays = Arg(false); - Arg arrays_extra_info_file; - Arg model_flags_file; + Arg arrays_extra_info_file; + Arg model_flags_file; }; // Flags that describe the operation you would like to do (what conversion // you want). See toco_cmdline_flags.cc for details. struct ParsedTocoFlags { - Arg input_file; - Arg savedmodel_directory; - Arg output_file; - Arg input_format = Arg("TENSORFLOW_GRAPHDEF"); - Arg output_format = Arg("TFLITE"); - Arg savedmodel_tagset; + Arg input_file; + Arg savedmodel_directory; + Arg output_file; + Arg input_format = Arg("TENSORFLOW_GRAPHDEF"); + Arg output_format = Arg("TFLITE"); + Arg savedmodel_tagset; // TODO(aselle): command_line_flags doesn't support doubles Arg default_ranges_min = Arg(0.); Arg default_ranges_max = Arg(0.); Arg default_int16_ranges_min = Arg(0.); Arg default_int16_ranges_max = Arg(0.); - Arg inference_type; - Arg inference_input_type; + Arg inference_type; + Arg inference_input_type; Arg drop_fake_quant = Arg(false); Arg reorder_across_fake_quant = Arg(false); Arg allow_custom_ops = Arg(false); Arg allow_dynamic_tensors = Arg(true); - Arg custom_opdefs; + Arg custom_opdefs; Arg post_training_quantize = Arg(false); Arg quantize_to_float16 = Arg(false); // Deprecated flags Arg quantize_weights = Arg(false); - Arg input_type; - Arg input_types; + Arg input_type; + Arg input_types; Arg debug_disable_recurrent_cell_fusion = Arg(false); Arg drop_control_dependency = Arg(false); Arg propagate_fake_quant_num_bits = Arg(false); diff --git a/tensorflow/lite/toco/dump_graphviz.cc b/tensorflow/lite/toco/dump_graphviz.cc index 68d3b957129..006d5546c60 100644 --- a/tensorflow/lite/toco/dump_graphviz.cc +++ b/tensorflow/lite/toco/dump_graphviz.cc @@ -77,7 +77,9 @@ class Color { // Returns the string serialization of this color in graphviz format, // for use as 'fillcolor' in boxes. - string AsHexString() const { return StringF("#%.2X%.2X%.2X", r_, g_, b_); } + std::string AsHexString() const { + return StringF("#%.2X%.2X%.2X", r_, g_, b_); + } // The color to use for this node; will be used as 'fillcolor' // for its box. See Color::AsHexString. A suitable, different // color will be chosen for the 'fontcolor' for the inside text @@ -85,7 +87,7 @@ class Color { // Returns the serialization in graphviz format of a suitable color to use // 'fontcolor' in the same boxes. It should black or white, whichever offers // the better contrast from AsHexString(). 
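// The graphviz color helpers below reduce to two small pieces of math: format
// an RGB triple as "#RRGGBB", and pick black or white text from an
// approximate relative luminance so labels stay readable on any fill color.
// A compact sketch of both, using the same Rec. 709 luma weights the code
// references (helper names are invented here):
#include <cstdint>
#include <cstdio>
#include <string>

std::string HexColor(std::uint8_t r, std::uint8_t g, std::uint8_t b) {
  char buffer[8];
  std::snprintf(buffer, sizeof(buffer), "#%02X%02X%02X",
                static_cast<unsigned>(r), static_cast<unsigned>(g),
                static_cast<unsigned>(b));
  return buffer;
}

std::string TextColorFor(std::uint8_t r, std::uint8_t g, std::uint8_t b) {
  const float luminance = 0.2126f * r + 0.7152f * g + 0.0722f * b;
  // Light fills get black text, dark fills get white text.
  return luminance > 128.0f ? "#000000" : "#FFFFFF";
}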
- string TextColorString() const { + std::string TextColorString() const { // https://en.wikipedia.org/wiki/Relative_luminance const float luminance = 0.2126f * r_ + 0.7152f * g_ + 0.0722f * b_; const uint8 l = luminance > 128.f ? 0 : 255; @@ -96,7 +98,7 @@ class Color { uint8 r_ = 0, g_ = 0, b_ = 0; }; -Color HashStringToColor(string s) { +Color HashStringToColor(std::string s) { // Return a unique color for a name. // // This function removes Tensorflow anti-collision suffixes (eg "_2"), hashes @@ -120,8 +122,8 @@ Color HashStringToColor(string s) { return Color(color_word); } -void GetArrayColorAndShape(const Model& model, const string& array_name, - Color* color, string* shape) { +void GetArrayColorAndShape(const Model& model, const std::string& array_name, + Color* color, std::string* shape) { // All colors in this file are from: // https://material.io/guidelines/style/color.html // Arrays involved in RNN back-edges have a different color @@ -167,7 +169,8 @@ void GetArrayColorAndShape(const Model& model, const string& array_name, *shape = "box"; } -string GetArrayCompassPt(const Model& model, const string& array_name) { +std::string GetArrayCompassPt(const Model& model, + const std::string& array_name) { // The "compass point" is the point on the node where edge connections are // made. For most arrays we don't care, but input's and outputs look better // connected at the tip of the "house" and "invhouse" shapes used. So we @@ -191,7 +194,7 @@ string GetArrayCompassPt(const Model& model, const string& array_name) { return ""; } -void AppendArrayVal(string* string, Array const& array, int index) { +void AppendArrayVal(std::string* string, Array const& array, int index) { if (array.buffer->type == ArrayDataType::kFloat) { const auto& data = array.GetBuffer().data; if (index >= data.size()) { @@ -231,10 +234,10 @@ void AppendArrayVal(string* string, Array const& array, int index) { } } -typedef std::map Attributes; +typedef std::map Attributes; -string AttributesToHtml(Attributes attributes) { - string html; +std::string AttributesToHtml(Attributes attributes) { + std::string html; for (const auto& attr : attributes) { html += R"CODE()CODE"; html += attr.first; @@ -245,8 +248,8 @@ string AttributesToHtml(Attributes attributes) { return html; } -string GetArrayLabel(const Model& model, const string& array_id) { - string html; +std::string GetArrayLabel(const Model& model, const std::string& array_id) { + std::string html; // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html) html += "<"; @@ -265,7 +268,7 @@ string GetArrayLabel(const Model& model, const string& array_id) { html += R"CODE()CODE"; html += R"CODE()CODE"; AppendF(&html, R"CODE(%s)CODE", - std::vector(absl::StrSplit(array_id, '/')).back()); + std::vector(absl::StrSplit(array_id, '/')).back()); html += R"CODE()CODE"; html += ""; @@ -371,7 +374,7 @@ Attributes GetOpAttributes(const Model& model, const Operator& op) { switch (op.type) { case OperatorType::kConv: { const auto& conv_op = static_cast(op); - string stride; + std::string stride; AppendF(&stride, "%d", conv_op.stride_width); stride += kUnicodeMult; AppendF(&stride, "%d", conv_op.stride_height); @@ -382,7 +385,7 @@ Attributes GetOpAttributes(const Model& model, const Operator& op) { } case OperatorType::kDepthwiseConv: { const auto& depthconv_op = static_cast(op); - string stride; + std::string stride; AppendF(&stride, "%d", depthconv_op.stride_width); stride += kUnicodeMult; AppendF(&stride, "%d", depthconv_op.stride_height); @@ -426,9 +429,9 @@ 
Color GetOpColor(const Operator& op) { } } -string GetOpLabel(const Model& model, const Operator& op) { +std::string GetOpLabel(const Model& model, const Operator& op) { // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html) - string html; + std::string html; html += "<"; // Begin Table @@ -462,7 +465,8 @@ string GetOpLabel(const Model& model, const Operator& op) { if (op.type == OperatorType::kUnsupported) { html += static_cast(op).tensorflow_op; } else { - html += string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow")); + html += + std::string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow")); } html += R"CODE()CODE"; html += ""; @@ -498,7 +502,7 @@ string GetOpLabel(const Model& model, const Operator& op) { return html; } -float GetLog2BufferSize(const Model& model, const string& array_id) { +float GetLog2BufferSize(const Model& model, const std::string& array_id) { auto& array = model.GetArray(array_id); if (array.has_shape()) { int buffer_size = 0; @@ -510,22 +514,23 @@ float GetLog2BufferSize(const Model& model, const string& array_id) { return 0.0f; } -string GetOpId(int op_index) { return StringF("op%05d", op_index); } +std::string GetOpId(int op_index) { return StringF("op%05d", op_index); } -void DumpOperator(const Model& model, string* output_file, int op_index) { +void DumpOperator(const Model& model, std::string* output_file, int op_index) { // Dump node for operator. const Operator& op = *model.operators[op_index]; Color color = GetOpColor(op); - string label = GetOpLabel(model, op); - string op_id = GetOpId(op_index); + std::string label = GetOpLabel(model, op); + std::string op_id = GetOpId(op_index); AppendF(output_file, kOpNodeFmt, op_id, label, color.AsHexString(), color.TextColorString()); } -void DumpOperatorEdges(const Model& model, string* output_file, int op_index) { +void DumpOperatorEdges(const Model& model, std::string* output_file, + int op_index) { // Inputs const Operator& op = *model.operators[op_index]; - string op_id = GetOpId(op_index); + std::string op_id = GetOpId(op_index); for (int i = 0; i < op.inputs.size(); i++) { const auto& input = op.inputs[i]; if (!model.HasArray(input)) { @@ -546,7 +551,7 @@ void DumpOperatorEdges(const Model& model, string* output_file, int op_index) { // would otherwise skew the layout. weight = 1.0f; } - string compass_pt = GetArrayCompassPt(model, input); + std::string compass_pt = GetArrayCompassPt(model, input); AppendF(output_file, kInputEdgeFmt, input, compass_pt, op_id, i, line_width, weight); } @@ -563,7 +568,7 @@ void DumpOperatorEdges(const Model& model, string* output_file, int op_index) { if (!IsArrayConsumed(model, output)) { weight = 1.0f; } - string compass_pt = GetArrayCompassPt(model, output); + std::string compass_pt = GetArrayCompassPt(model, output); AppendF(output_file, kOutputEdgeFmt, op_id, i, output, compass_pt, line_width, weight); } @@ -572,19 +577,19 @@ void DumpOperatorEdges(const Model& model, string* output_file, int op_index) { struct Node { Node() : math_ops(0) {} // Name used as a key in the model's array map - string array_id; + std::string array_id; // Estimated number of math ops incurred by this node (the sum of the op // with this array as 1st output, plus all children nodes). int64 math_ops; // A map of child nodes keyed by name. 
- std::map> children; + std::map> children; }; -string GetSubgraphLabel(Node const& node, const string& subgraph) { +std::string GetSubgraphLabel(Node const& node, const std::string& subgraph) { // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html) - string html; + std::string html; html += "<"; // Begin Table @@ -613,19 +618,19 @@ string GetSubgraphLabel(Node const& node, const string& subgraph) { return html; } -void DumpSubgraphHeader(string* output_file, Node const& node, - const string& node_name) { +void DumpSubgraphHeader(std::string* output_file, Node const& node, + const std::string& node_name) { Color color = HashStringToColor(node_name); - string label = GetSubgraphLabel(node, node_name); + std::string label = GetSubgraphLabel(node, node_name); AppendF(output_file, kSubgraphFmt, node_name, color.AsHexString(), label); } -void DumpArray(const Model& model, string* output_file, - const string& array_id) { +void DumpArray(const Model& model, std::string* output_file, + const std::string& array_id) { Color color; - string shape; + std::string shape; GetArrayColorAndShape(model, array_id, &color, &shape); - string label = GetArrayLabel(model, array_id); + std::string label = GetArrayLabel(model, array_id); AppendF(output_file, kArrayNodeFmt, array_id, label, array_id, shape, color.AsHexString(), color.TextColorString()); @@ -638,8 +643,8 @@ void DumpArray(const Model& model, string* output_file, } } -void DumpNode(const Model& model, string* output_file, const string& node_name, - Node const& node) { +void DumpNode(const Model& model, std::string* output_file, + const std::string& node_name, Node const& node) { bool not_root = !node_name.empty(); if (not_root) { DumpSubgraphHeader(output_file, node, node_name); @@ -662,7 +667,7 @@ void DumpNode(const Model& model, string* output_file, const string& node_name, } } -int64 GetArithmeticOpsCount(const Model& model, const string& array_id) { +int64 GetArithmeticOpsCount(const Model& model, const std::string& array_id) { for (const auto& op : model.operators) { if (!op->outputs.empty() && op->outputs[0] == array_id) { int64 count; @@ -676,15 +681,15 @@ int64 GetArithmeticOpsCount(const Model& model, const string& array_id) { return 0; } -void InsertNode(const Model& model, const string& array_id, Node* node, - std::vector prefixes, int64* math_ops) { +void InsertNode(const Model& model, const std::string& array_id, Node* node, + std::vector prefixes, int64* math_ops) { if (prefixes.empty()) { // Base case: store array in this node. node->array_id = array_id; *math_ops = GetArithmeticOpsCount(model, array_id); } else { // Insert into the sub-tree for that prefix. - string prefix = prefixes.back(); + std::string prefix = prefixes.back(); prefixes.pop_back(); if (node->children.count(prefix) == 0) { // Create a new node if this prefix is unseen. @@ -700,16 +705,16 @@ void InsertNode(const Model& model, const string& array_id, Node* node, void BuildArrayTree(const Model& model, Node* tree) { // Delimit array names by path "/", then place into a tree based on this path. for (const auto& array_id : model.GetArrayMap()) { - std::vector prefixes = absl::StrSplit(array_id.first, '/'); + std::vector prefixes = absl::StrSplit(array_id.first, '/'); std::reverse(prefixes.begin(), prefixes.end()); int64 math_ops; // Temporary storage for math ops used during recursion. 
InsertNode(model, array_id.first, tree, prefixes, &math_ops); } } -string GetGraphLabel(const Model& model, const string& graph_name) { +std::string GetGraphLabel(const Model& model, const std::string& graph_name) { // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html) - string html; + std::string html; html += "<"; // Begin Table @@ -753,8 +758,8 @@ string GetGraphLabel(const Model& model, const string& graph_name) { } } // namespace -void DumpGraphviz(const Model& model, string* output_file, - const string& graph_name) { +void DumpGraphviz(const Model& model, std::string* output_file, + const std::string& graph_name) { // Start graphviz format AppendF(output_file, kGraphFmt, GetGraphLabel(model, graph_name)); diff --git a/tensorflow/lite/toco/dump_graphviz.h b/tensorflow/lite/toco/dump_graphviz.h index 9bb74dac3f8..0e847896552 100644 --- a/tensorflow/lite/toco/dump_graphviz.h +++ b/tensorflow/lite/toco/dump_graphviz.h @@ -21,8 +21,8 @@ limitations under the License. namespace toco { -void DumpGraphviz(const Model& model, string* output_file_contents, - const string& graph_name); +void DumpGraphviz(const Model& model, std::string* output_file_contents, + const std::string& graph_name); } // namespace toco diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc index ec3fb386d10..7ecf6cc7d44 100644 --- a/tensorflow/lite/toco/export_tensorflow.cc +++ b/tensorflow/lite/toco/export_tensorflow.cc @@ -49,7 +49,7 @@ namespace toco { namespace { tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type, - const string& error_location) { + const std::string& error_location) { switch (data_type) { case ArrayDataType::kBool: return tensorflow::DT_BOOL; @@ -74,12 +74,12 @@ tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type, } tensorflow::DataType GetTensorFlowDataTypeForOp(ArrayDataType data_type, - const string& op_name) { + const std::string& op_name) { return GetTensorFlowDataType(data_type, "op '" + op_name + "'"); } tensorflow::DataType GetTensorFlowDataType(const Model& model, - const string& array_name) { + const std::string& array_name) { return GetTensorFlowDataType(model.GetArray(array_name).data_type, "array '" + array_name + "'"); } @@ -113,8 +113,8 @@ void ExportFloatArray(const Shape& input_shape, const float* input_data, } } output_tensor->set_tensor_content( - string(reinterpret_cast(input_data), - sizeof(*input_data) * input_flat_size)); + std::string(reinterpret_cast(input_data), + sizeof(*input_data) * input_flat_size)); } void ExportFloatArray(AxesOrder input_axes_order, const Shape& input_shape, @@ -137,7 +137,7 @@ void ExportFloatArray(AxesOrder input_axes_order, const Shape& input_shape, legacy_scalar_policy); } -bool HasAlreadyExportedConst(const string& name, +bool HasAlreadyExportedConst(const std::string& name, const GraphDef& tensorflow_graph) { for (const auto& node : tensorflow_graph.node()) { if (node.op() == "Const" && node.name() == name) { @@ -147,7 +147,7 @@ bool HasAlreadyExportedConst(const string& name, return false; } -void ConvertFloatTensorConst(const string& name, const Shape& input_shape, +void ConvertFloatTensorConst(const std::string& name, const Shape& input_shape, const float* input_data, AxesOrder input_axes_order, AxesOrder output_axes_order, @@ -165,7 +165,7 @@ void ConvertFloatTensorConst(const string& name, const Shape& input_shape, tensor, legacy_scalar_policy); } -void ConvertFloatTensorConst(const string& name, const Shape& input_shape, +void 
ConvertFloatTensorConst(const std::string& name, const Shape& input_shape, const float* input_data, AxesOrder input_axes_order, AxesOrder output_axes_order, @@ -175,7 +175,7 @@ void ConvertFloatTensorConst(const string& name, const Shape& input_shape, LegacyScalarPolicy::kAvoidLegacyScalars); } -void ConvertFloatTensorConst(const Model& model, const string& name, +void ConvertFloatTensorConst(const Model& model, const std::string& name, AxesOrder input_axes_order, AxesOrder output_axes_order, GraphDef* tensorflow_graph) { @@ -193,7 +193,7 @@ void ConvertFloatTensorConst(const Model& model, const string& name, output_axes_order, tensorflow_graph); } -void ConvertFloatTensorConst(const Model& model, const string& name, +void ConvertFloatTensorConst(const Model& model, const std::string& name, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -214,7 +214,7 @@ void ConvertFloatTensorConst(const Model& model, const string& name, LegacyScalarPolicy::kAvoidLegacyScalars); } -void ConvertBoolTensorConst(const Model& model, const string& name, +void ConvertBoolTensorConst(const Model& model, const std::string& name, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -238,7 +238,7 @@ void ConvertBoolTensorConst(const Model& model, const string& name, } } -void ConvertIntTensorConst(const Model& model, const string& name, +void ConvertIntTensorConst(const Model& model, const std::string& name, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -262,7 +262,8 @@ void ConvertIntTensorConst(const Model& model, const string& name, } } -void CreateIntTensorConst(const string& name, const std::vector& data, +void CreateIntTensorConst(const std::string& name, + const std::vector& data, const std::vector& shape, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { @@ -286,7 +287,7 @@ void CreateIntTensorConst(const string& name, const std::vector& data, CHECK_EQ(num_elements, data.size()); } -void ConvertComplex64TensorConst(const Model& model, const string& name, +void ConvertComplex64TensorConst(const Model& model, const std::string& name, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -311,7 +312,7 @@ void ConvertComplex64TensorConst(const Model& model, const string& name, } } -void CreateMatrixShapeTensorConst(const string& name, int rows, int cols, +void CreateMatrixShapeTensorConst(const std::string& name, int rows, int cols, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -324,12 +325,12 @@ void CreateMatrixShapeTensorConst(const string& name, int rows, int cols, tensor->set_dtype(DT_INT32); const int32 data[2] = {cols, rows}; tensor->set_tensor_content( - string(reinterpret_cast(data), sizeof(data))); + std::string(reinterpret_cast(data), sizeof(data))); auto* shape = tensor->mutable_tensor_shape(); shape->add_dim()->set_size(2); } -void CreateDummyConcatDimTensorConst(const string& name, int dim, +void CreateDummyConcatDimTensorConst(const std::string& name, int dim, GraphDef* tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { return; @@ -343,7 +344,7 @@ void CreateDummyConcatDimTensorConst(const string& name, int dim, tensor->add_int_val(dim); } -void CreateReshapeShapeTensorConst(const string& name, +void CreateReshapeShapeTensorConst(const std::string& name, const std::vector& shape, GraphDef* 
tensorflow_graph) { if (HasAlreadyExportedConst(name, *tensorflow_graph)) { @@ -367,7 +368,7 @@ void CreateReshapeShapeTensorConst(const string& name, } } -string WalkUpToConstantArray(const Model& model, const string& name) { +std::string WalkUpToConstantArray(const Model& model, const std::string& name) { const Array& original_array = model.GetArray(name); if (original_array.buffer) { return name; @@ -375,7 +376,7 @@ string WalkUpToConstantArray(const Model& model, const string& name) { const auto* op = GetOpWithOutput(model, name); CHECK(op); CHECK(op->type == OperatorType::kFakeQuant); - const string& input_of_fakequant_name = op->inputs[0]; + const std::string& input_of_fakequant_name = op->inputs[0]; const Array& input_of_fakequant = model.GetArray(input_of_fakequant_name); CHECK(input_of_fakequant.buffer); return input_of_fakequant_name; @@ -384,7 +385,7 @@ string WalkUpToConstantArray(const Model& model, const string& name) { void ConvertConvOperator(const Model& model, const ConvOperator& src_op, GraphDef* tensorflow_graph) { const bool has_bias = src_op.inputs.size() >= 3; - string conv_output = src_op.outputs[0]; + std::string conv_output = src_op.outputs[0]; if (has_bias) { conv_output += "/conv"; } @@ -395,7 +396,7 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op, *conv2d_op->add_input() = src_op.inputs[0]; *conv2d_op->add_input() = src_op.inputs[1]; (*conv2d_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string& weights_array_name = + const std::string& weights_array_name = WalkUpToConstantArray(model, src_op.inputs[1]); const auto& weights_array = model.GetArray(weights_array_name); CHECK(weights_array.buffer->type == ArrayDataType::kFloat); @@ -414,7 +415,7 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op, dilations.mutable_list()->add_i(src_op.dilation_width_factor); dilations.mutable_list()->add_i(1); } - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -432,7 +433,7 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op, biasadd_op->add_input(src_op.inputs[2]); (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT); CHECK(model.HasArray(src_op.inputs[2])); - const string& bias_array_name = + const std::string& bias_array_name = WalkUpToConstantArray(model, src_op.inputs[2]); const auto& bias_array = model.GetArray(bias_array_name); // TODO(b/62904716) Bias arrays should be 1-D, and used directly. @@ -452,7 +453,7 @@ void ConvertDepthwiseConvOperator(const Model& model, const DepthwiseConvOperator& src_op, GraphDef* tensorflow_graph) { const bool has_bias = src_op.inputs.size() >= 3; - string conv_output = src_op.outputs[0]; + std::string conv_output = src_op.outputs[0]; if (has_bias) { conv_output += "/conv"; } @@ -469,7 +470,7 @@ void ConvertDepthwiseConvOperator(const Model& model, // That's only a matter of constructing a Dims object; the actual // array layout is the same. 
CHECK(model.HasArray(src_op.inputs[1])); - const string& src_weights_name = + const std::string& src_weights_name = WalkUpToConstantArray(model, src_op.inputs[1]); const auto& src_weights_array = model.GetArray(src_weights_name); const auto& src_weights_shape = src_weights_array.shape(); @@ -505,7 +506,7 @@ void ConvertDepthwiseConvOperator(const Model& model, dilations.mutable_list()->add_i(src_op.dilation_width_factor); dilations.mutable_list()->add_i(1); } - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -523,7 +524,8 @@ void ConvertDepthwiseConvOperator(const Model& model, biasadd_op->add_input(src_op.inputs[2]); (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT); CHECK(model.HasArray(src_op.inputs[2])); - const string& bias_name = WalkUpToConstantArray(model, src_op.inputs[2]); + const std::string& bias_name = + WalkUpToConstantArray(model, src_op.inputs[2]); const auto& bias_array = model.GetArray(bias_name); // TODO(b/62904716) Bias arrays should be 1-D, and used directly. Shape bias_shape_1d = bias_array.shape(); @@ -548,7 +550,7 @@ void ConvertTransposeConvOperator(const Model& model, *conv2d_op->add_input() = src_op.inputs[1]; *conv2d_op->add_input() = src_op.inputs[2]; (*conv2d_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string& weights_array_name = WalkUpToConstantArray( + const std::string& weights_array_name = WalkUpToConstantArray( model, src_op.inputs[TransposeConvOperator::WEIGHTS]); const auto& weights_array = model.GetArray(weights_array_name); CHECK(weights_array.buffer->type == ArrayDataType::kFloat); @@ -559,7 +561,7 @@ void ConvertTransposeConvOperator(const Model& model, strides.mutable_list()->add_i(src_op.stride_height); strides.mutable_list()->add_i(src_op.stride_width); strides.mutable_list()->add_i(1); - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -596,9 +598,9 @@ void ConvertFullyConnectedOperator(const Model& model, const FullyConnectedOperator& src_op, GraphDef* tensorflow_graph) { // Reshape input activations to have the shape expected by the MatMul. - const string reshape_output = + const std::string reshape_output = AvailableArrayName(model, src_op.outputs[0] + "/reshape"); - const string reshape_shape = + const std::string reshape_shape = AvailableArrayName(model, reshape_output + "/shape"); const auto& fc_weights_array = model.GetArray(src_op.inputs[1]); const auto& fc_weights_shape = fc_weights_array.shape(); @@ -614,7 +616,7 @@ void ConvertFullyConnectedOperator(const Model& model, GetTensorFlowDataType(model, src_op.inputs[0])); const bool has_bias = src_op.inputs.size() >= 3; - string matmul_output = src_op.outputs[0]; + std::string matmul_output = src_op.outputs[0]; if (has_bias) { matmul_output += "/matmul"; } @@ -622,9 +624,9 @@ void ConvertFullyConnectedOperator(const Model& model, // Transpose the RHS input from column-major to row-major to match TensorFlow // expectations. This is the inverse of the transpose we do during // ResolveTensorFlowMatMul. 
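// What the Transpose node with perm = {1, 0} mentioned above does to a weight
// buffer, written out as plain index arithmetic (a sketch for orientation,
// not exporter code): element (r, c) of the row-major input becomes element
// (c, r) of the row-major output, which is exactly the column-major versus
// row-major swap the comment describes.
#include <cstddef>
#include <vector>

std::vector<float> Transpose2D(const std::vector<float>& input,
                               std::size_t rows, std::size_t cols) {
  std::vector<float> output(input.size());
  for (std::size_t r = 0; r < rows; ++r) {
    for (std::size_t c = 0; c < cols; ++c) {
      // Row-major: input index = r * cols + c, output index = c * rows + r.
      output[c * rows + r] = input[r * cols + c];
    }
  }
  return output;
}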
- const string transpose_output = + const std::string transpose_output = AvailableArrayName(model, matmul_output + "/transpose_weights"); - const string transpose_perm = + const std::string transpose_perm = AvailableArrayName(model, transpose_output + "/perm"); CreateIntTensorConst(transpose_perm, {1, 0}, {2}, tensorflow_graph); tensorflow::NodeDef* transpose_op = tensorflow_graph->add_node(); @@ -733,9 +735,9 @@ void ConvertReluOperator(const Model& model, const ReluOperator& src_op, void ConvertRelu1Operator(const Relu1Operator& src_op, GraphDef* tensorflow_graph) { - const string max_bounds = src_op.outputs[0] + "/max_bounds"; - const string min_bounds = src_op.outputs[0] + "/min_bounds"; - const string max_output = src_op.outputs[0] + "/max_output"; + const std::string max_bounds = src_op.outputs[0] + "/max_bounds"; + const std::string min_bounds = src_op.outputs[0] + "/min_bounds"; + const std::string max_output = src_op.outputs[0] + "/max_output"; tensorflow::NodeDef* max_bounds_const_op = tensorflow_graph->add_node(); max_bounds_const_op->set_op("Const"); @@ -808,15 +810,16 @@ void ConvertTanhOperator(const TanhOperator& src_op, void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op, GraphDef* tensorflow_graph) { - string softmax_input; + std::string softmax_input; Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]); if (providing_op != nullptr && providing_op->type == OperatorType::kReshape) { softmax_input = src_op.inputs[0]; } else { // Insert a reshape operator that reduces the dimensions down to the 2 that // are required for TensorFlow Logits. - const string reshape_output = src_op.outputs[0] + "/softmax_insert_reshape"; - const string softmax_size = src_op.outputs[0] + "/softmax_insert_size"; + const std::string reshape_output = + src_op.outputs[0] + "/softmax_insert_reshape"; + const std::string softmax_size = src_op.outputs[0] + "/softmax_insert_size"; softmax_input = reshape_output; tensorflow::NodeDef* reshape_op = tensorflow_graph->add_node(); @@ -848,16 +851,17 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op, void ConvertLogSoftmaxOperator(const Model& model, const LogSoftmaxOperator& src_op, GraphDef* tensorflow_graph) { - string softmax_input; + std::string softmax_input; Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]); if (providing_op != nullptr && providing_op->type == OperatorType::kReshape) { softmax_input = src_op.inputs[0]; } else { // Insert a reshape operator that reduces the dimensions down to the 2 that // are required for TensorFlow Logits. 
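// The reshape inserted before the (Log)Softmax collapses every leading
// dimension into a single batch dimension and keeps the innermost dimension
// as the logits depth. A small sketch of that shape computation (a
// hypothetical helper, not part of the exporter):
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> FlattenToTwoD(const std::vector<std::int64_t>& dims) {
  std::int64_t outer = 1;
  for (std::size_t i = 0; i + 1 < dims.size(); ++i) outer *= dims[i];
  const std::int64_t depth = dims.empty() ? 1 : dims.back();
  return {outer, depth};
}
// e.g. {2, 3, 5, 10} -> {30, 10}: 30 rows of 10 logits each.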
- const string reshape_output = + const std::string reshape_output = src_op.outputs[0] + "/log_softmax_insert_reshape"; - const string softmax_size = src_op.outputs[0] + "/log_softmax_insert_size"; + const std::string softmax_size = + src_op.outputs[0] + "/log_softmax_insert_size"; softmax_input = reshape_output; tensorflow::NodeDef* reshape_op = tensorflow_graph->add_node(); @@ -886,11 +890,12 @@ void ConvertLogSoftmaxOperator(const Model& model, void ConvertL2NormalizationOperator(const L2NormalizationOperator& src_op, GraphDef* tensorflow_graph) { - const string square_output = src_op.outputs[0] + "/square"; - const string sum_reduction_indices = src_op.outputs[0] + "/reduction_indices"; - const string sum_output = src_op.outputs[0] + "/sum"; - const string rsqrt_output = src_op.outputs[0] + "/rsqrt"; - const string rsqrt_tiled_output = src_op.outputs[0] + "/rsqrt_tiled"; + const std::string square_output = src_op.outputs[0] + "/square"; + const std::string sum_reduction_indices = + src_op.outputs[0] + "/reduction_indices"; + const std::string sum_output = src_op.outputs[0] + "/sum"; + const std::string rsqrt_output = src_op.outputs[0] + "/rsqrt"; + const std::string rsqrt_tiled_output = src_op.outputs[0] + "/rsqrt_tiled"; tensorflow::NodeDef* sum_reduction_indices_op = tensorflow_graph->add_node(); sum_reduction_indices_op->set_op("Const"); @@ -975,7 +980,7 @@ void ConvertMaxPoolOperator(const MaxPoolOperator& src_op, strides.mutable_list()->add_i(src_op.stride_height); strides.mutable_list()->add_i(src_op.stride_width); strides.mutable_list()->add_i(1); - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -1003,7 +1008,7 @@ void ConvertAveragePoolOperator(const AveragePoolOperator& src_op, strides.mutable_list()->add_i(src_op.stride_height); strides.mutable_list()->add_i(src_op.stride_width); strides.mutable_list()->add_i(1); - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -1026,7 +1031,7 @@ void ConvertConcatenationOperator(const Model& model, tensorflow::NodeDef* dc_op = tensorflow_graph->add_node(); dc_op->set_op("ConcatV2"); dc_op->set_name(src_op.outputs[0]); - const string dummy_axis = src_op.outputs[0] + "/axis"; + const std::string dummy_axis = src_op.outputs[0] + "/axis"; CreateDummyConcatDimTensorConst(dummy_axis, src_op.axis, tensorflow_graph); for (const auto& input : src_op.inputs) { *dc_op->add_input() = input; @@ -1060,8 +1065,8 @@ void ConvertTensorFlowReshapeOperator(const Model& model, void ConvertL2PoolOperator(const L2PoolOperator& src_op, GraphDef* tensorflow_graph) { - const string square_output = src_op.outputs[0] + "/square"; - const string avgpool_output = src_op.outputs[0] + "/avgpool"; + const std::string square_output = src_op.outputs[0] + "/square"; + const std::string avgpool_output = src_op.outputs[0] + "/avgpool"; tensorflow::NodeDef* square_op = tensorflow_graph->add_node(); square_op->set_op("Square"); @@ -1069,7 +1074,7 @@ void ConvertL2PoolOperator(const L2PoolOperator& src_op, *square_op->add_input() = src_op.inputs[0]; (*square_op->mutable_attr())["T"].set_type(DT_FLOAT); - string padding; + std::string padding; if (src_op.padding.type == PaddingType::kSame) { padding = "SAME"; } else if (src_op.padding.type == PaddingType::kValid) { @@ -1235,7 +1240,7 @@ void ConvertGatherOperator(const Model& model, const 
GatherOperator& src_op, } else { // Constant axis. CHECK_EQ(src_op.inputs.size(), 2); - const string gather_axis = + const std::string gather_axis = AvailableArrayName(model, gather_op->name() + "/axis"); CreateIntTensorConst(gather_axis, {src_op.axis.value()}, {}, tensorflow_graph); @@ -1454,8 +1459,8 @@ absl::string_view FindLongestCommonPrefix(absl::string_view a, const char* pa = a.data(); const char* pb = b.data(); - string::difference_type count = 0; - const string::difference_type limit = std::min(a.size(), b.size()); + std::string::difference_type count = 0; + const std::string::difference_type limit = std::min(a.size(), b.size()); while (count < limit && *pa == *pb) { ++pa; ++pb; @@ -1469,12 +1474,12 @@ absl::string_view FindLongestCommonPrefix(absl::string_view a, void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, GraphDef* tensorflow_graph) { // Find the base name - const string base( + const std::string base( FindLongestCommonPrefix(src_op.outputs[LstmCellOperator::STATE_OUTPUT], src_op.outputs[LstmCellOperator::ACTIV_OUTPUT])); // Concatenate inputs - const string concat_output = base + "basic_lstm_cell/concat"; + const std::string concat_output = base + "basic_lstm_cell/concat"; // Op names have been chosen to match the tf.slim LSTM naming // as closely as possible. const int axis = @@ -1484,7 +1489,7 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, 1; // Note that DATA_INPUT may have extra size 1 dimensions, but TF concat // works the same since the tensor has the same underlying data layout. - const string axis_output = concat_output + "/axis"; + const std::string axis_output = concat_output + "/axis"; CreateDummyConcatDimTensorConst(axis_output, axis, tensorflow_graph); tensorflow::NodeDef* concat_op = tensorflow_graph->add_node(); concat_op->set_op("ConcatV2"); @@ -1497,9 +1502,9 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, (*concat_op->mutable_attr())["N"].set_i(2); // Number of inputs // Write weights - const string weights_output = base + "weights"; + const std::string weights_output = base + "weights"; CHECK(model.HasArray(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT])); - const string weights_name = WalkUpToConstantArray( + const std::string weights_name = WalkUpToConstantArray( model, src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]); const auto& weights_array = model.GetArray(weights_name); // Convert 4D FullyConnected weights into 2D matrix @@ -1513,7 +1518,7 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, AxesOrder::kCR, AxesOrder::kRC, tensorflow_graph); // Fully connected matrix multiply - const string matmul_output = base + "MatMul"; + const std::string matmul_output = base + "MatMul"; tensorflow::NodeDef* matmul_op = tensorflow_graph->add_node(); matmul_op->set_op("MatMul"); matmul_op->set_name(matmul_output); @@ -1524,9 +1529,9 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT); // Write biases - const string biases_output = base + "biases"; + const std::string biases_output = base + "biases"; CHECK(model.HasArray(src_op.inputs[LstmCellOperator::BIASES_INPUT])); - const string bias_name = WalkUpToConstantArray( + const std::string bias_name = WalkUpToConstantArray( model, src_op.inputs[LstmCellOperator::BIASES_INPUT]); const auto& bias_array = model.GetArray(bias_name); // TODO(b/62904716) Bias arrays should be 1-D, and used directly. 
@@ -1542,7 +1547,7 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, LegacyScalarPolicy::kDoCreateLegacyScalars); // Add biases - string biasadd_output = base + "BiasAdd"; + std::string biasadd_output = base + "BiasAdd"; tensorflow::NodeDef* biasadd_op = tensorflow_graph->add_node(); biasadd_op->set_op("BiasAdd"); biasadd_op->set_name(biasadd_output); @@ -1552,10 +1557,10 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT); // Split - string split_dim_output = base + "split/split_dim"; + std::string split_dim_output = base + "split/split_dim"; // The dimension is the same as the concatenation dimension CreateDummyConcatDimTensorConst(split_dim_output, axis, tensorflow_graph); - string split_output = base + "split"; + std::string split_output = base + "split"; tensorflow::NodeDef* split_op = tensorflow_graph->add_node(); split_op->set_op("Split"); split_op->set_name(split_output); @@ -1565,21 +1570,21 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, (*split_op->mutable_attr())["num_split"].set_i(4); // Split into four outputs // Activation functions and memory computations - const string tanh_0_output = base + "Tanh"; + const std::string tanh_0_output = base + "Tanh"; tensorflow::NodeDef* tanh_0_op = tensorflow_graph->add_node(); tanh_0_op->set_op("Tanh"); tanh_0_op->set_name(tanh_0_output); *tanh_0_op->add_input() = split_output + ":1"; (*tanh_0_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string sigmoid_1_output = base + "Sigmoid_1"; + const std::string sigmoid_1_output = base + "Sigmoid_1"; tensorflow::NodeDef* logistic_1_op = tensorflow_graph->add_node(); logistic_1_op->set_op("Sigmoid"); logistic_1_op->set_name(sigmoid_1_output); *logistic_1_op->add_input() = split_output; (*logistic_1_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string mul_1_output = base + "mul_1"; + const std::string mul_1_output = base + "mul_1"; tensorflow::NodeDef* mul_1_op = tensorflow_graph->add_node(); mul_1_op->set_op("Mul"); mul_1_op->set_name(mul_1_output); @@ -1587,21 +1592,21 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, *mul_1_op->add_input() = tanh_0_output; (*mul_1_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string sigmoid_0_output = base + "Sigmoid"; + const std::string sigmoid_0_output = base + "Sigmoid"; tensorflow::NodeDef* logistic_2_op = tensorflow_graph->add_node(); logistic_2_op->set_op("Sigmoid"); logistic_2_op->set_name(sigmoid_0_output); *logistic_2_op->add_input() = split_output + ":2"; (*logistic_2_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string sigmoid_2_output = base + "Sigmoid_2"; + const std::string sigmoid_2_output = base + "Sigmoid_2"; tensorflow::NodeDef* logistic_3_op = tensorflow_graph->add_node(); logistic_3_op->set_op("Sigmoid"); logistic_3_op->set_name(sigmoid_2_output); *logistic_3_op->add_input() = split_output + ":3"; (*logistic_3_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string mul_0_output = base + "mul"; + const std::string mul_0_output = base + "mul"; tensorflow::NodeDef* mul_0_op = tensorflow_graph->add_node(); mul_0_op->set_op("Mul"); mul_0_op->set_name(mul_0_output); @@ -1609,7 +1614,8 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, *mul_0_op->add_input() = sigmoid_0_output; (*mul_0_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string add_1_output = 
src_op.outputs[LstmCellOperator::STATE_OUTPUT]; + const std::string add_1_output = + src_op.outputs[LstmCellOperator::STATE_OUTPUT]; tensorflow::NodeDef* add_1_op = tensorflow_graph->add_node(); add_1_op->set_op("Add"); add_1_op->set_name(add_1_output); @@ -1617,14 +1623,15 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op, *add_1_op->add_input() = mul_1_output; (*add_1_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string tanh_1_output = base + "Tanh_1"; + const std::string tanh_1_output = base + "Tanh_1"; tensorflow::NodeDef* tanh_1_op = tensorflow_graph->add_node(); tanh_1_op->set_op("Tanh"); tanh_1_op->set_name(tanh_1_output); *tanh_1_op->add_input() = add_1_output; (*tanh_1_op->mutable_attr())["T"].set_type(DT_FLOAT); - const string mul_2_output = src_op.outputs[LstmCellOperator::ACTIV_OUTPUT]; + const std::string mul_2_output = + src_op.outputs[LstmCellOperator::ACTIV_OUTPUT]; tensorflow::NodeDef* mul_2_op = tensorflow_graph->add_node(); mul_2_op->set_op("Mul"); mul_2_op->set_name(mul_2_output); @@ -1730,7 +1737,8 @@ void ConvertPadV2Operator(const Model& model, const PadV2Operator& src_op, shape->add_dim()->set_size(2); } -void CreateSliceInput(const string& input_name, const std::vector& values, +void CreateSliceInput(const std::string& input_name, + const std::vector& values, GraphDef* tensorflow_graph) { tensorflow::NodeDef* params_op = tensorflow_graph->add_node(); params_op->set_op("Const"); @@ -1797,7 +1805,8 @@ void ConvertSliceOperator(const Model& model, const SliceOperator& src_op, template void ConvertReduceOperator(const Model& model, const T& src_op, - GraphDef* tensorflow_graph, const string& op_name) { + GraphDef* tensorflow_graph, + const std::string& op_name) { tensorflow::NodeDef* new_op = tensorflow_graph->add_node(); new_op->set_op(op_name); new_op->set_name(src_op.outputs[0]); @@ -2412,7 +2421,7 @@ void ConvertOperator(const Model& model, const Operator& src_op, } } -void AddPlaceholder(const string& name, ArrayDataType type, +void AddPlaceholder(const std::string& name, ArrayDataType type, GraphDef* tensorflow_graph) { tensorflow::NodeDef* placeholder = tensorflow_graph->add_node(); placeholder->set_op("Placeholder"); @@ -2444,8 +2453,8 @@ void AddPlaceholder(const string& name, ArrayDataType type, placeholder->set_name(name); } -void AddPlaceholderForRNNState(const Model& model, const string& name, int size, - GraphDef* tensorflow_graph) { +void AddPlaceholderForRNNState(const Model& model, const std::string& name, + int size, GraphDef* tensorflow_graph) { tensorflow::NodeDef* placeholder = tensorflow_graph->add_node(); placeholder->set_op("Placeholder"); placeholder->set_name(name); @@ -2484,7 +2493,7 @@ void ExportTensorFlowGraphDefImplementation(const Model& model, // after, as some operators need to export arrays that they reference // in a specific way, rather than in the generic way done below. 
for (const auto& array_pair : model.GetArrayMap()) { - const string& array_name = array_pair.first; + const std::string& array_name = array_pair.first; const auto& array = *array_pair.second; if (array.buffer) { switch (array.data_type) { @@ -2510,12 +2519,12 @@ void ExportTensorFlowGraphDefImplementation(const Model& model, void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model) { for (const auto& array_kv : model->GetArrayMap()) { - const string& array_name = array_kv.first; + const std::string& array_name = array_kv.first; Array& array = *array_kv.second; if (!array.buffer || !array.minmax) { continue; } - const string& wrapped_array_name = + const std::string& wrapped_array_name = AvailableArrayName(*model, array_name + "/data"); Array& wrapped_array = model->GetOrCreateArray(wrapped_array_name); wrapped_array.data_type = array.data_type; @@ -2533,7 +2542,7 @@ void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model) { } void ExportTensorFlowGraphDef(const Model& model, - string* output_file_contents) { + std::string* output_file_contents) { CHECK(output_file_contents->empty()); GraphDef tensorflow_graph; ExportTensorFlowGraphDefImplementation(model, &tensorflow_graph); diff --git a/tensorflow/lite/toco/export_tensorflow.h b/tensorflow/lite/toco/export_tensorflow.h index 09c966ded62..bc7ccd8d875 100644 --- a/tensorflow/lite/toco/export_tensorflow.h +++ b/tensorflow/lite/toco/export_tensorflow.h @@ -20,7 +20,8 @@ limitations under the License. namespace toco { -void ExportTensorFlowGraphDef(const Model& model, string* output_file_contents); +void ExportTensorFlowGraphDef(const Model& model, + std::string* output_file_contents); void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model); diff --git a/tensorflow/lite/toco/format_port.h b/tensorflow/lite/toco/format_port.h index 3c154e5ad48..47d39069c9f 100644 --- a/tensorflow/lite/toco/format_port.h +++ b/tensorflow/lite/toco/format_port.h @@ -38,13 +38,13 @@ inline const char* IdentityOrConvertStringToRaw(const std::string& foo) { // Delegate to TensorFlow Appendf function until absl has an equivalent. template -inline void AppendFHelper(string* destination, const char* fmt, +inline void AppendFHelper(std::string* destination, const char* fmt, Args&&... args) { tensorflow::strings::Appendf(destination, fmt, args...); } // Specialization for no argument format string (avoid security bug). -inline void AppendFHelper(string* destination, const char* fmt) { +inline void AppendFHelper(std::string* destination, const char* fmt) { tensorflow::strings::Appendf(destination, "%s", fmt); } @@ -52,15 +52,15 @@ inline void AppendFHelper(string* destination, const char* fmt) { // pointed to by destination. fmt follows C printf semantics. // One departure is that %s can be driven by a std::string or string. template -inline void AppendF(string* destination, const char* fmt, Args&&... args) { +inline void AppendF(std::string* destination, const char* fmt, Args&&... args) { AppendFHelper(destination, fmt, IdentityOrConvertStringToRaw(args)...); } // Return formatted string (with format fmt and args args). fmt follows C printf // semantics. One departure is that %s can be driven by a std::string or string. template -inline string StringF(const char* fmt, Args&&... args) { - string result; +inline std::string StringF(const char* fmt, Args&&... 
args) { + std::string result; AppendFHelper(&result, fmt, IdentityOrConvertStringToRaw(args)...); return result; } diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index 3124133047e..2adfe838c3d 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -67,7 +67,7 @@ using tensorflow::TensorShapeProto; namespace toco { namespace { -bool HasAttr(const NodeDef& node, const string& attr_name) { +bool HasAttr(const NodeDef& node, const std::string& attr_name) { return node.attr().count(attr_name) > 0; } @@ -78,14 +78,15 @@ bool HasWildcardDimension(const TensorShapeProto& shape) { return false; } -const string& GetStringAttr(const NodeDef& node, const string& attr_name) { +const std::string& GetStringAttr(const NodeDef& node, + const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kS); return attr.s(); } -int64 GetIntAttr(const NodeDef& node, const string& attr_name) { +int64 GetIntAttr(const NodeDef& node, const std::string& attr_name) { CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n" << node.DebugString(); const auto& attr = node.attr().at(attr_name); @@ -93,14 +94,14 @@ int64 GetIntAttr(const NodeDef& node, const string& attr_name) { return attr.i(); } -float GetFloatAttr(const NodeDef& node, const string& attr_name) { +float GetFloatAttr(const NodeDef& node, const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kF); return attr.f(); } -bool GetBoolAttr(const NodeDef& node, const string& attr_name) { +bool GetBoolAttr(const NodeDef& node, const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kB); @@ -108,7 +109,7 @@ bool GetBoolAttr(const NodeDef& node, const string& attr_name) { } tensorflow::DataType GetDataTypeAttr(const NodeDef& node, - const string& attr_name) { + const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kType); @@ -116,14 +117,15 @@ tensorflow::DataType GetDataTypeAttr(const NodeDef& node, } const TensorShapeProto& GetShapeAttr(const NodeDef& node, - const string& attr_name) { + const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kShape); return attr.shape(); } -const TensorProto& GetTensorAttr(const NodeDef& node, const string& attr_name) { +const TensorProto& GetTensorAttr(const NodeDef& node, + const std::string& attr_name) { CHECK(HasAttr(node, attr_name)) << "No attr named '" << attr_name << "'"; const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kTensor); @@ -131,7 +133,7 @@ const TensorProto& GetTensorAttr(const NodeDef& node, const string& attr_name) { } const AttrValue::ListValue& GetListAttr(const NodeDef& node, - const string& attr_name) { + const std::string& attr_name) { CHECK(HasAttr(node, attr_name)); const auto& attr = node.attr().at(attr_name); CHECK_EQ(attr.value_case(), AttrValue::kList); @@ -139,10 +141,10 @@ const AttrValue::ListValue& GetListAttr(const NodeDef& node, } tensorflow::Status CheckOptionalAttr(const NodeDef& node, - const string& attr_name, - const string& expected_value) { + const std::string& attr_name, + const 
std::string& expected_value) { if (HasAttr(node, attr_name)) { - const string& value = GetStringAttr(node, attr_name); + const std::string& value = GetStringAttr(node, attr_name); if (value != expected_value) { return tensorflow::errors::InvalidArgument( "Unexpected value for attribute '" + attr_name + "'. Expected '" + @@ -153,7 +155,7 @@ tensorflow::Status CheckOptionalAttr(const NodeDef& node, } tensorflow::Status CheckOptionalAttr( - const NodeDef& node, const string& attr_name, + const NodeDef& node, const std::string& attr_name, const tensorflow::DataType& expected_value) { if (HasAttr(node, attr_name)) { const tensorflow::DataType& value = GetDataTypeAttr(node, attr_name); @@ -168,7 +170,7 @@ tensorflow::Status CheckOptionalAttr( template tensorflow::Status ExpectValue(const T1& v1, const T2& v2, - const string& description) { + const std::string& description) { if (v1 == v2) return tensorflow::Status::OK(); return tensorflow::errors::InvalidArgument(absl::StrCat( "Unexpected ", description, ": got ", v1, ", expected ", v2)); @@ -244,8 +246,8 @@ template <> struct TensorTraits { static int size(const TensorProto& p) { return p.float_val_size(); } static float get(const TensorProto& p, int i) { return p.float_val(i); } - static string accessor_name() { return "float_val"; } - static string type_name() { return "float"; } + static std::string accessor_name() { return "float_val"; } + static std::string type_name() { return "float"; } static void CopyFromContent(const TensorProto& p, std::vector* data) { toco::port::CopyToBuffer(p.tensor_content(), reinterpret_cast(data->data())); @@ -256,8 +258,8 @@ template <> struct TensorTraits { static int size(const TensorProto& p) { return p.int_val_size(); } static uint8_t get(const TensorProto& p, int i) { return p.int_val(i); } - static string accessor_name() { return "int_val"; } - static string type_name() { return "uint8"; } + static std::string accessor_name() { return "int_val"; } + static std::string type_name() { return "uint8"; } static void CopyFromContent(const TensorProto& p, std::vector* data) { toco::port::CopyToBuffer(p.tensor_content(), @@ -272,8 +274,8 @@ struct TensorTraits> { return std::complex(p.scomplex_val(2 * i), p.scomplex_val(2 * i + 1)); } - static string accessor_name() { return "scomplex_val"; } - static string type_name() { return "complex64"; } + static std::string accessor_name() { return "scomplex_val"; } + static std::string type_name() { return "complex64"; } static void CopyFromContent(const TensorProto& p, std::vector>* data) { toco::port::CopyToBuffer(p.tensor_content(), @@ -285,8 +287,8 @@ template <> struct TensorTraits { static int size(const TensorProto& p) { return p.int_val_size(); } static int32 get(const TensorProto& p, int i) { return p.int_val(i); } - static string accessor_name() { return "int_val"; } - static string type_name() { return "int32"; } + static std::string accessor_name() { return "int_val"; } + static std::string type_name() { return "int32"; } static void CopyFromContent(const TensorProto& p, std::vector* data) { toco::port::CopyToBuffer(p.tensor_content(), reinterpret_cast(data->data())); @@ -297,8 +299,8 @@ template <> struct TensorTraits { static int size(const TensorProto& p) { return p.int64_val_size(); } static int64 get(const TensorProto& p, int i) { return p.int64_val(i); } - static string accessor_name() { return "int64_val"; } - static string type_name() { return "int64"; } + static std::string accessor_name() { return "int64_val"; } + static std::string type_name() { 
return "int64"; } static void CopyFromContent(const TensorProto& p, std::vector* data) { toco::port::CopyToBuffer(p.tensor_content(), reinterpret_cast(data->data())); @@ -309,8 +311,8 @@ template <> struct TensorTraits { static int size(const TensorProto& p) { return p.bool_val_size(); } static bool get(const TensorProto& p, int i) { return p.bool_val(i); } - static string accessor_name() { return "bool_val"; } - static string type_name() { return "bool"; } + static std::string accessor_name() { return "bool_val"; } + static std::string type_name() { return "bool"; } static void CopyFromContent(const TensorProto& p, std::vector* data) { std::vector buf(p.tensor_content().size()); toco::port::CopyToBuffer(p.tensor_content(), buf.data()); @@ -348,8 +350,8 @@ tensorflow::Status ImportTensorData(const TensorProto& input_tensor, (*output_data)[i] = last; } } else { - string accessor_name = TensorTraits::accessor_name(); - string type_name = TensorTraits::type_name(); + std::string accessor_name = TensorTraits::accessor_name(); + std::string type_name = TensorTraits::type_name(); return tensorflow::errors::InvalidArgument( absl::StrCat("Neither input_content (", input_tensor.tensor_content().size() / sizeof(T), ") nor ", @@ -527,10 +529,11 @@ tensorflow::Status CheckInputsCount( } template -string CreateConstArray(Model* model, string const& name, - std::vector > const& data) { +std::string CreateConstArray( + Model* model, std::string const& name, + std::vector> const& data) { // Utility function to create a const 1D array, useful for input parameters. - string array_name = toco::AvailableArrayName(*model, name); + std::string array_name = toco::AvailableArrayName(*model, name); auto& array = model->GetOrCreateArray(array_name); array.data_type = T; array.mutable_shape()->mutable_dims()->emplace_back( @@ -576,7 +579,7 @@ void GetOutputNamesFromNodeDef(const NodeDef& node, ++next_output; }; for (int i = 0; i < op_def.output_arg_size(); ++i) { - string multiples = op_def.output_arg(i).number_attr(); + std::string multiples = op_def.output_arg(i).number_attr(); if (!multiples.empty()) { CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; int num_outputs = GetIntAttr(node, multiples); @@ -584,7 +587,7 @@ void GetOutputNamesFromNodeDef(const NodeDef& node, add_output(); } } else { - string list = op_def.output_arg(i).type_list_attr(); + std::string list = op_def.output_arg(i).type_list_attr(); if (!list.empty()) { CHECK(HasAttr(node, list)) << "No attr named " << list; const AttrValue::ListValue& list_value = GetListAttr(node, list); @@ -624,7 +627,7 @@ void GetOutputTypesFromNodeDef(const NodeDef& node, }; for (int i = 0; i < op_def.output_arg_size(); ++i) { - string multiples = op_def.output_arg(i).number_attr(); + std::string multiples = op_def.output_arg(i).number_attr(); if (!multiples.empty()) { CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; int num_outputs = GetIntAttr(node, multiples); @@ -633,7 +636,7 @@ void GetOutputTypesFromNodeDef(const NodeDef& node, add_type(type); } } else { - string list = op_def.output_arg(i).type_list_attr(); + std::string list = op_def.output_arg(i).type_list_attr(); if (!list.empty()) { CHECK(HasAttr(node, list)) << "No attr named " << list; const AttrValue::ListValue& list_value = GetListAttr(node, list); @@ -1057,7 +1060,7 @@ tensorflow::Status ConvertIdentityNOperator( for (int i = 0; i < node.input_size(); ++i) { auto* op = new TensorFlowIdentityOperator; const auto& input_name = node.input(i); - string output_name = 
node.name(); + std::string output_name = node.name(); if (i > 0) { output_name = output_name + ":" + std::to_string(i); } @@ -1756,13 +1759,13 @@ tensorflow::Status ConvertBatchNormWithGlobalNormalizationOperator( // to the input, before feeding it into TensorFlowRsqrtOperator. // CHECK_EQ(GetFloatAttr(node, "variance_epsilon"), 0.001f); - string multiplier = node.name() + "_mul"; + std::string multiplier = node.name() + "_mul"; if (GetBoolAttr(node, "scale_after_normalization")) { // Create graph: // v -> RSQRT -> // MUL -> multiplier // gamma -----> - string rsqrt = node.name() + "_rsqrt"; + std::string rsqrt = node.name() + "_rsqrt"; auto* rsqrt_op = new TensorFlowRsqrtOperator; rsqrt_op->inputs.push_back(node.input(2)); @@ -1803,17 +1806,19 @@ tensorflow::Status ConvertFusedBatchNormOperator( TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 5)); // Declare shortcuts for the inputs. - const string& gamma_input = node.input(1); - const string& beta_input = node.input(2); - const string& moving_mean_input = node.input(3); - const string& moving_variance_input = node.input(4); + const std::string& gamma_input = node.input(1); + const std::string& beta_input = node.input(2); + const std::string& moving_mean_input = node.input(3); + const std::string& moving_variance_input = node.input(4); // Create an array holding the epsilon value (typically, 0.001). - const string epsilon_array_name = CreateConstArray( - model, node.name() + "_epsilon_array", {GetFloatAttr(node, "epsilon")}); + const std::string epsilon_array_name = + CreateConstArray(model, + node.name() + "_epsilon_array", + {GetFloatAttr(node, "epsilon")}); // Add epsilon to the moving variance. - const string epsilon_add_op_name = node.name() + "_epsilon"; + const std::string epsilon_add_op_name = node.name() + "_epsilon"; auto* epsilon_add_op = new AddOperator; epsilon_add_op->inputs.push_back(moving_variance_input); epsilon_add_op->inputs.push_back(epsilon_array_name); @@ -1821,14 +1826,14 @@ tensorflow::Status ConvertFusedBatchNormOperator( model->operators.emplace_back(epsilon_add_op); // Take the inverse square root of the (variance + epsilon). - const string rsqrt_op_name = node.name() + "_rsqrt"; + const std::string rsqrt_op_name = node.name() + "_rsqrt"; auto* rsqrt_op = new TensorFlowRsqrtOperator; rsqrt_op->inputs.push_back(epsilon_add_op_name); rsqrt_op->outputs.push_back(rsqrt_op_name); model->operators.emplace_back(rsqrt_op); // Multiply the result by gamma. - const string multiplier = node.name() + "_mul"; + const std::string multiplier = node.name() + "_mul"; auto* mul_op = new MulOperator; mul_op->inputs.push_back(rsqrt_op_name); mul_op->inputs.push_back(gamma_input); @@ -1966,8 +1971,8 @@ tensorflow::Status ConvertTransposeConvOperator( << "]."; } - const string& weights_name = node.input(TransposeConvOperator::WEIGHTS); - const string& transposed_weights_name = weights_name + "_transposed"; + const std::string& weights_name = node.input(TransposeConvOperator::WEIGHTS); + const std::string& transposed_weights_name = weights_name + "_transposed"; // Check if a TransposeOperator was already created for these weights // (can happen when multiple layers share the same weights). const Operator* existing_transpose = @@ -1980,7 +1985,7 @@ tensorflow::Status ConvertTransposeConvOperator( // because they consider this a backward conv, inverting the sense of // input/output.) 
TransposeOperator* transpose = new TransposeOperator; - string perm_array = CreateConstArray( + std::string perm_array = CreateConstArray( model, node.name() + "_transpose_perm", {2, 0, 1, 3}); transpose->inputs = {weights_name, perm_array}; transpose->outputs = {transposed_weights_name}; @@ -2137,10 +2142,10 @@ tensorflow::Status ConvertReverseSequenceOperator( void StripCaretFromArrayNames(Model* model) { for (auto& op : model->operators) { for (auto& input : op->inputs) { - input = string(absl::StripPrefix(input, "^")); + input = std::string(absl::StripPrefix(input, "^")); } for (auto& output : op->outputs) { - output = string(absl::StripPrefix(output, "^")); + output = std::string(absl::StripPrefix(output, "^")); } } for (auto& array : model->GetArrayMap()) { @@ -2152,7 +2157,7 @@ void StripCaretFromArrayNames(Model* model) { void StripZeroOutputIndexFromInputs(NodeDef* node) { for (auto& input : *node->mutable_input()) { - input = string(absl::StripSuffix(input, ":0")); + input = std::string(absl::StripSuffix(input, ":0")); } } @@ -2170,15 +2175,15 @@ void StripZeroOutputIndexFromInputs(NodeDef* node) { // all nodes, we can use that information. void AddExtraOutputs(Model* model) { // Construct the list of all arrays consumed by anything in the graph. - std::vector consumed_arrays; + std::vector consumed_arrays; // Add arrays consumed by an op. for (const auto& consumer_op : model->operators) { - for (const string& input : consumer_op->inputs) { + for (const std::string& input : consumer_op->inputs) { consumed_arrays.push_back(input); } } // Add global outputs of the model. - for (const string& output_array : model->flags.output_arrays()) { + for (const std::string& output_array : model->flags.output_arrays()) { consumed_arrays.push_back(output_array); } // Add arrays consumed by a RNN back-edge. @@ -2187,7 +2192,7 @@ void AddExtraOutputs(Model* model) { } // Now add operator outputs so that all arrays that are consumed, // are produced. - for (const string& consumed_array : consumed_arrays) { + for (const std::string& consumed_array : consumed_arrays) { // Test if consumed_array is already the output of some op. // This has occurred in a model where separate nodes had names of the form // foo:$i with the same base name foo. @@ -2195,7 +2200,7 @@ void AddExtraOutputs(Model* model) { continue; } // Split the consumed array name into the form name:output_index. - const std::vector& split = absl::StrSplit(consumed_array, ':'); + const std::vector& split = absl::StrSplit(consumed_array, ':'); // If not of the form name:output_index, then this is not an additional // output of a node with multiple outputs, so nothing to do here. if (split.size() != 2) { @@ -2288,7 +2293,7 @@ tensorflow::Status ConvertTopKV2Operator( op->inputs.push_back(node.input(0)); // K can be encoded as attr (TopK) convert it to a const. if (HasAttr(node, "k")) { - string k_array = CreateConstArray( + std::string k_array = CreateConstArray( model, node.name() + "k", {static_cast(GetIntAttr(node, "k"))}); op->inputs.push_back(k_array); } else { @@ -2346,7 +2351,7 @@ tensorflow::Status ConvertSparseToDenseOperator( TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 4)); auto* op = new SparseToDenseOperator; - for (const string& input : node.input()) { + for (const std::string& input : node.input()) { op->inputs.push_back(input); } op->outputs.push_back(node.name()); @@ -2371,7 +2376,7 @@ tensorflow::Status ConvertOneHotOperator( auto op = absl::make_unique(); op->axis = HasAttr(node, "axis") ? 
GetIntAttr(node, "axis") : -1; - for (const string& input : node.input()) { + for (const std::string& input : node.input()) { op->inputs.push_back(input); } op->outputs.push_back(node.name()); @@ -2386,7 +2391,7 @@ tensorflow::Status ConvertCTCBeamSearchDecoderOperator( TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2)); auto* op = new CTCBeamSearchDecoderOperator; - for (const string& input : node.input()) { + for (const std::string& input : node.input()) { op->inputs.push_back(input); } @@ -2434,7 +2439,7 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( count++; } else { // Optional input. - string optional_name = node.name() + "_" + std::to_string(idx); + std::string optional_name = node.name() + "_" + std::to_string(idx); model->CreateOptionalArray(optional_name); op->inputs[idx] = optional_name; } @@ -2442,7 +2447,7 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( } else { // Legacy version. std::vector done(kInputsSize); int idx = 0; - for (const string& input : node.input()) { + for (const std::string& input : node.input()) { int real_index = indices.i(idx); op->inputs[real_index] = (input); done[real_index] = true; @@ -2451,7 +2456,7 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( for (int idx = 0; idx < done.size(); idx++) { if (!done[idx]) { - string optional_name = node.name() + "_" + std::to_string(idx); + std::string optional_name = node.name() + "_" + std::to_string(idx); model->CreateOptionalArray(optional_name); op->inputs[idx] = optional_name; } @@ -2491,7 +2496,7 @@ tensorflow::Status ConvertUnidirectionalSequenceRnn( } auto* op = new UnidirectionalSequenceRnnOperator(); - for (const string& input : node.input()) { + for (const std::string& input : node.input()) { op->inputs.push_back(input); } // Only use the last one as input. @@ -2703,7 +2708,8 @@ std::unique_ptr ImportTensorFlowGraphDef( << "Unsupported explicit zero output index: " << specified_input_array.name(); } - for (const string& specified_output_array : model_flags.output_arrays()) { + for (const std::string& specified_output_array : + model_flags.output_arrays()) { CHECK(!absl::EndsWith(specified_output_array, ":0")) << "Unsupported explicit zero output index: " << specified_output_array; } @@ -2746,7 +2752,7 @@ std::unique_ptr ImportTensorFlowGraphDef( std::unique_ptr ImportTensorFlowGraphDef( const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags, - const string& input_file_contents) { + const std::string& input_file_contents) { std::unique_ptr tf_graph(new GraphDef); CHECK(ParseFromStringEitherTextOrBinary(input_file_contents, tf_graph.get())); diff --git a/tensorflow/lite/toco/import_tensorflow.h b/tensorflow/lite/toco/import_tensorflow.h index 4ada25e2fbe..a95cfee2e75 100644 --- a/tensorflow/lite/toco/import_tensorflow.h +++ b/tensorflow/lite/toco/import_tensorflow.h @@ -43,7 +43,7 @@ std::unique_ptr ImportTensorFlowGraphDef( // flags. std::unique_ptr ImportTensorFlowGraphDef( const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags, - const string& input_file_contents); + const std::string& input_file_contents); // Gets a list of supported ops by their names. 
std::vector GetPotentiallySupportedOps(); diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc index eb6ed3fdd74..98ce18bf38e 100644 --- a/tensorflow/lite/toco/import_tensorflow_test.cc +++ b/tensorflow/lite/toco/import_tensorflow_test.cc @@ -163,7 +163,7 @@ void BuildConstNode(std::initializer_list shape, TEST(FlexImportTest, ConditionalConst) { Model model; auto build_and_import_node = - [&model](const string& name, std::initializer_list shape, + [&model](const std::string& name, std::initializer_list shape, tensorflow::DataType dtype, int64_t num_elements) { NodeDef node; BuildConstNode(shape, dtype, num_elements, &node); @@ -486,8 +486,8 @@ class TensorContentTest : public ::testing::Test { break; } t.set_tensor_content( - string(reinterpret_cast(allocated_content.get()), - num_elements * sizeof(T))); + std::string(reinterpret_cast(allocated_content.get()), + num_elements * sizeof(T))); AttrValue value_attr; SetAttrValue(t, &value_attr); diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h index 89ea9d997f9..58397f5a3eb 100644 --- a/tensorflow/lite/toco/model.h +++ b/tensorflow/lite/toco/model.h @@ -287,7 +287,7 @@ struct DataTypeImpl { }; template <> struct DataTypeImpl { - typedef string Type; + typedef std::string Type; }; template <> struct DataTypeImpl { @@ -398,10 +398,10 @@ struct Operator { // names to addresses is given by the Model, which owns both Operator's and // Array's. Thus, an Operator on its own doesn't contain much information, // it is meant to be used in conjunction with the Model that owns it. - std::vector inputs; + std::vector inputs; // Output activation arrays. Same comments as for inputs apply here too. - std::vector outputs; + std::vector outputs; // If true, the operator has more outputs than are listed in the 'outputs' // member. These need to be resolved by some graph transformation. @@ -415,7 +415,7 @@ struct Operator { // It's guaranteed to be filled for `TensorFlowUnsupportedOperator`. // It's not guaranteed to be filled for other ops. Ops created by graph // transformations won't have TensorFlow NodeDef. - string tensorflow_node_def; + std::string tensorflow_node_def; protected: // Constructor used by subclasses for specific OperatorType's. @@ -1693,7 +1693,7 @@ struct TensorFlowUnsupportedOperator : Operator { TensorFlowUnsupportedOperator() : Operator(OperatorType::kUnsupported) {} // The original TF operation type. Used for diagnostic purposes. - string tensorflow_op; + std::string tensorflow_op; // A boolean indicating if the unsupported op should be treated as quantized. bool quantized = false; // A boolean indicating if the unsupported op output should allow float values @@ -2393,14 +2393,16 @@ struct Array { // Owns everything. 
class Model { public: - using ArrayMap = std::unordered_map>; + using ArrayMap = std::unordered_map>; - bool HasArray(const string& name) const { return arrays.count(name) > 0; } - Array& GetArray(const string& name) const { + bool HasArray(const std::string& name) const { + return arrays.count(name) > 0; + } + Array& GetArray(const std::string& name) const { DCHECK(HasArray(name)) << "Array not found: " << name; return *arrays.at(name); } - Array& GetOrCreateArray(const string& name) { + Array& GetOrCreateArray(const std::string& name) { // Make sure name is not used by an optional array DCHECK(!optional_arrays.count(name)); if (!HasArray(name)) { @@ -2410,17 +2412,17 @@ class Model { Array& result = GetArray(name); return result; } - void CreateOptionalArray(const string& name) { + void CreateOptionalArray(const std::string& name) { DCHECK(!arrays.count(name) && !optional_arrays.count(name)); optional_arrays.insert(name); } - bool IsOptionalArray(const string& name) const { + bool IsOptionalArray(const std::string& name) const { return optional_arrays.count(name); } // Note that this invalidates all array iterators. - void EraseArray(const string& name) { arrays.erase(name); } - void EraseArrays(std::function discardable) { + void EraseArray(const std::string& name) { arrays.erase(name); } + void EraseArrays(std::function discardable) { for (auto it = arrays.begin(); it != arrays.end();) { if (discardable(it->first)) { it = arrays.erase(it); @@ -2434,17 +2436,17 @@ class Model { int64 ArithmeticOpsCount() const { return ops_count; } - void AddInvalidInputArray(string invalid_input_array) { + void AddInvalidInputArray(std::string invalid_input_array) { invalid_input_arrays_.insert(invalid_input_array); } - const std::unordered_set& GetInvalidInputArrays() const { + const std::unordered_set& GetInvalidInputArrays() const { return invalid_input_arrays_; } // Optional arrays are used for optional tensors, // these tensors do not have data, but with reserved names as op inputs. - std::set optional_arrays; + std::set optional_arrays; // The list of operators. Notice how it's a list of unique_ptr's, implying // that the Model is what owns Operator's and keeps them alive. @@ -2467,10 +2469,10 @@ class Model { // that the Model is what owns Array's and keeps them alive. // The Operator's refer to these Array's by their name strings, not by their // addresses. See Operator::inputs, Operator::outputs. - std::unordered_map> arrays; + std::unordered_map> arrays; // Invalid input arrays. - std::unordered_set invalid_input_arrays_; + std::unordered_set invalid_input_arrays_; }; // OperatorSignature contains the information required to making versioning diff --git a/tensorflow/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc index 86a1cedd612..d3c48aee2fe 100644 --- a/tensorflow/lite/toco/model_cmdline_flags.cc +++ b/tensorflow/lite/toco/model_cmdline_flags.cc @@ -36,7 +36,7 @@ limitations under the License. namespace toco { bool ParseModelFlagsFromCommandLineFlags( - int* argc, char* argv[], string* msg, + int* argc, char* argv[], std::string* msg, ParsedModelFlags* parsed_model_flags_ptr) { ParsedModelFlags& parsed_flags = *parsed_model_flags_ptr; using tensorflow::Flag; @@ -188,7 +188,7 @@ void ReadModelFlagsFromCommandLineFlags( // Load proto containing the initial model flags. // Additional flags specified on the command line will overwrite the values. 
if (parsed_model_flags.model_flags_file.specified()) { - string model_flags_file_contents; + std::string model_flags_file_contents; QCHECK(port::file::GetContents(parsed_model_flags.model_flags_file.value(), &model_flags_file_contents, port::file::Defaults()) @@ -217,9 +217,9 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.output_arrays.specified()) { - std::vector output_arrays = + std::vector output_arrays = absl::StrSplit(parsed_model_flags.output_arrays.value(), ','); - for (const string& output_array : output_arrays) { + for (const std::string& output_array : output_arrays) { model_flags->add_output_arrays(output_array); } } @@ -251,7 +251,7 @@ void ReadModelFlagsFromCommandLineFlags( QCHECK(uses_multi_input_flags); for (const auto& input_array : absl::StrSplit(parsed_model_flags.input_arrays.value(), ',')) { - model_flags->add_input_arrays()->set_name(string(input_array)); + model_flags->add_input_arrays()->set_name(std::string(input_array)); } } if (parsed_model_flags.mean_value.specified()) { @@ -261,7 +261,7 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.mean_values.specified()) { QCHECK(uses_multi_input_flags); - std::vector mean_values = + std::vector mean_values = absl::StrSplit(parsed_model_flags.mean_values.value(), ','); QCHECK(mean_values.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < mean_values.size(); ++i) { @@ -278,7 +278,7 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.std_values.specified()) { QCHECK(uses_multi_input_flags); - std::vector std_values = + std::vector std_values = absl::StrSplit(parsed_model_flags.std_values.value(), ','); QCHECK(std_values.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < std_values.size(); ++i) { @@ -296,7 +296,7 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.input_data_types.specified()) { QCHECK(uses_multi_input_flags); - std::vector input_data_types = + std::vector input_data_types = absl::StrSplit(parsed_model_flags.input_data_types.value(), ','); QCHECK(input_data_types.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < input_data_types.size(); ++i) { @@ -319,7 +319,7 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.input_shapes.specified()) { QCHECK(uses_multi_input_flags); - std::vector input_shapes = + std::vector input_shapes = absl::StrSplit(parsed_model_flags.input_shapes.value(), ':'); QCHECK(input_shapes.size() == model_flags->input_arrays_size()); for (size_t i = 0; i < input_shapes.size(); ++i) { @@ -352,8 +352,8 @@ void ReadModelFlagsFromCommandLineFlags( for (const auto& element : parsed_model_flags.rnn_states.value().elements) { auto* rnn_state_proto = model_flags->add_rnn_states(); for (const auto& kv_pair : element) { - const string& key = kv_pair.first; - const string& value = kv_pair.second; + const std::string& key = kv_pair.first; + const std::string& value = kv_pair.second; if (key == "state_array") { rnn_state_proto->set_state_array(value); } else if (key == "back_edge_source_array") { @@ -377,8 +377,8 @@ void ReadModelFlagsFromCommandLineFlags( for (const auto& element : parsed_model_flags.model_checks.value().elements) { auto* model_check_proto = model_flags->add_model_checks(); for (const auto& kv_pair : element) { - const string& key = kv_pair.first; - const string& value = kv_pair.second; + const std::string& key = kv_pair.first; + const std::string& value = kv_pair.second; if (key == "count_type") { model_check_proto->set_count_type(value); } 
else if (key == "count_min") { @@ -411,7 +411,7 @@ void ReadModelFlagsFromCommandLineFlags( } if (parsed_model_flags.arrays_extra_info_file.specified()) { - string arrays_extra_info_file_contents; + std::string arrays_extra_info_file_contents; CHECK(port::file::GetContents( parsed_model_flags.arrays_extra_info_file.value(), &arrays_extra_info_file_contents, port::file::Defaults()) @@ -443,7 +443,7 @@ void ParseModelFlagsOrDie(int* argc, char* argv[]) { // TODO(aselle): in the future allow Google version to use // flags, and only use this mechanism for open source auto* flags = UncheckedGlobalParsedModelFlags(false); - string msg; + std::string msg; bool model_success = toco::ParseModelFlagsFromCommandLineFlags(argc, argv, &msg, flags); if (!model_success || !msg.empty()) { diff --git a/tensorflow/lite/toco/model_cmdline_flags.h b/tensorflow/lite/toco/model_cmdline_flags.h index 1642e053199..23e79e62047 100644 --- a/tensorflow/lite/toco/model_cmdline_flags.h +++ b/tensorflow/lite/toco/model_cmdline_flags.h @@ -28,7 +28,7 @@ namespace toco { // is successful. msg has the usage string if there was an error or // "--help" was specified bool ParseModelFlagsFromCommandLineFlags( - int* argc, char* argv[], string* msg, + int* argc, char* argv[], std::string* msg, ParsedModelFlags* parsed_model_flags_ptr); // Populate the ModelFlags proto with model data. void ReadModelFlagsFromCommandLineFlags( diff --git a/tensorflow/lite/toco/model_cmdline_flags_test.cc b/tensorflow/lite/toco/model_cmdline_flags_test.cc index bff8e4843a0..b87e200095c 100644 --- a/tensorflow/lite/toco/model_cmdline_flags_test.cc +++ b/tensorflow/lite/toco/model_cmdline_flags_test.cc @@ -35,8 +35,8 @@ TEST(ModelCmdlineFlagsTest, ParseArgsStringMapList) { "back_edge_source_array:rnn/basic_lstm_cell/Mul_2,size:4}", nullptr}; - string expected_input_arrays = "input_1"; - std::vector> expected_rnn_states; + std::string expected_input_arrays = "input_1"; + std::vector> expected_rnn_states; expected_rnn_states.push_back( {{"state_array", "rnn/BasicLSTMCellZeroState/zeros"}, {"back_edge_source_array", "rnn/basic_lstm_cell/Add_1"}, @@ -46,7 +46,7 @@ TEST(ModelCmdlineFlagsTest, ParseArgsStringMapList) { {"back_edge_source_array", "rnn/basic_lstm_cell/Mul_2"}, {"size", "4"}}); - string message; + std::string message; ParsedModelFlags result_flags; EXPECT_TRUE(ParseModelFlagsFromCommandLineFlags( diff --git a/tensorflow/lite/toco/tensorflow_util.cc b/tensorflow/lite/toco/tensorflow_util.cc index db9388b040c..bf5d8016857 100644 --- a/tensorflow/lite/toco/tensorflow_util.cc +++ b/tensorflow/lite/toco/tensorflow_util.cc @@ -37,16 +37,16 @@ namespace toco { using tensorflow::AttrValue; using tensorflow::GraphDef; -void LogDumpGraphDef(int log_level, const string& message, +void LogDumpGraphDef(int log_level, const std::string& message, const GraphDef& tf_graph) { if (!VLOG_IS_ON(log_level)) { return; } - std::set ops; + std::set ops; for (const auto& node : tf_graph.node()) { ops.insert(node.op()); } - string dump; + std::string dump; toco::port::AppendF(&dump, R"MSG( BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s) There are %d nodes. diff --git a/tensorflow/lite/toco/tensorflow_util.h b/tensorflow/lite/toco/tensorflow_util.h index 010fbe88b21..6abad52b9cb 100644 --- a/tensorflow/lite/toco/tensorflow_util.h +++ b/tensorflow/lite/toco/tensorflow_util.h @@ -24,7 +24,7 @@ limitations under the License. 
namespace toco { -void LogDumpGraphDef(int log_level, const string& message, +void LogDumpGraphDef(int log_level, const std::string& message, const tensorflow::GraphDef& tf_graph); } // namespace toco diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc index aa7e43350ca..18800c7b726 100644 --- a/tensorflow/lite/toco/toco.cc +++ b/tensorflow/lite/toco/toco.cc @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/lite/toco/toco_convert.h" int main(int argc, char** argv) { - toco::string msg; + std::string msg; toco::ParsedTocoFlags parsed_toco_flags; toco::ParsedModelFlags parsed_model_flags; diff --git a/tensorflow/lite/toco/toco_cmdline_flags.cc b/tensorflow/lite/toco/toco_cmdline_flags.cc index c133db8f2a4..223cfd40775 100644 --- a/tensorflow/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/lite/toco/toco_cmdline_flags.cc @@ -29,7 +29,7 @@ limitations under the License. namespace toco { bool ParseTocoFlagsFromCommandLineFlags( - int* argc, char* argv[], string* msg, + int* argc, char* argv[], std::string* msg, ParsedTocoFlags* parsed_toco_flags_ptr) { using tensorflow::Flag; ParsedTocoFlags& parsed_flags = *parsed_toco_flags_ptr; @@ -212,7 +212,7 @@ enum class FlagRequirement { // Enforces the FlagRequirements are met for a given flag. template -void EnforceFlagRequirement(const T& flag, const string& flag_name, +void EnforceFlagRequirement(const T& flag, const std::string& flag_name, FlagRequirement requirement) { if (requirement == FlagRequirement::kMustBeSpecified) { QCHECK(flag.specified()) << "Missing required flag " << flag_name; @@ -317,7 +317,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, "type of input arrays, use --input_data_type. If you are trying to " "control the quantization/dequantization of real-numbers input " "arrays in the output file, use --inference_input_type."; - std::vector input_types = + std::vector input_types = absl::StrSplit(parsed_toco_flags.input_types.value(), ','); QCHECK(!input_types.empty()); for (int i = 1; i < input_types.size(); i++) { diff --git a/tensorflow/lite/toco/toco_cmdline_flags.h b/tensorflow/lite/toco/toco_cmdline_flags.h index cf57055abc2..278c49d25e3 100644 --- a/tensorflow/lite/toco/toco_cmdline_flags.h +++ b/tensorflow/lite/toco/toco_cmdline_flags.h @@ -25,7 +25,8 @@ namespace toco { // Parse and remove arguments handled from toco. Returns true if parsing // is successful. msg has the usage string if there was an error or // "--help" was specified -bool ParseTocoFlagsFromCommandLineFlags(int* argc, char* argv[], string* msg, +bool ParseTocoFlagsFromCommandLineFlags(int* argc, char* argv[], + std::string* msg, ParsedTocoFlags* parsed_toco_flags_ptr); // Populate the TocoFlags proto with parsed_toco_flags data. void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, diff --git a/tensorflow/lite/toco/toco_cmdline_flags_test.cc b/tensorflow/lite/toco/toco_cmdline_flags_test.cc index a1066e063bc..c1d0e2f7e9b 100644 --- a/tensorflow/lite/toco/toco_cmdline_flags_test.cc +++ b/tensorflow/lite/toco/toco_cmdline_flags_test.cc @@ -29,7 +29,7 @@ TEST(TocoCmdlineFlagsTest, DefaultValue) { // TF flag parsing lib is relaying on this invariant. 
const char* args[] = {"toco", nullptr}; - string message; + std::string message; ParsedTocoFlags result_flags; EXPECT_TRUE(ParseTocoFlagsFromCommandLineFlags( @@ -41,7 +41,7 @@ TEST(TocoCmdlineFlagsTest, ParseFlags) { int argc = 2; const char* args[] = {"toco", "--allow_dynamic_tensors=false", nullptr}; - string message; + std::string message; ParsedTocoFlags result_flags; EXPECT_TRUE(ParseTocoFlagsFromCommandLineFlags( diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc index 62dacef0b60..0e04b9ce143 100644 --- a/tensorflow/lite/toco/toco_convert.cc +++ b/tensorflow/lite/toco/toco_convert.cc @@ -32,7 +32,7 @@ namespace toco { namespace { // Checks the permissions of the output file to ensure it is writeable. -void CheckOutputFilePermissions(const Arg& output_file) { +void CheckOutputFilePermissions(const Arg& output_file) { QCHECK(output_file.specified()) << "Missing required flag --output_file.\n"; QCHECK(port::file::Writable(output_file.value()).ok()) << "Specified output_file is not writable: " << output_file.value() @@ -40,7 +40,7 @@ void CheckOutputFilePermissions(const Arg& output_file) { } // Checks the permissions of the frozen model file. -void CheckFrozenModelPermissions(const Arg& input_file) { +void CheckFrozenModelPermissions(const Arg& input_file) { QCHECK(input_file.specified()) << "Missing required flag --input_file.\n"; QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok()) << "Specified input_file does not exist: " << input_file.value() << ".\n"; @@ -55,7 +55,7 @@ void CheckFrozenModelPermissions(const Arg& input_file) { void ReadInputData(const ParsedTocoFlags& parsed_toco_flags, const ParsedModelFlags& parsed_model_flags, TocoFlags* toco_flags, ModelFlags* model_flags, - string* graph_def_contents) { + std::string* graph_def_contents) { port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n"); // Ensure savedmodel_directory is not set. @@ -71,10 +71,10 @@ void ReadInputData(const ParsedTocoFlags& parsed_toco_flags, } } // namespace -tensorflow::Status Convert(const string& graph_def_contents, +tensorflow::Status Convert(const std::string& graph_def_contents, const TocoFlags& toco_flags, const ModelFlags& model_flags, - string* output_file_contents, + std::string* output_file_contents, int64* arithmetic_ops_count = nullptr) { std::unique_ptr model = Import(toco_flags, model_flags, graph_def_contents); @@ -95,12 +95,12 @@ tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags, TocoFlags toco_flags; ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags); - string graph_def_contents; + std::string graph_def_contents; ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags, &model_flags, &graph_def_contents); CheckOutputFilePermissions(parsed_toco_flags.output_file); - string output_file_contents; + std::string output_file_contents; TF_RETURN_IF_ERROR(Convert(graph_def_contents, toco_flags, model_flags, &output_file_contents)); diff --git a/tensorflow/lite/toco/toco_convert.h b/tensorflow/lite/toco/toco_convert.h index 4e3ffe5119b..85abcfcc3bb 100644 --- a/tensorflow/lite/toco/toco_convert.h +++ b/tensorflow/lite/toco/toco_convert.h @@ -22,10 +22,10 @@ limitations under the License. 
namespace toco { -tensorflow::Status Convert(const string& graph_def_contents, +tensorflow::Status Convert(const std::string& graph_def_contents, const TocoFlags& toco_flags, const ModelFlags& model_flags, - string* output_file_contents, + std::string* output_file_contents, int64* arithmetic_ops_count = nullptr); tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags, diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc index b02c1043f2b..bd6a8e79b45 100644 --- a/tensorflow/lite/toco/toco_convert_test.cc +++ b/tensorflow/lite/toco/toco_convert_test.cc @@ -32,8 +32,8 @@ TEST(TocoTest, BadInputFormat) { TocoFlags toco_flags; ModelFlags model_flags; - string input; - string output; + std::string input; + std::string output; EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(), "Unhandled input_format='FILE_FORMAT_UNKNOWN'"); @@ -44,8 +44,8 @@ TEST(TocoTest, MissingOutputArrays) { ModelFlags model_flags; toco_flags.set_input_format(TENSORFLOW_GRAPHDEF); - string input; - string output; + std::string input; + std::string output; EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(), "This model does not define output arrays, so a --output_arrays " @@ -58,8 +58,8 @@ TEST(TocoTest, BadOutputArray) { toco_flags.set_input_format(TENSORFLOW_GRAPHDEF); model_flags.add_output_arrays("output1"); - string input; - string output; + std::string input; + std::string output; EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(), "Specified output array .output1. is not produced by any op " @@ -72,7 +72,7 @@ TEST(TocoTest, BadOutputFormat) { toco_flags.set_input_format(TENSORFLOW_GRAPHDEF); model_flags.add_output_arrays("output1"); - string input = R"GraphDef( + std::string input = R"GraphDef( node { name: "output1" input: "input1" @@ -82,7 +82,7 @@ TEST(TocoTest, BadOutputFormat) { } )GraphDef"; - string output; + std::string output; EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(), "Unhandled output_format='FILE_FORMAT_UNKNOWN'"); @@ -97,7 +97,7 @@ TEST(TocoTest, SimpleFloatModel) { // Inputs are automatically selected (but that might not be a good idea). 
model_flags.add_output_arrays("output1"); - string input = R"GraphDef( + std::string input = R"GraphDef( node { name: "input1" op: "Placeholder" @@ -117,7 +117,7 @@ TEST(TocoTest, SimpleFloatModel) { } )GraphDef"; - string output; + std::string output; EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok()); EXPECT_TRUE(!output.empty()); } @@ -139,7 +139,7 @@ TEST(TocoTest, TransientStringTensors) { indices_1->set_name("indices1"); model_flags.add_output_arrays("output1"); - string input = R"GraphDef( + std::string input = R"GraphDef( node { name: "input1" op: "Placeholder" @@ -169,7 +169,7 @@ TEST(TocoTest, TransientStringTensors) { } )GraphDef"; - string output; + std::string output; EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok()); EXPECT_TRUE(!output.empty()); diff --git a/tensorflow/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc index d2f1d102c5a..8352e0fd9f2 100644 --- a/tensorflow/lite/toco/toco_port.cc +++ b/tensorflow/lite/toco/toco_port.cc @@ -28,7 +28,7 @@ double round(double x) { return ::round(x); } namespace toco { namespace port { -void CopyToBuffer(const string& src, char* dest) { +void CopyToBuffer(const std::string& src, char* dest) { memcpy(dest, src.data(), src.size()); } @@ -84,7 +84,7 @@ toco::port::file::Options ToOptions(const ::file::Options& options) { return Options(); } -tensorflow::Status Writable(const string& filename) { +tensorflow::Status Writable(const std::string& filename) { File* f = nullptr; const auto status = ::file::Open(filename, "w", &f, ::file::Defaults()); if (f) { @@ -93,28 +93,30 @@ tensorflow::Status Writable(const string& filename) { return ToStatus(status); } -tensorflow::Status Readable(const string& filename, +tensorflow::Status Readable(const std::string& filename, const file::Options& options) { return ToStatus(::file::Readable(filename, ::file::Defaults())); } -tensorflow::Status Exists(const string& filename, +tensorflow::Status Exists(const std::string& filename, const file::Options& options) { auto status = ::file::Exists(filename, ::file::Defaults()); return ToStatus(status); } -tensorflow::Status GetContents(const string& filename, string* contents, +tensorflow::Status GetContents(const std::string& filename, + std::string* contents, const file::Options& options) { return ToStatus(::file::GetContents(filename, contents, ::file::Defaults())); } -tensorflow::Status SetContents(const string& filename, const string& contents, +tensorflow::Status SetContents(const std::string& filename, + const std::string& contents, const file::Options& options) { return ToStatus(::file::SetContents(filename, contents, ::file::Defaults())); } -string JoinPath(const string& a, const string& b) { +std::string JoinPath(const std::string& a, const std::string& b) { return ::file::JoinPath(a, b); } diff --git a/tensorflow/lite/toco/toco_port.h b/tensorflow/lite/toco/toco_port.h index 5a80d29b72a..e57420fba4f 100644 --- a/tensorflow/lite/toco/toco_port.h +++ b/tensorflow/lite/toco/toco_port.h @@ -68,21 +68,23 @@ inline Options Defaults() { Options o; return o; } -tensorflow::Status GetContents(const string& filename, string* contents, +tensorflow::Status GetContents(const std::string& filename, + std::string* contents, const Options& options); +tensorflow::Status SetContents(const std::string& filename, + const std::string& contents, const Options& options); -tensorflow::Status SetContents(const string& filename, const string& contents, - const Options& options); -string JoinPath(const string& base, const string& 
filename); -tensorflow::Status Writable(const string& filename); -tensorflow::Status Readable(const string& filename, const Options& options); -tensorflow::Status Exists(const string& filename, const Options& options); +std::string JoinPath(const std::string& base, const std::string& filename); +tensorflow::Status Writable(const std::string& filename); +tensorflow::Status Readable(const std::string& filename, + const Options& options); +tensorflow::Status Exists(const std::string& filename, const Options& options); } // namespace file // Copy `src` string to `dest`. User must ensure `dest` has enough space. #if defined(PLATFORM_GOOGLE) void CopyToBuffer(const ::absl::Cord& src, char* dest); #endif // PLATFORM_GOOGLE -void CopyToBuffer(const string& src, char* dest); +void CopyToBuffer(const std::string& src, char* dest); inline uint32 ReverseBits32(uint32 n) { n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1); diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc index da0915f9739..25b48b54135 100644 --- a/tensorflow/lite/toco/toco_tooling.cc +++ b/tensorflow/lite/toco/toco_tooling.cc @@ -37,7 +37,7 @@ namespace toco { namespace { // CHECK-fails if the model contains a kUnsupported operation. void CheckUnsupportedOperations(const Model& model) { - std::set unsupported_ops; + std::set unsupported_ops; for (auto& op : model.operators) { if (op->type == OperatorType::kUnsupported) { unsupported_ops.insert( @@ -172,7 +172,7 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) { } for (int i = 0; i < model->flags.input_arrays_size(); i++) { - string const& array_name = model->flags.input_arrays(i).name(); + std::string const& array_name = model->flags.input_arrays(i).name(); auto* array = &model->GetArray(array_name); // Note that the notion of changing data types only applies to real-numbers // arrays (see the documentation for inference_input_type). @@ -209,7 +209,7 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) { std::unique_ptr Import(const TocoFlags& toco_flags, const ModelFlags& model_flags, - const string& input_file_contents) { + const std::string& input_file_contents) { std::unique_ptr model; switch (toco_flags.input_format()) { case TENSORFLOW_GRAPHDEF: { @@ -473,7 +473,8 @@ tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags, } tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model, - bool allow_custom_ops, string* output_file_contents) { + bool allow_custom_ops, + std::string* output_file_contents) { switch (toco_flags.output_format()) { case TENSORFLOW_GRAPHDEF: ExportTensorFlowGraphDef(model, output_file_contents); diff --git a/tensorflow/lite/toco/toco_tooling.h b/tensorflow/lite/toco/toco_tooling.h index 36996151949..581df4b14fd 100644 --- a/tensorflow/lite/toco/toco_tooling.h +++ b/tensorflow/lite/toco/toco_tooling.h @@ -27,7 +27,7 @@ namespace toco { // Imports the input file into a Model object. std::unique_ptr Import(const TocoFlags& toco_flags, const ModelFlags& model_flags, - const string& input_file_contents); + const std::string& input_file_contents); // Transforms a Model. The resulting Model is ready to be passed // to Export with the exact same toco_flags. @@ -42,11 +42,12 @@ inline void Transform(const TocoFlags& toco_flags, Model* model) { // Transform, to a file of the format given by // toco_flags.output_format(). 
tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model, - bool allow_custom_ops, string* output_file_contents); + bool allow_custom_ops, + std::string* output_file_contents); // This if for backward-compatibility with internal tools. inline void Export(const TocoFlags& toco_flags, const Model& model, - string* output_file_contents) { + std::string* output_file_contents) { auto status = Export(toco_flags, model, true, output_file_contents); if (!status.ok()) { LOG(QFATAL) << status.error_message(); diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc index 82ef4445a84..be4cda8aa3d 100644 --- a/tensorflow/lite/toco/tooling_util.cc +++ b/tensorflow/lite/toco/tooling_util.cc @@ -53,8 +53,8 @@ absl::string_view FindLongestCommonPrefix(absl::string_view a, return absl::string_view(a.data(), count); } -string LogName(const Operator& op) { - const string& opname = HelpfulOperatorTypeName(op); +std::string LogName(const Operator& op) { + const std::string& opname = HelpfulOperatorTypeName(op); if (op.outputs.empty()) { return toco::port::StringF("{%s operator}", opname); } else { @@ -63,7 +63,7 @@ string LogName(const Operator& op) { } } -string ArrayDataTypeName(ArrayDataType data_type) { +std::string ArrayDataTypeName(ArrayDataType data_type) { switch (data_type) { case ArrayDataType::kFloat: return "float"; @@ -96,7 +96,7 @@ string ArrayDataTypeName(ArrayDataType data_type) { } } -bool IsInputArray(const Model& model, const string& array_name) { +bool IsInputArray(const Model& model, const std::string& array_name) { for (const auto& input_array : model.flags.input_arrays()) { if (array_name == input_array.name()) { return true; @@ -105,7 +105,7 @@ bool IsInputArray(const Model& model, const string& array_name) { return false; } -bool IsOutputArray(const Model& model, const string& array_name) { +bool IsOutputArray(const Model& model, const std::string& array_name) { for (const auto& output_array : model.flags.output_arrays()) { if (array_name == output_array) { return true; @@ -114,7 +114,7 @@ bool IsOutputArray(const Model& model, const string& array_name) { return false; } -bool IsArrayConsumed(const Model& model, const string& name) { +bool IsArrayConsumed(const Model& model, const std::string& name) { if (GetOpWithInput(model, name)) { return true; } @@ -131,7 +131,7 @@ bool IsArrayConsumed(const Model& model, const string& name) { int CountTrueOutputs(const Model& model, const Operator& op) { int count = 0; - for (const string& output : op.outputs) { + for (const std::string& output : op.outputs) { if (IsArrayConsumed(model, output)) { ++count; } @@ -139,7 +139,7 @@ int CountTrueOutputs(const Model& model, const Operator& op) { return count; } -int CountOpsWithInput(const Model& model, const string& array_name) { +int CountOpsWithInput(const Model& model, const std::string& array_name) { int count = 0; for (const auto& op : model.operators) { for (auto& input : op->inputs) { @@ -155,7 +155,7 @@ int CountOpsWithInput(const Model& model, const string& array_name) { return count; } -bool DeleteArrayIfUnused(const string& array_name, Model* model) { +bool DeleteArrayIfUnused(const std::string& array_name, Model* model) { if (IsDiscardableArray(*model, array_name) && CountOpsWithInput(*model, array_name) == 0 && GetOpWithOutput(*model, array_name) == nullptr) { @@ -165,7 +165,7 @@ bool DeleteArrayIfUnused(const string& array_name, Model* model) { return false; } -bool DeleteArrayIfUnusedOutsideOfOp(const string& array_name, +bool 
DeleteArrayIfUnusedOutsideOfOp(const std::string& array_name, const Operator* op, Model* model) { if (!IsDiscardableArray(*model, array_name)) { return false; @@ -187,10 +187,10 @@ bool DeleteArrayIfUnusedOutsideOfOp(const string& array_name, } void DeleteOpAndArrays(Model* model, const Operator* op) { - for (const string& array_name : op->inputs) { + for (const std::string& array_name : op->inputs) { DeleteArrayIfUnusedOutsideOfOp(array_name, op, model); } - for (const string& array_name : op->outputs) { + for (const std::string& array_name : op->outputs) { DeleteArrayIfUnusedOutsideOfOp(array_name, op, model); } auto op_it = FindOp(*model, op); @@ -199,7 +199,7 @@ void DeleteOpAndArrays(Model* model, const Operator* op) { } std::vector>::const_iterator FindOpWithOutput( - const Model& model, const string& array_name) { + const Model& model, const std::string& array_name) { for (auto it = model.operators.begin(); it != model.operators.end(); ++it) { for (auto& output : it->get()->outputs) { if (output == array_name) { @@ -211,7 +211,7 @@ std::vector>::const_iterator FindOpWithOutput( } std::vector>::iterator FindOpWithOutput( - Model& model, const string& array_name) { + Model& model, const std::string& array_name) { for (auto it = model.operators.begin(); it != model.operators.end(); ++it) { for (auto& output : it->get()->outputs) { if (output == array_name) { @@ -222,14 +222,14 @@ std::vector>::iterator FindOpWithOutput( return model.operators.end(); } -Operator* GetOpWithOutput(const Model& model, const string& array_name) { +Operator* GetOpWithOutput(const Model& model, const std::string& array_name) { auto it = FindOpWithOutput(model, array_name); return it == model.operators.end() ? nullptr : it->get(); } // GetFirstOpWithInput assumes that this finds the first op. std::vector>::const_iterator FindOpWithInput( - const Model& model, const string& array_name) { + const Model& model, const std::string& array_name) { for (auto it = model.operators.begin(); it != model.operators.end(); ++it) { for (auto& input : it->get()->inputs) { if (input == array_name) { @@ -241,7 +241,7 @@ std::vector>::const_iterator FindOpWithInput( } std::vector>::iterator FindOpWithInput( - Model& model, const string& array_name) { + Model& model, const std::string& array_name) { for (auto it = model.operators.begin(); it != model.operators.end(); ++it) { for (auto& input : it->get()->inputs) { if (input == array_name) { @@ -272,18 +272,19 @@ std::vector>::iterator FindOp(Model& model, return model.operators.end(); } -Operator* GetOpWithInput(const Model& model, const string& array_name) { +Operator* GetOpWithInput(const Model& model, const std::string& array_name) { auto it = FindOpWithInput(model, array_name); return it == model.operators.end() ? nullptr : it->get(); } -Operator* GetFirstOpWithInput(const Model& model, const string& array_name) { +Operator* GetFirstOpWithInput(const Model& model, + const std::string& array_name) { auto it = FindOpWithInput(model, array_name); return it == model.operators.end() ? 
nullptr : it->get(); } -void ReplaceArrayUsage(Model* model, const string& old_array_name, - const string& new_array_name) { +void ReplaceArrayUsage(Model* model, const std::string& old_array_name, + const std::string& new_array_name) { for (auto& op_it : model->operators) { Operator* op = op_it.get(); for (size_t i = 0; i < op->inputs.size(); ++i) { @@ -299,11 +300,12 @@ void ReplaceArrayUsage(Model* model, const string& old_array_name, } } -string FormatArraysList(const Model& model, const std::vector& list) { +std::string FormatArraysList(const Model& model, + const std::vector& list) { if (list.empty()) { return "[]"; } - string result = ""; + std::string result = ""; if (list.size() > 1) { result += "[ "; } @@ -459,7 +461,7 @@ const char* OperatorTypeName(OperatorType type) { } } -string HelpfulOperatorTypeName(const Operator& op) { +std::string HelpfulOperatorTypeName(const Operator& op) { if (op.type == OperatorType::kUnsupported) { return toco::port::StringF( "(Unsupported TensorFlow op: %s)", @@ -503,7 +505,7 @@ void LogSummary(int log_level, const Model& model) { } } -void LogArray(int log_level, const Model& model, const string& name) { +void LogArray(int log_level, const Model& model, const std::string& name) { VLOG(log_level) << "Array: " << name; if (!model.HasArray(name)) { VLOG(log_level) << " DOES NOT EXIST"; @@ -524,7 +526,7 @@ void LogArray(int log_level, const Model& model, const string& name) { if (array_shape.dimensions_count() == 0) { VLOG(log_level) << " (Zero dimensions)"; } else { - string message = " Dims: "; + std::string message = " Dims: "; bool first = true; for (const int dim : array_shape.dims()) { if (!first) { @@ -568,10 +570,10 @@ void DumpGraphvizVideoFrame(const Model& model) { // this new video-dumping feature. 
static int dump_id = 0; static std::unordered_set dump_hashes; - string graphviz_dump; + std::string graphviz_dump; DumpGraphviz(model, &graphviz_dump, toco::port::StringF("VIDEO frame:%05d", dump_id)); - std::size_t hash = std::hash{}(graphviz_dump); + std::size_t hash = std::hash{}(graphviz_dump); if (!dump_hashes.count(hash)) { LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id; dump_hashes.insert(hash); @@ -585,13 +587,13 @@ void DumpGraphvizVideoFrame(const Model& model) { } } -void LogDump(int log_level, const string& message, const Model& model) { +void LogDump(int log_level, const std::string& message, const Model& model) { namespace port = toco::port; const auto& dump_options = *GraphVizDumpOptions::singleton(); DumpGraphvizVideoFrame(model); if (!dump_options.dump_graphviz.empty()) { - string graphviz_dump; + std::string graphviz_dump; DumpGraphviz(model, &graphviz_dump, message); const auto result = port::file::SetContents( @@ -608,7 +610,7 @@ void LogDump(int log_level, const string& message, const Model& model) { } VLOG(log_level) << "BEGIN DUMP OF TOCO MODEL (" << message << ")"; LogSummary(log_level, model); - std::unordered_set already_printed_arrays; + std::unordered_set already_printed_arrays; for (const auto& op : model.operators) { for (const auto& input : op->inputs) { if (!already_printed_arrays.count(input)) { @@ -759,7 +761,7 @@ int RequiredBufferSizeForShape(const Shape& shape) { return max_offset; } -bool IsConstantParameterArray(const Model& model, const string& name) { +bool IsConstantParameterArray(const Model& model, const std::string& name) { if (!model.HasArray(name)) { return false; } @@ -858,7 +860,7 @@ bool CompareConstantArrays(const Array& lhs_array, const Array& rhs_array) { namespace { // Take an array name, which may be something like "name:3_5" and make it // acceptable as a TF node name, say "name_3_5"; -string SanitizeNameForTFNode(const string& array_name) { +std::string SanitizeNameForTFNode(const std::string& array_name) { auto node_name = array_name; std::replace(node_name.begin(), node_name.end(), ':', '_'); return node_name; @@ -866,7 +868,7 @@ string SanitizeNameForTFNode(const string& array_name) { void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags) { for (const auto& input_array : model_flags.input_arrays()) { - for (const string& output_array : model_flags.output_arrays()) { + for (const std::string& output_array : model_flags.output_arrays()) { QCHECK_NE(input_array.name(), output_array) << "The array " << output_array << " is listed in both --input_arrays and --output_arrays."; @@ -874,7 +876,7 @@ void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags) { } } -bool IsAsciiPrintable(const string& name) { +bool IsAsciiPrintable(const std::string& name) { for (char c : name) { if (!absl::ascii_isprint(c)) { return false; @@ -883,8 +885,8 @@ bool IsAsciiPrintable(const string& name) { return true; } -string DumpAscii(const string& name) { - string result; +std::string DumpAscii(const std::string& name) { + std::string result; port::AppendF(&result, "ASCII | Hex\n"); port::AppendF(&result, "------+----\n"); for (char c : name) { @@ -909,7 +911,7 @@ void CheckNonAsciiIOArrays(const ModelFlags& model_flags) { << "Here is a dump of the string:\n\n" << DumpAscii(input_array.name()); } - for (const string& output_array : model_flags.output_arrays()) { + for (const std::string& output_array : model_flags.output_arrays()) { QCHECK(IsAsciiPrintable(output_array)) << "Non-ASCII-printable character found in 
--output_arrays: " << output_array << ". Pass --allow_nonascii_arrays to allow that. " @@ -932,7 +934,7 @@ void CheckNonExistentIOArrays(const Model& model) { "Is it a typo? This should not happen. If you trigger this error " "please send a bug report (with code to reproduce this error), to the " "TensorFlow Lite team."; - for (const string& output_array : model.flags.output_arrays()) { + for (const std::string& output_array : model.flags.output_arrays()) { if (IsConstantParameterArray(model, output_array)) { continue; // It is OK to request that a constant be an output. } @@ -984,7 +986,7 @@ void FixNoMissingArray(Model* model) { } } if (model->flags.allow_nonexistent_arrays()) { - for (const string& output_array : model->flags.output_arrays()) { + for (const std::string& output_array : model->flags.output_arrays()) { model->GetOrCreateArray(output_array); } for (const auto& rnn_state : model->flags.rnn_states()) { @@ -995,7 +997,7 @@ void FixNoMissingArray(Model* model) { } void CheckNoOrphanedArray(const Model& model) { - std::unordered_set arrays_without_known_use; + std::unordered_set arrays_without_known_use; for (const auto& array : model.GetArrayMap()) { if (IsDiscardableArray(model, array.first)) { arrays_without_known_use.insert(array.first); @@ -1022,7 +1024,7 @@ void CheckNoOrphanedArray(const Model& model) { } void FixNoOrphanedArray(Model* model) { - std::unordered_set arrays_without_known_use; + std::unordered_set arrays_without_known_use; for (const auto& array : model->GetArrayMap()) { arrays_without_known_use.insert(array.first); } @@ -1071,11 +1073,11 @@ void CheckEachArray(const Model& model) { // Check name. Either "name_with_suffix_8", "name_with_port:3", but not // "name_with_both:3_8". - const string& name = array_entry.first; + const std::string& name = array_entry.first; auto colon_pos = name.find_first_of(":"); - if (colon_pos != string::npos) { + if (colon_pos != std::string::npos) { CHECK_EQ(name.substr(colon_pos + 1).find_first_not_of("0123456789"), - string::npos) + std::string::npos) << "Array '" << name << "' has non-digit characters after colon."; } CHECK_GT(colon_pos, 0) << "Array '" << name @@ -1084,7 +1086,7 @@ void CheckEachArray(const Model& model) { } void CheckOperatorOrdering(const Model& model) { - std::unordered_set arrays_behind_us; + std::unordered_set arrays_behind_us; for (const auto& array_entry : model.GetArrayMap()) { if (!GetOpWithOutput(model, array_entry.first)) { arrays_behind_us.insert(array_entry.first); @@ -1103,13 +1105,13 @@ void CheckOperatorOrdering(const Model& model) { arrays_behind_us.insert(output); } } - for (const string& output_array : model.flags.output_arrays()) { + for (const std::string& output_array : model.flags.output_arrays()) { CHECK(arrays_behind_us.count(output_array)); } } void FixOperatorOrdering(Model* model) { - std::unordered_set arrays_behind_us; + std::unordered_set arrays_behind_us; for (const auto& array_entry : model->GetArrayMap()) { if (!GetOpWithOutput(*model, array_entry.first)) { arrays_behind_us.insert(array_entry.first); @@ -1123,7 +1125,7 @@ void FixOperatorOrdering(Model* model) { for (std::size_t i = 0; i < old_operators.size(); i++) { remaining.insert(i); } - std::unordered_map reason_why_leftover; + std::unordered_map reason_why_leftover; while (true) { bool inserted_something = false; for (const auto& i : remaining) { @@ -1133,7 +1135,7 @@ void FixOperatorOrdering(Model* model) { for (const auto& input : op->inputs) { if (!IsConstantParameterArray(*model, input) && 
!arrays_behind_us.count(input)) { - for (const string& output : op->outputs) { + for (const std::string& output : op->outputs) { reason_why_leftover[output] = input; } can_insert = false; @@ -1166,15 +1168,15 @@ void FixOperatorOrdering(Model* model) { LOG(ERROR) << "BEGIN TRACE OF OPERATOR WITH BAD INPUT"; LOG(ERROR) << "Here is the first-encountered operator with a bad input: "; const Operator* bad_op = old_operators[*remaining.begin()].get(); - std::unordered_set bad_inputs_already_traced; + std::unordered_set bad_inputs_already_traced; // The following while(true) loop should always end with a LOG(FATAL). while (true) { LOG(ERROR) << HelpfulOperatorTypeName(*bad_op) << " : " << FormatArraysList(*model, bad_op->inputs) << " -> " << FormatArraysList(*model, bad_op->outputs); bool found_bad_output = false; - string bad_output; - for (const string& output : bad_op->outputs) { + std::string bad_output; + for (const std::string& output : bad_op->outputs) { if (reason_why_leftover.count(output)) { found_bad_output = true; bad_output = output; @@ -1182,7 +1184,7 @@ void FixOperatorOrdering(Model* model) { } } CHECK(found_bad_output); - const string& bad_input = reason_why_leftover[bad_output]; + const std::string& bad_input = reason_why_leftover[bad_output]; LOG(ERROR) << "The bad input here is: " << bad_input; if (bad_inputs_already_traced.count(bad_input)) { LOG(FATAL) @@ -1198,7 +1200,7 @@ void FixOperatorOrdering(Model* model) { bad_op = nullptr; for (const auto& i : remaining) { const Operator* op = old_operators[i].get(); - for (const string& output : op->outputs) { + for (const std::string& output : op->outputs) { if (bad_input == output) { bad_op = op; break; @@ -1233,7 +1235,7 @@ void CheckInvariants(const Model& model) { } void CheckCountInRange(const ::toco::ModelFlags::ModelCheck& model_check, - const int count, const string& count_description) { + const int count, const std::string& count_description) { if (model_check.count_min() >= 0) { CHECK_GE(count, model_check.count_min()) << "Mismatch in " << count_description << ": count was " << count @@ -1251,7 +1253,7 @@ void CheckCountInRange(const ::toco::ModelFlags::ModelCheck& model_check, void CheckModelCounts(const Model& model) { std::unordered_multiset ops_by_type; - std::unordered_map op_type_by_name; + std::unordered_map op_type_by_name; if (model.flags.model_checks_size() == 0) { return; } @@ -1261,7 +1263,7 @@ void CheckModelCounts(const Model& model) { op_type_by_name[OperatorTypeName(op->type)] = op->type; } for (const auto& model_check : model.flags.model_checks()) { - string count_type = model_check.count_type(); + std::string count_type = model_check.count_type(); if (count_type == "None") { continue; } else if (count_type == "Arrays") { @@ -1284,12 +1286,12 @@ void CheckModelCounts(const Model& model) { } void FixEdgeArrays(Model* model) { - for (const string& output_array_name : model->flags.output_arrays()) { + for (const std::string& output_array_name : model->flags.output_arrays()) { if (!GetOpWithOutput(*model, output_array_name)) { // Output has no operator producing it. Change that by inserting a copy. LOG(WARNING) << "Fixing constant output array " << output_array_name << " by inserting a copy. 
This is not optimal."; - string intermediate_array_name = + std::string intermediate_array_name = AvailableArrayName(*model, output_array_name + "_copy"); CloneArray(model, output_array_name, intermediate_array_name); InsertCopyOperator(model, intermediate_array_name, output_array_name); @@ -1378,8 +1380,8 @@ void CopyArrayAttribs(const Array& source_array, Array* target_array) { } } // namespace -void InsertCopyOperator(Model* model, const string& source_array_name, - const string& target_array_name) { +void InsertCopyOperator(Model* model, const std::string& source_array_name, + const std::string& target_array_name) { // Reshape to the same size. This should be a no-op. const Array& source_array = model->GetArray(source_array_name); std::vector shape = source_array.shape().dims(); @@ -1404,8 +1406,8 @@ void InsertCopyOperator(Model* model, const string& source_array_name, model->operators.emplace_back(copy_op); } -void CloneArray(Model* model, const string& source_array_name, - const string& target_array_name) { +void CloneArray(Model* model, const std::string& source_array_name, + const std::string& target_array_name) { CHECK(!model->HasArray(target_array_name)); const Array& source_array = model->GetArray(source_array_name); Array& target_array = model->GetOrCreateArray(target_array_name); @@ -1479,7 +1481,7 @@ void MakeArrayDims(int num_dims, int batch, int height, int width, int depth, } } -void CreateOrCheckRnnStateArray(const string& name, int size, +void CreateOrCheckRnnStateArray(const std::string& name, int size, int state_num_dims, Model* model) { int batch = 1; int num_dims = -1; @@ -1781,7 +1783,7 @@ int ElementSize(ArrayDataType data_type) { } } -void DropMinMax(Model* model, const string& array_name) { +void DropMinMax(Model* model, const std::string& array_name) { auto& array = model->GetArray(array_name); if (!!array.minmax) { LOG(WARNING) << "Dropping MinMax information in array " << array_name @@ -1790,7 +1792,8 @@ void DropMinMax(Model* model, const string& array_name) { } } -bool IsAllocatableTransientArray(const Model& model, const string& array_name) { +bool IsAllocatableTransientArray(const Model& model, + const std::string& array_name) { // Optional array is not transient if (model.IsOptionalArray(array_name)) return false; // The model's input and output arrays are externally allocated. 
@@ -1818,15 +1821,15 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) { return true; } -string AvailableArrayName(const Model& model, const string& name) { - string sanitized_name = SanitizeNameForTFNode(name); +std::string AvailableArrayName(const Model& model, const std::string& name) { + std::string sanitized_name = SanitizeNameForTFNode(name); if (!model.HasArray(sanitized_name) && !model.IsOptionalArray(sanitized_name)) { return sanitized_name; } const int kNumSuffixesToTry = 1000; for (int i = 0; i < kNumSuffixesToTry; i++) { - const string& name_with_suffix = + const std::string& name_with_suffix = toco::port::StringF("%s_%d", sanitized_name, i); if (!model.HasArray(name_with_suffix) && !model.IsOptionalArray(name_with_suffix)) { @@ -1839,7 +1842,7 @@ string AvailableArrayName(const Model& model, const string& name) { return ""; } -string ShapeToString(const Shape& shape) { +std::string ShapeToString(const Shape& shape) { if (shape.dimensions_count() == 0) { return "[]"; } @@ -1847,7 +1850,7 @@ string ShapeToString(const Shape& shape) { return absl::StrCat("[ ", absl::StrJoin(shape.dims(), ", "), " ]"); } -void PrintArrayShape(Model* model, const string& name) { +void PrintArrayShape(Model* model, const std::string& name) { if (!model->GetArray(name).has_shape()) { LOG(INFO) << name << " has no shape"; return; @@ -1856,7 +1859,7 @@ void PrintArrayShape(Model* model, const string& name) { << " has shape: " << ShapeToString(model->GetArray(name).shape()); } -bool IsArrayFullyConnectedWeights(const Model& model, const string& name) { +bool IsArrayFullyConnectedWeights(const Model& model, const std::string& name) { bool is_fc_weights = false; bool is_something_else = false; for (const auto& op : model.operators) { @@ -1874,8 +1877,8 @@ bool IsArrayFullyConnectedWeights(const Model& model, const string& name) { return is_fc_weights; } -string CreateInt32Array(Model* model, const string& param_name, - const std::vector& value) { +std::string CreateInt32Array(Model* model, const std::string& param_name, + const std::vector& value) { auto param_array_name = AvailableArrayName(*model, param_name); auto& param_array = model->GetOrCreateArray(param_array_name); param_array.mutable_shape()->ReplaceDims({static_cast(value.size())}); @@ -2031,7 +2034,7 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) { return true; } -string FormattedNumber(int64 x) { +std::string FormattedNumber(int64 x) { const int64 million = 1000000; const int64 billion = 1000000000; if (x < 10000) { @@ -2222,7 +2225,7 @@ int AxesCount(AxesOrder axes_order) { } } -bool IsDiscardableArray(const Model& model, const string& array_name) { +bool IsDiscardableArray(const Model& model, const std::string& array_name) { if (IsInputArray(model, array_name) || IsOutputArray(model, array_name)) { return false; } @@ -2338,9 +2341,9 @@ void FinishBuildingRNNStates(Model* model) { // Returns the array names that match the ArraysExtraInfo's name and // name_regexp. The regexp match is for a full match. 
-std::unordered_set ScanArrayNames( +std::unordered_set ScanArrayNames( const Model& model, const toco::ArraysExtraInfo_Entry& entry) { - std::unordered_set matches; + std::unordered_set matches; if (model.HasArray(entry.name())) { matches.insert(entry.name()); } @@ -2409,7 +2412,7 @@ void UndoWeightsShuffling(Model* model) { if (fc_op.weights_format == FullyConnectedWeightsFormat::kDefault) { continue; } - const string& weights_name = fc_op.inputs[1]; + const std::string& weights_name = fc_op.inputs[1]; QCHECK_EQ(CountOpsWithInput(*model, weights_name), 1); auto& weights_array = model->GetArray(weights_name); QCHECK(weights_array.data_type == ArrayDataType::kUint8); diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h index 6fd13be182c..438ce19970d 100644 --- a/tensorflow/lite/toco/tooling_util.h +++ b/tensorflow/lite/toco/tooling_util.h @@ -54,44 +54,45 @@ constexpr int kLogLevelModelUnchanged = 2; absl::string_view FindLongestCommonPrefix(absl::string_view a, absl::string_view b); -string LogName(const Operator& op); +std::string LogName(const Operator& op); -string ArrayDataTypeName(ArrayDataType data_type); +std::string ArrayDataTypeName(ArrayDataType data_type); // Returns true if the given array is specified as a model input array. -bool IsInputArray(const Model& model, const string& array_name); +bool IsInputArray(const Model& model, const std::string& array_name); // Returns true if the given array is specified as a model output array. -bool IsOutputArray(const Model& model, const string& array_name); +bool IsOutputArray(const Model& model, const std::string& array_name); -bool IsArrayConsumed(const Model& model, const string& name); +bool IsArrayConsumed(const Model& model, const std::string& name); int CountTrueOutputs(const Model& model, const Operator& op); -int CountOpsWithInput(const Model& model, const string& array_name); -bool DeleteArrayIfUnused(const string& array_name, Model* model); +int CountOpsWithInput(const Model& model, const std::string& array_name); +bool DeleteArrayIfUnused(const std::string& array_name, Model* model); // Deletes the op and any of its input and output arrays if they are unused // after the op has been deleted. void DeleteOpAndArrays(Model* model, const Operator* op); std::vector>::const_iterator FindOpWithOutput( - const Model& model, const string& array_name); -Operator* GetOpWithOutput(const Model& model, const string& array_name); + const Model& model, const std::string& array_name); +Operator* GetOpWithOutput(const Model& model, const std::string& array_name); std::vector>::iterator FindOpWithOutput( - Model& model, const string& array_name); + Model& model, const std::string& array_name); std::vector>::const_iterator FindOpWithInput( - const Model& model, const string& array_name); + const Model& model, const std::string& array_name); std::vector>::iterator FindOpWithInput( - Model& model, const string& array_name); + Model& model, const std::string& array_name); -Operator* GetOpWithInput(const Model& model, const string& array_name); -Operator* GetFirstOpWithInput(const Model& model, const string& array_name); +Operator* GetOpWithInput(const Model& model, const std::string& array_name); +Operator* GetFirstOpWithInput(const Model& model, + const std::string& array_name); // Replaces all uses of the |old_array_name| with the |new_array_name|. 
-void ReplaceArrayUsage(Model* model, const string& old_array_name, - const string& new_array_name); +void ReplaceArrayUsage(Model* model, const std::string& old_array_name, + const std::string& new_array_name); std::vector>::const_iterator FindOp( const Model& model, const Operator* op); @@ -99,15 +100,15 @@ std::vector>::iterator FindOp(Model& model, const Operator* op); const char* OperatorTypeName(OperatorType type); -string HelpfulOperatorTypeName(const Operator& op); +std::string HelpfulOperatorTypeName(const Operator& op); // Whether the operator can be fused with an activation function. Note that this // will return false by default for new operators; fusing support is opt-in. bool OperatorSupportsFusedActivation(OperatorType type); void DumpGraphvizVideoFrame(const Model& model); -void LogDump(int log_level, const string& message, const Model& model); -void LogSummary(int log_level, const string& message, const Model& model); +void LogDump(int log_level, const std::string& message, const Model& model); +void LogSummary(int log_level, const std::string& message, const Model& model); // TODO(b/36075966): Clean up when dims superseded by array shape. void ExtendShape(Shape* shape, int new_shape_size); @@ -143,12 +144,12 @@ inline ::tflite::RuntimeShape ToRuntimeShape(const Shape& shape) { return ::tflite::RuntimeShape(shape.dimensions_count(), shape.dims().data()); } -bool IsArrayFullyConnectedWeights(const Model& model, const string& name); +bool IsArrayFullyConnectedWeights(const Model& model, const std::string& name); // If there is a wildcard dimension (-1), this may return a negative value. int RequiredBufferSizeForShape(const Shape& shape); -bool IsConstantParameterArray(const Model& model, const string& name); +bool IsConstantParameterArray(const Model& model, const std::string& name); // Compares two constant parameter arrays for exact equality. bool CompareConstantArrays(const Array& lhs_array, const Array& rhs_array); @@ -193,12 +194,12 @@ void CopyArrayBuffer(const Array& source_array, Array* target_array) { // Inserts a no-op reshape operator between the source array and the target // array. This effectively just copies the data. -void InsertCopyOperator(Model* model, const string& source_array_name, - const string& target_array_name); +void InsertCopyOperator(Model* model, const std::string& source_array_name, + const std::string& target_array_name); // Clones an array with all data and parameters. -void CloneArray(Model* model, const string& source_array_name, - const string& target_array_name); +void CloneArray(Model* model, const std::string& source_array_name, + const std::string& target_array_name); void ResolveModelFlags(const ModelFlags& model_flags, Model* model); @@ -245,32 +246,33 @@ inline std::vector ReverseOffset(const Shape& shape, int index) { int ElementSize(ArrayDataType data_type); -void DropMinMax(Model* model, const string& array_name); +void DropMinMax(Model* model, const std::string& array_name); -bool IsAllocatableTransientArray(const Model& model, const string& array_name); +bool IsAllocatableTransientArray(const Model& model, + const std::string& array_name); -void CreateOrCheckRnnStateArray(const string& name, int size, +void CreateOrCheckRnnStateArray(const std::string& name, int size, int state_num_dims, Model* model); -string AvailableArrayName(const Model& model, const string& name); +std::string AvailableArrayName(const Model& model, const std::string& name); // Formats a shape as a string: [ dims(0), dims(1), ..., dims(num_dims-1) ]. 
-string ShapeToString(const Shape& shape); +std::string ShapeToString(const Shape& shape); -void PrintArrayShape(Model* model, const string& name); +void PrintArrayShape(Model* model, const std::string& name); void MakeArrayDims(int num_dims, int batch, int height, int width, int depth, std::vector* out_dims); // Defines a constant int32 array with the provided values formatted for use // as op parameters. -string CreateInt32Array(Model* model, const string& param_name, - const std::vector& value); +std::string CreateInt32Array(Model* model, const std::string& param_name, + const std::vector& value); bool EstimateArithmeticOpsCount(const Model& model, const Operator& op, int64* result); bool EstimateArithmeticOpsCount(const Model& model, int64* result); -string FormattedNumber(int64 x); +std::string FormattedNumber(int64 x); int AxesCount(AxesOrder axes_order); @@ -297,7 +299,7 @@ void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order, // that array. The idea is that we can't ever discard arrays that are either // an input or an output of the whole graph, or that appear in RNN back-edges, // as that would undercut explicit flags that the user might pass. -bool IsDiscardableArray(const Model& model, const string& array_name); +bool IsDiscardableArray(const Model& model, const std::string& array_name); void CheckFinalDataTypesSatisfied(const Model& model); @@ -362,7 +364,7 @@ void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst); // Delete Array if it's discardable and not referenced as input or output array // by any other op than the specified op. -bool DeleteArrayIfUnusedOutsideOfOp(const string& array_name, +bool DeleteArrayIfUnusedOutsideOfOp(const std::string& array_name, const Operator* op, Model* model); } // namespace toco From 406d9b5521a6bd665a8773573a529fe31b95e5ce Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 17 Jun 2020 09:53:14 -0700 Subject: [PATCH 0405/1390] Make the names of "activation", "activation_state", "output_state" variables consistent: Use "output_state", used by the float implementation (LstmStepFloat). Remove a few variables in implementation and test that were representing the same value (the index of these tensors, mostly). Also rename "input cell state" to just "cell state". 
PiperOrigin-RevId: 316908413 Change-Id: Icb64ecd31c90f45ef21cf7d48849fb2ec0975d3a --- tensorflow/lite/kernels/lstm.cc | 169 +++++++++--------- tensorflow/lite/kernels/lstm_eval.cc | 82 +++++---- tensorflow/lite/kernels/lstm_eval.h | 15 +- tensorflow/lite/kernels/lstm_shared.h | 4 +- tensorflow/lite/kernels/lstm_test.cc | 76 ++++---- .../kernels/unidirectional_sequence_lstm.cc | 53 +++--- .../unidirectional_sequence_lstm_test.cc | 39 ++-- .../calibration/builtin_logging_ops/lstm.cc | 33 ++-- 8 files changed, 226 insertions(+), 245 deletions(-) diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc index 74caafbd0c7..0e0c1b9c0f0 100644 --- a/tensorflow/lite/kernels/lstm.cc +++ b/tensorflow/lite/kernels/lstm.cc @@ -68,19 +68,19 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( const float cell_clip = params->cell_clip; const float proj_clip = params->proj_clip; - const TfLiteTensor* cell_tensor = - GetVariableInput(context, node, kInputCellStateTensor); - TF_LITE_ENSURE(context, cell_tensor != nullptr); + const TfLiteTensor* cell_state = + GetVariableInput(context, node, kCellStateTensor); + TF_LITE_ENSURE(context, cell_state != nullptr); const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor); - auto* cell_params = - static_cast(cell_tensor->quantization.params); + auto* cell_state_params = + static_cast(cell_state->quantization.params); auto* proj_params = static_cast( output_tensor->quantization.params); if (cell_clip > 0.0) { - integer_lstm_param->quantized_cell_clip = static_cast( - std::min(std::max(cell_clip / cell_params->scale->data[0], -32768.0f), - 32767.0f)); + integer_lstm_param->quantized_cell_clip = static_cast(std::min( + std::max(cell_clip / cell_state_params->scale->data[0], -32768.0f), + 32767.0f)); } else { integer_lstm_param->quantized_cell_clip = 0; } @@ -134,9 +134,9 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( const TfLiteTensor* projection_weights = GetOptionalInputTensor(context, node, kProjectionWeightsTensor); - TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); + TfLiteTensor* output_state = + GetVariableInput(context, node, kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); // Since we have already checked that weights are all there or none, we can // check the existence of only one to get the condition. @@ -187,7 +187,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( float layer_norm_forget_scale = default_scale; float layer_norm_cell_scale = default_scale; float layer_norm_output_scale = default_scale; - float activation_scale = default_scale; + float output_state_scale = default_scale; int cell_scale = 1; // Effective scales. @@ -231,7 +231,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( if (use_projection) { proj_weight_scale = projection_weights->params.scale; } - activation_scale = activation_state->params.scale; + output_state_scale = output_state->params.scale; input_to_forget_weight_scale = input_to_forget_weights->params.scale; input_to_cell_weight_scale = input_to_cell_weights->params.scale; @@ -240,12 +240,8 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale; recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale; - // Get cell state. 
- TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); - TF_LITE_ENSURE(context, cell_state != nullptr); + // Check cell state (already used above) TF_LITE_ENSURE(context, CheckedLog2(cell_state->params.scale, &cell_scale)); - TF_LITE_ENSURE(context, cell_scale <= -9); integer_lstm_param->cell_scale = cell_scale; input_scale = input->params.scale; @@ -255,31 +251,32 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( effective_input_to_input_scale = input_to_input_weight_scale * input_scale / intermediate_scale[0]; effective_recurrent_to_input_scale = recurrent_to_input_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[0]; } effective_input_to_forget_scale = input_to_forget_weight_scale * input_scale / intermediate_scale[1]; effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[1]; effective_input_to_cell_scale = input_to_cell_weight_scale * input_scale / intermediate_scale[2]; - effective_recurrent_to_cell_scale = - recurrent_to_cell_weight_scale * activation_scale / intermediate_scale[2]; + effective_recurrent_to_cell_scale = recurrent_to_cell_weight_scale * + output_state_scale / + intermediate_scale[2]; effective_input_to_output_scale = input_to_output_weight_scale * input_scale / intermediate_scale[3]; effective_recurrent_to_output_scale = recurrent_to_output_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[3]; effective_hidden_scale = std::pow(2, -15) / intermediate_scale[4] * std::pow(2, -15); effective_proj_scale = - proj_weight_scale * intermediate_scale[4] / activation_scale; + proj_weight_scale * intermediate_scale[4] / output_state_scale; if (use_peephole) { if (!use_cifg) { @@ -419,11 +416,10 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( const TfLiteTensor* projection_bias = GetOptionalInputTensor(context, node, kProjectionBiasTensor); - TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); - TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); + TfLiteTensor* output_state = + GetVariableInput(context, node, kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TfLiteTensor* cell_state = GetVariableInput(context, node, kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); // Since we have already checked that weights are all there or none, we can @@ -456,7 +452,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( int32_t* output_bias_ptr = nullptr; int32_t* proj_bias_ptr = nullptr; int16_t* cell_ptr = nullptr; - int8_t* activation_ptr = nullptr; + int8_t* output_state_ptr = nullptr; // Scales. const float default_scale = 1.0; @@ -477,7 +473,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( float layer_norm_forget_scale = default_scale; float layer_norm_cell_scale = default_scale; float layer_norm_output_scale = default_scale; - float activation_scale = default_scale; + float output_state_scale = default_scale; // Effective scales. float effective_input_to_input_scale = default_scale; @@ -495,7 +491,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( // Zero points int input_zp = 0; - int activation_zp = 0; + int output_state_zp = 0; // Populate all the values. 
if (!use_cifg) { @@ -537,7 +533,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( proj_bias_ptr = projection_bias->data.i32; } } - activation_scale = activation_state->params.scale; + output_state_scale = output_state->params.scale; input_to_forget_weight_ptr = input_to_forget_weights->data.int8; input_to_forget_weight_scale = input_to_forget_weights->params.scale; @@ -554,11 +550,11 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( forget_bias_ptr = forget_gate_bias->data.i32; cell_bias_ptr = cell_bias->data.i32; output_bias_ptr = output_gate_bias->data.i32; - activation_ptr = activation_state->data.int8; + output_state_ptr = output_state->data.int8; cell_ptr = cell_state->data.i16; input_scale = input->params.scale; input_zp = input->params.zero_point; - activation_zp = activation_state->params.zero_point; + output_state_zp = output_state->params.zero_point; std::vector intermediate_scale; for (int i = 0; i < 12; ++i) { @@ -575,27 +571,28 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( effective_input_to_input_scale = input_to_input_weight_scale * input_scale / intermediate_scale[1]; effective_recurrent_to_input_scale = recurrent_to_input_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[2]; } effective_input_to_forget_scale = input_to_forget_weight_scale * input_scale / intermediate_scale[4]; effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[5]; effective_input_to_cell_scale = input_to_cell_weight_scale * input_scale / intermediate_scale[7]; - effective_recurrent_to_cell_scale = - recurrent_to_cell_weight_scale * activation_scale / intermediate_scale[8]; + effective_recurrent_to_cell_scale = recurrent_to_cell_weight_scale * + output_state_scale / + intermediate_scale[8]; effective_input_to_output_scale = input_to_output_weight_scale * input_scale / intermediate_scale[10]; effective_recurrent_to_output_scale = recurrent_to_output_weight_scale * - activation_scale / + output_state_scale / intermediate_scale[11]; effective_proj_scale = - proj_weight_scale * std::pow(2, -15) / activation_scale; + proj_weight_scale * std::pow(2, -15) / output_state_scale; if (use_peephole) { if (!use_cifg) { @@ -698,18 +695,16 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( const float cell_clip = params->cell_clip; const float proj_clip = params->proj_clip; - const TfLiteTensor* cell_tensor = - GetInput(context, node, kInputCellStateTensor); const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor); - auto* cell_params = reinterpret_cast( - cell_tensor->quantization.params); + auto* cell_state_params = reinterpret_cast( + cell_state->quantization.params); auto* proj_params = reinterpret_cast( output_tensor->quantization.params); - TF_LITE_ENSURE_EQ(context, cell_params->scale->data[0], 1.0 / 32768); + TF_LITE_ENSURE_EQ(context, cell_state_params->scale->data[0], 1.0 / 32768); if (cell_clip > 0.0 && cell_clip < 1.0) { integer_lstm_param->quantized_cell_clip = - static_cast(cell_clip / cell_params->scale->data[0]); + static_cast(cell_clip / cell_state_params->scale->data[0]); } else { integer_lstm_param->quantized_cell_clip = 0; } @@ -1026,12 +1021,12 @@ TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias(TfLiteContext* context, OpData* op_data, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); - TF_LITE_ENSURE(context, 
activation_state != nullptr); + const TfLiteTensor* output_state = + GetVariableInput(context, node, kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); const int32_t input_zero_point = -input->params.zero_point; - const int32_t activation_zero_point = -activation_state->params.zero_point; + const int32_t output_state_zero_point = -output_state->params.zero_point; const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); @@ -1083,8 +1078,8 @@ TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias(TfLiteContext* context, TF_LITE_ENSURE_OK( context, PrecomputeZeroPointTimesWeightWithBias( - context, activation_zero_point, recurrent_to_forget_weights, nullptr, - &(integer_lstm_params->recurrent_to_forget_effective_bias))); + context, output_state_zero_point, recurrent_to_forget_weights, + nullptr, &(integer_lstm_params->recurrent_to_forget_effective_bias))); // Modulation gate. const TfLiteTensor* cell_gate_bias = @@ -1097,7 +1092,7 @@ TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias(TfLiteContext* context, TF_LITE_ENSURE_OK( context, PrecomputeZeroPointTimesWeightWithBias( - context, activation_zero_point, recurrent_to_cell_weights, nullptr, + context, output_state_zero_point, recurrent_to_cell_weights, nullptr, &(integer_lstm_params->recurrent_to_cell_effective_bias))); // Output gate. @@ -1112,8 +1107,8 @@ TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias(TfLiteContext* context, TF_LITE_ENSURE_OK( context, PrecomputeZeroPointTimesWeightWithBias( - context, activation_zero_point, recurrent_to_output_weights, nullptr, - &(integer_lstm_params->recurrent_to_output_effective_bias))); + context, output_state_zero_point, recurrent_to_output_weights, + nullptr, &(integer_lstm_params->recurrent_to_output_effective_bias))); // Input gate. The calculation is only meaningful for non-cifg case. const TfLiteTensor* input_gate_bias = @@ -1126,7 +1121,7 @@ TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias(TfLiteContext* context, TF_LITE_ENSURE_OK( context, PrecomputeZeroPointTimesWeightWithBias( - context, activation_zero_point, recurrent_to_input_weights, nullptr, + context, output_state_zero_point, recurrent_to_input_weights, nullptr, &(integer_lstm_params->recurrent_to_input_effective_bias))); // Projection bias. The calculation is only meaningful for with projection. @@ -1198,20 +1193,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { context, CheckInputTensorDimensions(context, node, n_input, n_output, n_cell, use_layer_norm, is_integer)); - // Get the pointer to output, activation_state and cell_state tensors. + // Get the pointer to output, output_state and cell_state tensors. TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); - TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); + TfLiteTensor* output_state = + GetVariableInput(context, node, kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TfLiteTensor* cell_state = GetVariableInput(context, node, kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); // Check the shape of input state tensors. // These tensor may be 1D or 2D. It's fine as long as the total size is // correct. 
- TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(output_state), n_batch * n_output); TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); // Resize the output tensors. @@ -1275,7 +1269,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (is_hybrid_op) { op_data->compute_row_sums = true; // Allocate temporary tensors to store quantized values of input, - // activation_state and cell_state tensors. + // output_state and cell_state tensors. node->temporaries->data[1] = op_data->scratch_tensor_index + 1; TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); input_quantized->type = input_to_output_weights->type; @@ -1286,17 +1280,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { input_quantized_size)); } node->temporaries->data[2] = op_data->scratch_tensor_index + 2; - TfLiteTensor* activation_state_quantized = + TfLiteTensor* output_state_quantized = GetTemporary(context, node, /*index=*/2); - activation_state_quantized->type = input_to_output_weights->type; - activation_state_quantized->allocation_type = kTfLiteArenaRw; - if (!TfLiteIntArrayEqual(activation_state_quantized->dims, - activation_state->dims)) { - TfLiteIntArray* activation_state_quantized_size = - TfLiteIntArrayCopy(activation_state->dims); - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, activation_state_quantized, - activation_state_quantized_size)); + output_state_quantized->type = input_to_output_weights->type; + output_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(output_state_quantized->dims, + output_state->dims)) { + TfLiteIntArray* output_state_quantized_size = + TfLiteIntArrayCopy(output_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output_state_quantized, + output_state_quantized_size)); } node->temporaries->data[3] = op_data->scratch_tensor_index + 3; TfLiteTensor* cell_state_quantized = @@ -1540,11 +1534,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* projection_bias = GetOptionalInputTensor(context, node, kProjectionBiasTensor); - TfLiteTensor* activation_state = - GetVariableInput(context, node, kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); - TfLiteTensor* cell_state = - GetVariableInput(context, node, kInputCellStateTensor); + TfLiteTensor* output_state = + GetVariableInput(context, node, kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TfLiteTensor* cell_state = GetVariableInput(context, node, kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); @@ -1569,7 +1562,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { forget_gate_bias, cell_bias, output_gate_bias, projection_weights, projection_bias, params, /*forward_sequence=*/true, /*time_major=*/true, - /*output_offset=*/0, scratch_buffer, activation_state, cell_state, + /*output_offset=*/0, scratch_buffer, output_state, cell_state, output); } case kTfLiteUInt8: @@ -1580,7 +1573,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); - TfLiteTensor* activation_state_quantized = + TfLiteTensor* output_state_quantized = GetTemporary(context, node, /*index=*/2); TfLiteTensor* 
cell_state_quantized = GetTemporary(context, node, /*index=*/3); @@ -1614,8 +1607,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { /*time_major=*/true, /*output_offset=*/0, scratch_buffer, scaling_factors, prod_scaling_factors, recovered_cell_weights, input_quantized, - /*aux_input_quantized=*/nullptr, activation_state_quantized, - cell_state_quantized, activation_state, cell_state, + /*aux_input_quantized=*/nullptr, output_state_quantized, + cell_state_quantized, output_state, cell_state, output_scratch_buffer, output, zero_points, row_sums, row_sums_size, &op_data->compute_row_sums, CpuBackendContext::GetFromContext(context)); @@ -1638,9 +1631,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { forget_layer_norm_coefficients, cell_layer_norm_coefficients, output_layer_norm_coefficients, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, projection_weights, projection_bias, - params, &op_data->integer_lstm_param, activation_state, - cell_state, output, scratch0, scratch1, scratch2, scratch3, - scratch4, scratch5, CpuBackendContext::GetFromContext(context)); + params, &op_data->integer_lstm_param, output_state, cell_state, + output, scratch0, scratch1, scratch2, scratch3, scratch4, + scratch5, CpuBackendContext::GetFromContext(context)); } else { TfLiteTensor* scratch0 = GetTemporary(context, node, /*index=*/0); TfLiteTensor* scratch1 = GetTemporary(context, node, /*index=*/1); @@ -1660,7 +1653,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { forget_layer_norm_coefficients, cell_layer_norm_coefficients, output_layer_norm_coefficients, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, projection_weights, projection_bias, - params, activation_state, cell_state, output, + params, output_state, cell_state, output, &op_data->integer_lstm_param, scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7); return kTfLiteOk; diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index b4d43414d89..65f68b34251 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -900,7 +900,7 @@ inline void LstmStepHybrid( // Fully quantized lstm kernel for 16 bit gate matmul output. // -// Input activation of size n_batch * n_input: +// Input tensor of size n_batch * n_input: // input_ptr // // LSTM weights: @@ -972,7 +972,7 @@ inline void LstmStepHybrid( // cell_scale: the power of two scale for cell state. // // Zero points: -// activation_zp: zero point of activation +// output_state_zp: zero point of output state // hidden_zp: zero point for hidden state. // // Temporary pre-allocated storage for the calculation. 
Each is of size n_cell * @@ -1048,8 +1048,8 @@ inline void LstmStepInteger( const int32_t* input_to_input_effective_bias, const int32_t* recurrent_to_input_effective_bias, const int32_t* projection_effective_bias, int32 n_batch, int32 n_cell, - int32 n_input, int32 n_output, int8_t* activation_ptr, - int32_t activation_zp, int16_t* cell_ptr, int8_t* output_ptr, + int32 n_input, int32 n_output, int8_t* output_state_ptr, + int32_t output_state_zp, int16_t* cell_ptr, int8_t* output_ptr, int16_t* scratch_0_ptr, int16_t* scratch_1_ptr, int16_t* scratch_2_ptr, int16_t* scratch_3_ptr, int8_t* scratch_4_ptr, int32_t* scratch_5_ptr, CpuBackendContext* context) { @@ -1088,7 +1088,7 @@ inline void LstmStepInteger( n_batch, n_input, n_cell, 0, scratch_5_ptr, scratch_1_ptr, context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( - activation_ptr, recurrent_to_forget_effective_bias, + output_state_ptr, recurrent_to_forget_effective_bias, recurrent_to_forget_weight_ptr, effective_recurrent_to_forget_scale_a, effective_recurrent_to_forget_scale_b, n_batch, n_output, n_cell, 0, scratch_5_ptr, scratch_1_ptr, context); @@ -1115,7 +1115,7 @@ inline void LstmStepInteger( n_input, n_cell, 0, scratch_5_ptr, scratch_2_ptr, context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( - activation_ptr, recurrent_to_cell_effective_bias, + output_state_ptr, recurrent_to_cell_effective_bias, recurrent_to_cell_weight_ptr, effective_recurrent_to_cell_scale_a, effective_recurrent_to_cell_scale_b, n_batch, n_output, n_cell, 0, scratch_5_ptr, scratch_2_ptr, context); @@ -1139,7 +1139,7 @@ inline void LstmStepInteger( n_batch, n_input, n_cell, 0, scratch_5_ptr, scratch_0_ptr, context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( - activation_ptr, recurrent_to_input_effective_bias, + output_state_ptr, recurrent_to_input_effective_bias, recurrent_to_input_weight_ptr, effective_recurrent_to_input_scale_a, effective_recurrent_to_input_scale_b, n_batch, n_output, n_cell, 0, scratch_5_ptr, scratch_0_ptr, context); @@ -1180,7 +1180,7 @@ inline void LstmStepInteger( n_batch, n_input, n_cell, 0, scratch_5_ptr, scratch_3_ptr, context); tensor_utils::MatrixBatchVectorMultiplyAccumulate( - activation_ptr, recurrent_to_output_effective_bias, + output_state_ptr, recurrent_to_output_effective_bias, recurrent_to_output_weight_ptr, effective_recurrent_to_output_scale_a, effective_recurrent_to_output_scale_b, n_batch, n_output, n_cell, 0, scratch_5_ptr, scratch_3_ptr, context); @@ -1213,7 +1213,7 @@ inline void LstmStepInteger( tensor_utils::MatrixBatchVectorMultiplyAccumulate( scratch_4_ptr, projection_effective_bias, proj_weight_ptr, effective_proj_scale_a, effective_proj_scale_b, n_batch, n_cell, - n_output, activation_zp, scratch_5_ptr, output_ptr, context); + n_output, output_state_zp, scratch_5_ptr, output_ptr, context); if (quantized_proj_clip > 0) { tensor_utils::CwiseClipping(output_ptr, quantized_proj_clip, n_batch, n_output); @@ -1221,12 +1221,12 @@ inline void LstmStepInteger( } else { std::copy_n(scratch_4_ptr, n_batch * n_output, output_ptr); } - std::copy_n(output_ptr, n_batch * n_output, activation_ptr); + std::copy_n(output_ptr, n_batch * n_output, output_state_ptr); } // Fully quantized lstm kernel for 8 bit gate matmul output. // -// Input activation of size n_batch * n_input: +// Input tensor of size n_batch * n_input: // input_ptr // // LSTM weights: @@ -1298,7 +1298,7 @@ inline void LstmStepInteger( // cell_scale: the power of two scale for cell state. 
// // Zero points: -// activation_zp: zero point of activation +// output_state_zp: zero point of output state. // hidden_zp: zero point for hidden state. // // Temporary pre-allocated storage for the calculation. Each is of size n_cell * @@ -1367,8 +1367,8 @@ void LstmStepInteger( const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b, const int32_t* intermediate_zp, int32 quantized_cell_clip, int32 quantized_proj_clip, int32 n_batch, int32 n_cell, int32 n_input, - int32 n_output, int32 output_batch_leading_dim, int8_t* activation_ptr, - int32_t activation_zp, int16_t* cell_ptr, int8_t* output_ptr, + int32 n_output, int32 output_batch_leading_dim, int8_t* output_state_ptr, + int32_t output_state_zp, int16_t* cell_ptr, int8_t* output_ptr, int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, int16_t* scratch4, int16_t* scratch5, int16_t* scratch6, int16_t* scratch7) { @@ -1381,7 +1381,7 @@ void LstmStepInteger( n_batch, n_input, n_cell, scratch0, intermediate_zp[4]); tensor_utils::MatrixBatchVectorMultiply( - activation_ptr, activation_zp, recurrent_to_forget_weight_ptr, + output_state_ptr, output_state_zp, recurrent_to_forget_weight_ptr, effective_recurrent_to_forget_scale_a, effective_recurrent_to_forget_scale_b, n_batch, n_output, n_cell, scratch1, intermediate_zp[5]); @@ -1408,7 +1408,7 @@ void LstmStepInteger( n_input, n_cell, scratch0, intermediate_zp[7]); tensor_utils::MatrixBatchVectorMultiply( - activation_ptr, activation_zp, recurrent_to_cell_weight_ptr, + output_state_ptr, output_state_zp, recurrent_to_cell_weight_ptr, effective_recurrent_to_cell_scale_a, effective_recurrent_to_cell_scale_b, n_batch, n_output, n_cell, scratch1, intermediate_zp[8]); @@ -1434,7 +1434,7 @@ void LstmStepInteger( n_batch, n_input, n_cell, scratch0, intermediate_zp[10]); tensor_utils::MatrixBatchVectorMultiply( - activation_ptr, activation_zp, recurrent_to_output_weight_ptr, + output_state_ptr, output_state_zp, recurrent_to_output_weight_ptr, effective_recurrent_to_output_scale_a, effective_recurrent_to_output_scale_b, n_batch, n_output, n_cell, scratch1, intermediate_zp[11]); @@ -1478,7 +1478,7 @@ void LstmStepInteger( // Projection. tensor_utils::MatrixBatchVectorMultiply( scratch3, proj_weight_ptr, effective_proj_scale_a, effective_proj_scale_b, - proj_bias_ptr, n_batch, n_cell, n_output, activation_zp, output_ptr); + proj_bias_ptr, n_batch, n_cell, n_output, output_state_zp, output_ptr); // Projection clipping. if (quantized_proj_clip > 0) { @@ -1486,8 +1486,8 @@ void LstmStepInteger( n_output); } - // Copy output to activation. - std::copy_n(output_ptr, n_batch * n_output, activation_ptr); + // Copy output to output state. 
+ std::copy_n(output_ptr, n_batch * n_output, output_state_ptr); } } // namespace @@ -1518,9 +1518,8 @@ TfLiteStatus EvalFloat( const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, const TfLiteLSTMParams* params, bool forward_sequence, bool time_major, - int output_offset, TfLiteTensor* scratch_buffer, - TfLiteTensor* activation_state, TfLiteTensor* cell_state, - TfLiteTensor* output) { + int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state, + TfLiteTensor* cell_state, TfLiteTensor* output) { TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); int max_time, n_batch; if (input->dims->size == 3) { @@ -1604,10 +1603,9 @@ TfLiteStatus EvalFloat( GetTensorData(projection_weights), GetTensorData(projection_bias), params, n_batch, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, - GetTensorData(activation_state), - GetTensorData(cell_state), input_gate_scratch, - forget_gate_scratch, cell_gate_scratch, output_gate_scratch, - output_ptr); + GetTensorData(output_state), GetTensorData(cell_state), + input_gate_scratch, forget_gate_scratch, cell_gate_scratch, + output_gate_scratch, output_ptr); } } else { for (int b = 0; b < n_batch; b++) { @@ -1628,9 +1626,9 @@ TfLiteStatus EvalFloat( float* output_ptr = GetTensorData(output) + time_offset * output_step + output_offset; - // Offset the {activation,cell}_state pointers to the right batch. - float* activation_state_ptr = GetTensorData(activation_state) + - b * output_batch_leading_dim; + // Offset the {output,cell}_state pointers to the right batch. + float* output_state_ptr = + GetTensorData(output_state) + b * output_batch_leading_dim; float* cell_state_ptr = GetTensorData(cell_state) + b * n_cell; // Offset the scratch pointers to the right batch. float* input_gate_scratch_ptr = @@ -1666,7 +1664,7 @@ TfLiteStatus EvalFloat( GetTensorData(projection_weights), GetTensorData(projection_bias), params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, - activation_state_ptr, cell_state_ptr, input_gate_scratch_ptr, + output_state_ptr, cell_state_ptr, input_gate_scratch_ptr, forget_gate_scratch_ptr, cell_gate_scratch_ptr, output_gate_scratch_ptr, output_ptr); } @@ -1939,10 +1937,10 @@ TfLiteStatus EvalInteger8x8_16( const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, const TfLiteLSTMParams* params, const lstm_eval::IntegerLstmParameter* integer_lstm_param, - TfLiteTensor* activation_state, TfLiteTensor* cell_state, - TfLiteTensor* output, TfLiteTensor* scratch0, TfLiteTensor* scratch1, - TfLiteTensor* scratch2, TfLiteTensor* scratch3, TfLiteTensor* scratch4, - TfLiteTensor* scratch5, CpuBackendContext* context) { + TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output, + TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2, + TfLiteTensor* scratch3, TfLiteTensor* scratch4, TfLiteTensor* scratch5, + CpuBackendContext* context) { TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); const int n_input = input->dims->data[input->dims->size - 1]; int max_time, n_batch; @@ -1959,7 +1957,7 @@ TfLiteStatus EvalInteger8x8_16( const int n_output = recurrent_to_output_weights->dims->data[1]; // Activation zero point - int activation_zp = activation_state->params.zero_point; + int output_state_zp = output_state->params.zero_point; // Get params for time/batch/sequence. 
const int output_batch_leading_dim = @@ -2042,8 +2040,8 @@ TfLiteStatus EvalInteger8x8_16( integer_lstm_param->input_to_input_effective_bias.get(), integer_lstm_param->recurrent_to_input_effective_bias.get(), integer_lstm_param->projection_effective_bias.get(), n_batch, n_cell, - n_input, n_output, GetTensorData(activation_state), - activation_zp, GetTensorData(cell_state), output_ptr, + n_input, n_output, GetTensorData(output_state), output_state_zp, + GetTensorData(cell_state), output_ptr, GetTensorData(scratch0), GetTensorData(scratch1), GetTensorData(scratch2), GetTensorData(scratch3), GetTensorData(scratch4), GetTensorData(scratch5), @@ -2072,7 +2070,7 @@ TfLiteStatus EvalInteger8x8_8( const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, - const TfLiteLSTMParams* params, TfLiteTensor* activation_state, + const TfLiteLSTMParams* params, TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output, const lstm_eval::IntegerLstmParameter* integer_lstm_param, TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2, @@ -2131,11 +2129,11 @@ TfLiteStatus EvalInteger8x8_8( const int32_t* output_bias_ptr = GetTensorData(output_gate_bias); const int32_t* proj_bias_ptr = GetTensorData(projection_bias); int16_t* cell_ptr = GetTensorData(cell_state); - int8_t* activation_ptr = GetTensorData(activation_state); + int8_t* output_state_ptr = GetTensorData(output_state); int8_t* output_ptr = nullptr; const int32 input_zp = input->params.zero_point; - const int32 activation_zp = activation_state->params.zero_point; + const int32 output_state_zp = output_state->params.zero_point; // Get params for time/batch/sequence. 
const int output_batch_leading_dim = @@ -2222,7 +2220,7 @@ TfLiteStatus EvalInteger8x8_8( integer_lstm_param->intermediate_zp, integer_lstm_param->quantized_cell_clip, integer_lstm_param->quantized_proj_clip, n_batch, n_cell, n_input, - n_output, output_batch_leading_dim, activation_ptr, activation_zp, + n_output, output_batch_leading_dim, output_state_ptr, output_state_zp, cell_ptr, output_ptr, GetTensorData(scratch0), GetTensorData(scratch1), GetTensorData(scratch2), GetTensorData(scratch3), GetTensorData(scratch4), diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h index 91f47b18df6..3c9b4bccf42 100644 --- a/tensorflow/lite/kernels/lstm_eval.h +++ b/tensorflow/lite/kernels/lstm_eval.h @@ -120,9 +120,8 @@ TfLiteStatus EvalFloat( const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, const TfLiteLSTMParams* params, bool forward_sequence, bool time_major, - int output_offset, TfLiteTensor* scratch_buffer, - TfLiteTensor* activation_state, TfLiteTensor* cell_state, - TfLiteTensor* output); + int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state, + TfLiteTensor* cell_state, TfLiteTensor* output); TfLiteStatus EvalHybrid( const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, @@ -179,10 +178,10 @@ TfLiteStatus EvalInteger8x8_16( const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, const TfLiteLSTMParams* params, const lstm_eval::IntegerLstmParameter* integer_lstm_param, - TfLiteTensor* activation_state, TfLiteTensor* cell_state, - TfLiteTensor* output, TfLiteTensor* scratch0, TfLiteTensor* scratch1, - TfLiteTensor* scratch2, TfLiteTensor* scratch3, TfLiteTensor* scratch4, - TfLiteTensor* scratch5, CpuBackendContext* context); + TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output, + TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2, + TfLiteTensor* scratch3, TfLiteTensor* scratch4, TfLiteTensor* scratch5, + CpuBackendContext* context); TfLiteStatus EvalInteger8x8_8( const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, @@ -203,7 +202,7 @@ TfLiteStatus EvalInteger8x8_8( const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, - const TfLiteLSTMParams* params, TfLiteTensor* activation_state, + const TfLiteLSTMParams* params, TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output, const lstm_eval::IntegerLstmParameter* integer_lstm_param, TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2, diff --git a/tensorflow/lite/kernels/lstm_shared.h b/tensorflow/lite/kernels/lstm_shared.h index 9e29650a3d8..0907be9094b 100644 --- a/tensorflow/lite/kernels/lstm_shared.h +++ b/tensorflow/lite/kernels/lstm_shared.h @@ -57,8 +57,8 @@ constexpr int kProjectionBiasTensor = 17; // Optional // These state tensors are defined as variable tensors, and will be modified by // this op. -constexpr int kInputActivationStateTensor = 18; -constexpr int kInputCellStateTensor = 19; +constexpr int kOutputStateTensor = 18; +constexpr int kCellStateTensor = 19; // Layer norm coefficient tensors of size {n_cell}, representing a diagonal // matrix. 
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc index ba5ee6508cc..f8594f9adf0 100644 --- a/tensorflow/lite/kernels/lstm_test.cc +++ b/tensorflow/lite/kernels/lstm_test.cc @@ -104,10 +104,10 @@ class LSTMOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - // Adding the 2 input state tensors. - input_activation_state_ = + // Adding the 2 state tensors. + output_state_ = AddInput(TensorData{TensorType_FLOAT32, {n_batch_, n_output_}}, true); - input_cell_state_ = + cell_state_ = AddInput(TensorData{TensorType_FLOAT32, {n_batch_, n_cell_}}, true); // Layer norm weights. @@ -266,13 +266,11 @@ class LSTMOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; - int input_activation_state_; - int input_cell_state_; - - int output_; int output_state_; int cell_state_; + int output_; + int n_batch_; int n_input_; int n_cell_; @@ -553,7 +551,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingOmittedLayerNormLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -1697,7 +1695,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {n_cell}, // input_layer_norm_coefficient tensor @@ -1768,7 +1766,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {n_cell}, // input_layer_norm_coefficient tensor @@ -1841,7 +1839,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLayerNormLstmInt8Test, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {n_cell}, // input_layer_norm_coefficient tensor @@ -1955,7 +1953,7 @@ TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -2026,7 +2024,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -2098,7 +2096,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmInt8Test, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -2216,13 +2214,13 @@ class LSTMIntegerOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - // Adding the 2 input state tensors. 
- input_activation_state_ = AddInput({TensorType_INT16, input_shapes[18], - ranges[18].first, ranges[18].second}, - true); - input_cell_state_ = AddInput({TensorType_INT16, input_shapes[19], - ranges[19].first, ranges[19].second}, - true); + // Adding the 2 state tensors. + output_state_ = AddInput({TensorType_INT16, input_shapes[18], + ranges[18].first, ranges[18].second}, + true); + cell_state_ = AddInput({TensorType_INT16, input_shapes[19], + ranges[19].first, ranges[19].second}, + true); // Layer norm weights. if (use_layer_norm) { @@ -2386,8 +2384,6 @@ class LSTMIntegerOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; - int input_activation_state_; - int input_cell_state_; int intermediates_[5]; @@ -2483,7 +2479,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionNoPeephole) { {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {n_cell}, // input_layer_norm_coefficient tensor @@ -2517,14 +2513,14 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionNoPeephole) { {-0.5, 0.5}, // projection_weight tensor {-1, 1}, // projection_bias tensor - {-1.0, 32767.0 / 32768}, // activation_state tensor + {-1.0, 32767.0 / 32768}, // output_state tensor {-1, 1}, // cell_state tensor {-1.00001, 1.0}, // input_layer_norm_coefficient tensor {-1.00001, 1.0}, // forget_layer_norm_coefficient tensor {-1.00001, 1.0}, // cell_layer_norm_coefficient tensor {-1.00001, 1.0}, // output_layer_norm_coefficient tensor - // Output scale is the same as input activation scale and only activation + // Output scale is the same as output_state scale and only output_state // scale is used in the op, so this is only provided for clarity. {-1.0, 32767.0 / 32768}, // output tensor. }; @@ -2685,7 +2681,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionYesPeephole) { {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {n_cell}, // input_layer_norm_coefficient tensor @@ -2719,14 +2715,14 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionYesPeephole) { {-0.5, 0.5}, // projection_weight tensor {-1, 1}, // projection_bias tensor - {-1.0, 32767.0 / 32768}, // activation_state tensor + {-1.0, 32767.0 / 32768}, // output_state tensor {-1, 1}, // cell_state tensor {-0.5, 0.5}, // input_layer_norm_coefficient tensor {-0.5, 0.5}, // forget_layer_norm_coefficient tensor {-1.0, 1.0}, // cell_layer_norm_coefficient tensor {-1.0, 1.0}, // output_layer_norm_coefficient tensor - // Output scale is the same as input activation scale and only activation + // Output scale is the same as output_state scale and only output_state // scale is used in the op, so this is only provided for clarity. {-1.0, 32767.0 / 32768}, // output tensor. }; @@ -2892,13 +2888,13 @@ class LSTMIntegerOpModel8x8_8 : public SingleOpModel { projection_bias_ = AddNullInput(); } - // Adding the 2 input state tensors. - input_activation_state_ = AddInput({TensorType_INT16, input_shapes[18], - ranges[18].first, ranges[18].second}, - true); - input_cell_state_ = AddInput({TensorType_INT16, input_shapes[19], - ranges[19].first, ranges[19].second}, - true); + // Adding the 2 state tensors. 
+ output_state_ = AddInput({TensorType_INT16, input_shapes[18], + ranges[18].first, ranges[18].second}, + true); + cell_state_ = AddInput({TensorType_INT16, input_shapes[19], + ranges[19].first, ranges[19].second}, + true); // Layer norm weights. if (use_layer_norm) { @@ -3062,8 +3058,6 @@ class LSTMIntegerOpModel8x8_8 : public SingleOpModel { int projection_weights_; int projection_bias_; - int input_activation_state_; - int input_cell_state_; int intermediates_[12]; @@ -3160,7 +3154,7 @@ TEST(LSTMIntegerOpModel8x8_8, CifgYesLayerNormNoYesProjectionNoPeephole) { {n_output, n_cell}, // projection_weight tensor {n_output}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -3194,14 +3188,14 @@ TEST(LSTMIntegerOpModel8x8_8, CifgYesLayerNormNoYesProjectionNoPeephole) { {-0.5, 0.5}, // projection_weight tensor {-1, 1}, // projection_bias tensor - {-1.0, 32767.0 / 32768}, // activation_state tensor + {-1.0, 32767.0 / 32768}, // output_state tensor {-1.0, 32767.0 / 32768}, // cell_state tensor {-1.00001, 1.0}, // input_layer_norm_coefficient tensor {-1.00001, 1.0}, // forget_layer_norm_coefficient tensor {-1.00001, 1.0}, // cell_layer_norm_coefficient tensor {-1.00001, 1.0}, // output_layer_norm_coefficient tensor - // Output scale is the same as input activation scale and only activation + // Output scale is the same as output_state scale and only output_state // scale is used in the op, so this is only provided for clarity. {-1.0, 32767.0 / 32768}, // output tensor. }; diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc index 95864196f18..f1c0f9d42a6 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc @@ -317,20 +317,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { CheckInputTensorDimensions(context, node, n_input, n_output, n_cell, is_layer_norm_lstm)); - // Get the pointer to output, activation_state and cell_state buffer tensors. + // Get the pointer to output, output_state and cell_state buffer tensors. TfLiteTensor* output = GetOutput(context, node, lstm::full::kOutputTensor); - TfLiteTensor* activation_state = - GetVariableInput(context, node, lstm::full::kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); + TfLiteTensor* output_state = + GetVariableInput(context, node, lstm::full::kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); TfLiteTensor* cell_state = - GetVariableInput(context, node, lstm::full::kInputCellStateTensor); + GetVariableInput(context, node, lstm::full::kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); // Check the shape of input state tensors. // These tensor may be 1D or 2D. It's fine as long as the total size is // correct. - TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(output_state), n_batch * n_output); TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); // Resize the output tensors. @@ -370,7 +370,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (IsHybridOp(input, input_to_output_weights)) { op_data->compute_row_sums = true; // Allocate temporary tensors to store quantized values of input, - // activation_state and cell_state tensors. 
+ // output_state and cell_state tensors. node->temporaries->data[kInputQuantized] = scratch_tensor_index + kInputQuantized; TfLiteTensor* input_quantized = @@ -384,17 +384,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } node->temporaries->data[kOutputStateQuantized] = scratch_tensor_index + kOutputStateQuantized; - TfLiteTensor* activation_state_quantized = + TfLiteTensor* output_state_quantized = GetTemporary(context, node, kOutputStateQuantized); - activation_state_quantized->type = input_to_output_weights->type; - activation_state_quantized->allocation_type = kTfLiteArenaRw; - if (!TfLiteIntArrayEqual(activation_state_quantized->dims, - activation_state->dims)) { - TfLiteIntArray* activation_state_quantized_size = - TfLiteIntArrayCopy(activation_state->dims); - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, activation_state_quantized, - activation_state_quantized_size)); + output_state_quantized->type = input_to_output_weights->type; + output_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(output_state_quantized->dims, + output_state->dims)) { + TfLiteIntArray* output_state_quantized_size = + TfLiteIntArrayCopy(output_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output_state_quantized, + output_state_quantized_size)); } node->temporaries->data[kCellStateQuantized] = scratch_tensor_index + kCellStateQuantized; @@ -559,11 +559,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Index the scratch buffers pointers to the global scratch buffer. TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); - TfLiteTensor* activation_state = - GetVariableInput(context, node, lstm::full::kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); + TfLiteTensor* output_state = + GetVariableInput(context, node, lstm::full::kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); TfLiteTensor* cell_state = - GetVariableInput(context, node, lstm::full::kInputCellStateTensor); + GetVariableInput(context, node, lstm::full::kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); const TfLiteTensor* input_layer_norm_coefficients = @@ -613,14 +613,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { /*aux_input_to_output_weights=*/nullptr, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, projection_weights, projection_bias, &lstm_params, /*forward_sequence=*/true, time_major, - /*output_offset=*/0, scratch_buffer, activation_state, cell_state, + /*output_offset=*/0, scratch_buffer, output_state, cell_state, output); } case kTfLiteUInt8: case kTfLiteInt8: { OpData* op_data = reinterpret_cast(node->user_data); TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); - TfLiteTensor* activation_state_quantized = + TfLiteTensor* output_state_quantized = GetTemporary(context, node, /*index=*/2); TfLiteTensor* cell_state_quantized = GetTemporary(context, node, /*index=*/3); @@ -652,10 +652,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { projection_bias, &lstm_params, /*forward_sequence=*/true, time_major, /*output_offset=*/0, scratch_buffer, scaling_factors, prod_scaling_factors, recovered_cell_weights, input_quantized, - /*aux_input_quantized=*/nullptr, activation_state_quantized, - cell_state_quantized, activation_state, cell_state, accum_scratch, - output, zero_points, row_sums, row_sums_size, - &op_data->compute_row_sums, + /*aux_input_quantized=*/nullptr, 
output_state_quantized, + cell_state_quantized, output_state, cell_state, accum_scratch, output, + zero_points, row_sums, row_sums_size, &op_data->compute_row_sums, CpuBackendContext::GetFromContext(context)); } default: diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc index 43cc75f894b..ec20d76ae2e 100644 --- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc +++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc @@ -100,13 +100,12 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - // Adding the 2 input state tensors. - input_activation_state_ = + // Adding the 2 state tensors. + output_state_ = AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, /*is_variable=*/true); - input_cell_state_ = - AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, - /*is_variable=*/true); + cell_state_ = AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, + /*is_variable=*/true); // Layer norm weights. if (is_layer_norm) { @@ -256,8 +255,8 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; - int input_activation_state_; - int input_cell_state_; + int output_state_; + int cell_state_; int input_layer_norm_coefficients_; int forget_layer_norm_coefficients_; @@ -537,7 +536,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }); @@ -599,7 +598,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }); @@ -665,7 +664,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, TensorType_UINT8, GetParam()); @@ -728,7 +727,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, TensorType_INT8, GetParam()); @@ -840,7 +839,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }); @@ -901,7 +900,7 @@ TEST_P(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, TensorType_UINT8, GetParam()); @@ -964,7 +963,7 @@ TEST_P(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, 
TensorType_INT8, GetParam()); @@ -1626,7 +1625,7 @@ TEST_F(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }); @@ -1695,7 +1694,7 @@ TEST_P(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, TensorType_UINT8, GetParam()); @@ -1766,7 +1765,7 @@ TEST_P(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest, {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }, TensorType_INT8, GetParam()); @@ -2437,7 +2436,7 @@ TEST_F(NoCifgPeepholeProjectionAndBiasClippingUnidirectionalLstmTest, {n_output, n_cell}, // projection_weight tensor {n_output}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor }); @@ -2643,7 +2642,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLayerNormUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor @@ -2714,7 +2713,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest, {0, 0}, // projection_weight tensor {0}, // projection_bias tensor - {n_batch, n_output}, // activation_state tensor + {n_batch, n_output}, // output_state tensor {n_batch, n_cell}, // cell_state tensor {0}, // input_layer_norm_coefficient tensor diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc index 0d4c614511d..88ea7c1d591 100644 --- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc +++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc @@ -302,9 +302,8 @@ TfLiteStatus EvalFloat( const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, const TfLiteLSTMParams* params, bool forward_sequence, bool time_major, - int output_offset, TfLiteTensor* scratch_buffer, - TfLiteTensor* activation_state, TfLiteTensor* cell_state, - TfLiteTensor* output, Logger* logger, + int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state, + TfLiteTensor* cell_state, TfLiteTensor* output, Logger* logger, const std::vector& intermediate_tensor_indexes, ErrorReporter* error_reporter) { TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); @@ -390,10 +389,10 @@ TfLiteStatus EvalFloat( GetTensorData(projection_weights), GetTensorData(projection_bias), params, n_batch, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, - GetTensorData(activation_state), - GetTensorData(cell_state), input_gate_scratch, - forget_gate_scratch, cell_gate_scratch, output_gate_scratch, - output_ptr_time, logger, intermediate_tensor_indexes, error_reporter); + GetTensorData(output_state), GetTensorData(cell_state), + input_gate_scratch, 
forget_gate_scratch, cell_gate_scratch, + output_gate_scratch, output_ptr_time, logger, + intermediate_tensor_indexes, error_reporter); } } else { for (int b = 0; b < n_batch; b++) { @@ -414,9 +413,9 @@ TfLiteStatus EvalFloat( float* output_ptr = GetTensorData(output) + time_offset * output_step + output_offset; - // Offset the {activation,cell}_state pointers to the right batch. - float* activation_state_ptr = GetTensorData(activation_state) + - b * output_batch_leading_dim; + // Offset the {output,cell}_state pointers to the right batch. + float* output_state_ptr = + GetTensorData(output_state) + b * output_batch_leading_dim; float* cell_state_ptr = GetTensorData(cell_state) + b * n_cell; // Offset the scratch pointers to the right batch. float* input_gate_scratch_ptr = @@ -452,7 +451,7 @@ TfLiteStatus EvalFloat( GetTensorData(projection_weights), GetTensorData(projection_bias), params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim, - activation_state_ptr, cell_state_ptr, input_gate_scratch_ptr, + output_state_ptr, cell_state_ptr, input_gate_scratch_ptr, forget_gate_scratch_ptr, cell_gate_scratch_ptr, output_gate_scratch_ptr, output_ptr, logger, intermediate_tensor_indexes, error_reporter); @@ -541,11 +540,11 @@ TfLiteStatus lstm_eval(TfLiteContext* context, TfLiteNode* node, Logger* logger, // Index the scratch buffers pointers to the global scratch buffer. TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); - TfLiteTensor* activation_state = GetVariableInput( - context, node, ops::builtin::lstm::full::kInputActivationStateTensor); - TF_LITE_ENSURE(context, activation_state != nullptr); + TfLiteTensor* output_state = GetVariableInput( + context, node, ops::builtin::lstm::full::kOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); TfLiteTensor* cell_state = GetVariableInput( - context, node, ops::builtin::lstm::full::kInputCellStateTensor); + context, node, ops::builtin::lstm::full::kCellStateTensor); TF_LITE_ENSURE(context, cell_state != nullptr); TfLiteTensor* output = @@ -574,8 +573,8 @@ TfLiteStatus lstm_eval(TfLiteContext* context, TfLiteNode* node, Logger* logger, forget_gate_bias, cell_bias, output_gate_bias, projection_weights, projection_bias, params, /*forward_sequence=*/true, /*time_major=*/true, - /*output_offset=*/0, scratch_buffer, activation_state, cell_state, - output, logger, intermediate_tensor_indexes, error_reporter); + /*output_offset=*/0, scratch_buffer, output_state, cell_state, output, + logger, intermediate_tensor_indexes, error_reporter); } case kTfLiteUInt8: case kTfLiteInt8: From 23d482eaa2efe2bb38de7eb4f89539be9e3aa32a Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 17 Jun 2020 09:57:30 -0700 Subject: [PATCH 0406/1390] Add flag for using optimized TFLite CPU kernels on Android Add an experimental flag which allows opting in to a set of highly optimized floating point kernels provided via the XNNPACK delegate. This is offered as a preview, with the plan to enable these kernels by default in a future release. The flag can be enabled via: Interpreter.Options options = new Interpreter.Options().setUseXNNPACK(true); See tensorflow/lite/delegates/xnnpack/README.md for more details about these kernels and the associated delegate functionality. 
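For context, a minimal sketch of how an Android client might opt in to these kernels once this change lands. The model file name and the tensor shapes below are illustrative assumptions, not part of the patch:

```java
import java.io.File;
import org.tensorflow.lite.Interpreter;

public final class XnnpackUsageSketch {
  public static void main(String[] args) {
    // Opt in to the experimental XNNPACK-backed float CPU kernels.
    Interpreter.Options options =
        new Interpreter.Options().setUseXNNPACK(true).setNumThreads(2);
    // "mobilenet_float.tflite" and the input/output shapes are placeholder assumptions.
    try (Interpreter interpreter =
        new Interpreter(new File("mobilenet_float.tflite"), options)) {
      float[][][][] input = new float[1][224][224][3];
      float[][] output = new float[1][1001];
      interpreter.run(input, output);
    }
  }
}
```

As with any delegate, quantized models fall back to the default CPU path, so the flag only changes behavior for float graphs.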
PiperOrigin-RevId: 316909226 Change-Id: Ib60cf259225b8a48a9830ccbb24ec10534b038ce --- tensorflow/lite/delegates/xnnpack/BUILD | 2 + .../delegates/xnnpack/xnnpack_delegate.cc | 3 + tensorflow/lite/java/BUILD | 1 + .../java/org/tensorflow/lite/Interpreter.java | 27 +++++++++ .../lite/NativeInterpreterWrapper.java | 7 +++ tensorflow/lite/java/src/main/native/BUILD | 1 + .../native/nativeinterpreterwrapper_jni.cc | 55 +++++++++++++++++++ .../lite/InterpreterMobileNetTest.java | 16 ++++++ .../org/tensorflow/lite/InterpreterTest.java | 32 +++++++++++ 9 files changed, 144 insertions(+) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index 5736a2995b1..97e6aea2a6b 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -21,6 +21,7 @@ cc_library( linkstatic = True, deps = [ "//tensorflow/lite:kernel_api", + "//tensorflow/lite:minimal_logging", "//tensorflow/lite:util", "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", @@ -47,6 +48,7 @@ cc_library( linkstatic = True, deps = [ "//tensorflow/lite:kernel_api", + "//tensorflow/lite:minimal_logging", "//tensorflow/lite:util", "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index c4c95b6b295..739e45f62e4 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/tools/optimize/sparsity/format_converter.h" namespace tflite { @@ -52,6 +53,8 @@ class Delegate { pthreadpool_create(static_cast(options->num_threads))); } #endif + TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, + "Created TensorFlow Lite XNNPACK delegate for CPU."); } TfLiteIntArray* PrepareOpsToDelegate(TfLiteContext* context); diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 101e98e3dd1..d0331bca3e5 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -408,6 +408,7 @@ tflite_jni_binary( "//tensorflow/lite/c:c_api", "//tensorflow/lite/c:c_api_experimental", "//tensorflow/lite/delegates/nnapi/java/src/main/native", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", "//tensorflow/lite/java/src/main/native", ], ) diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 7c9c5644f47..5993ee7a037 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -137,10 +137,37 @@ public final class Interpreter implements AutoCloseable { return this; } + /** + * Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK). + * + *

Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided
+     * via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
+     * operations. Eventually, we plan to enable this by default, as it can provide significant
+     * performance benefits for many classes of floating point models. See
+     * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+     * for more details.
+     *
+     * Things to keep in mind when enabling this flag:
+     *
+     *   • Startup time and resize time may increase.
+     *   • Baseline memory consumption may increase.
+     *   • Compatibility with other delegates (e.g., GPU) has not been fully validated.
+     *   • Quantized models will not see any benefit.
+     *
+     *
WARNING: This is an experimental interface that is subject to change. + */ + public Options setUseXNNPACK(boolean useXNNPACK) { + this.useXNNPACK = useXNNPACK; + return this; + } + int numThreads = -1; Boolean useNNAPI; Boolean allowFp16PrecisionForFp32; Boolean allowBufferHandleOutput; + Boolean useXNNPACK; final List delegates = new ArrayList<>(); } diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index 8eb3c66f3b5..5e9a6eecf00 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -80,6 +80,10 @@ final class NativeInterpreterWrapper implements AutoCloseable { allowBufferHandleOutput(interpreterHandle, options.allowBufferHandleOutput.booleanValue()); } applyDelegates(options); + if (options.useXNNPACK != null) { + useXNNPACK( + interpreterHandle, errorHandle, options.useXNNPACK.booleanValue(), options.numThreads); + } allocateTensors(interpreterHandle, errorHandle); this.isMemoryAllocated = true; } @@ -438,6 +442,9 @@ final class NativeInterpreterWrapper implements AutoCloseable { private static native void allowBufferHandleOutput(long interpreterHandle, boolean allow); + private static native void useXNNPACK( + long interpreterHandle, long errorHandle, boolean state, int numThreads); + private static native long createErrorReporter(int size); private static native long createModel(String modelPathOrBuffer, long errorHandle); diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD index fdbbc9dc72c..52f79615a9f 100644 --- a/tensorflow/lite/java/src/main/native/BUILD +++ b/tensorflow/lite/java/src/main/native/BUILD @@ -31,6 +31,7 @@ cc_library( "//tensorflow/lite:string_util", "//tensorflow/lite:util", "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_hdrs_only", "//tensorflow/lite/experimental/tflite_api_dispatcher:tflite_api_dispatcher_with_kernels", "//tensorflow/lite/java/jni", ], diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index 690b58ac1f4..7abe0f518f0 100644 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -20,6 +21,7 @@ limitations under the License. #include #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" #include "tensorflow/lite/experimental/tflite_api_dispatcher/tflite_api_dispatcher.h" #include "tensorflow/lite/java/src/main/native/jni_utils.h" #include "tensorflow/lite/util.h" @@ -323,6 +325,59 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allowBufferHandleOutput( interpreter->SetAllowBufferHandleOutput(allow); } +JNIEXPORT void JNICALL +Java_org_tensorflow_lite_NativeInterpreterWrapper_useXNNPACK( + JNIEnv* env, jclass clazz, jlong handle, jlong error_handle, jboolean state, + jint num_threads) { + // If not using xnnpack, simply don't apply the delegate. 
+ if (!state) { + return; + } + + tflite_api_dispatcher::Interpreter* interpreter = + convertLongToInterpreter(env, handle); + if (interpreter == nullptr) { + return; + } + + BufferErrorReporter* error_reporter = + convertLongToErrorReporter(env, error_handle); + if (error_reporter == nullptr) { + return; + } + + // We use dynamic loading to avoid taking a hard dependency on XNNPack. + // This allows clients that use trimmed builds to save on binary size. + auto xnnpack_options_default = + reinterpret_cast( + dlsym(RTLD_DEFAULT, "TfLiteXNNPackDelegateOptionsDefault")); + auto xnnpack_create = + reinterpret_cast( + dlsym(RTLD_DEFAULT, "TfLiteXNNPackDelegateCreate")); + auto xnnpack_delete = + reinterpret_cast( + dlsym(RTLD_DEFAULT, "TfLiteXNNPackDelegateDelete")); + + if (xnnpack_options_default && xnnpack_create && xnnpack_delete) { + TfLiteXNNPackDelegateOptions options = xnnpack_options_default(); + if (num_threads > 0) { + options.num_threads = num_threads; + } + tflite_api_dispatcher::Interpreter::TfLiteDelegatePtr delegate( + xnnpack_create(&options), xnnpack_delete); + if (interpreter->ModifyGraphWithDelegate(std::move(delegate)) != + kTfLiteOk) { + ThrowException(env, kIllegalArgumentException, + "Internal error: Failed to apply XNNPACK delegate: %s", + error_reporter->CachedErrorMessage()); + } + } else { + ThrowException(env, kIllegalArgumentException, + "Failed to load XNNPACK delegate from current runtime. " + "Have you added the necessary dependencies?"); + } +} + JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env, jclass clazz, diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java index 446cf5f7b02..80b3bf3cab9 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java @@ -54,6 +54,16 @@ public final class InterpreterMobileNetTest { runMobileNetFloatTest(new Interpreter.Options().setNumThreads(2)); } + @Test + public void testMobileNetEnhancedCpuKernels() { + runMobileNetFloatTest(new Interpreter.Options().setUseXNNPACK(true)); + } + + @Test + public void testMobileNetEnhancedCpuKernelsMultithreaded() { + runMobileNetFloatTest(new Interpreter.Options().setUseXNNPACK(true).setNumThreads(2)); + } + @Test public void testMobileNetQuantized() { runMobileNetQuantizedTest(new Interpreter.Options()); @@ -64,6 +74,12 @@ public final class InterpreterMobileNetTest { runMobileNetQuantizedTest(new Interpreter.Options().setNumThreads(2)); } + @Test + public void testMobileNetQuantizedEnhancedCpu() { + // The "enhanced CPU flag" should only impact float models, this is a sanity test to confirm. 
+ runMobileNetQuantizedTest(new Interpreter.Options().setUseXNNPACK(true)); + } + private static void runMobileNetFloatTest(Interpreter.Options options) { ByteBuffer img = TestUtils.getTestImageAsFloatByteBuffer( diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index 3daa9fe0766..f1d4ff147b1 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -409,6 +409,38 @@ public final class InterpreterTest { interpreter.close(); } + @Test + public void testUseXNNPACK() throws Exception { + Interpreter interpreter = + new Interpreter(MODEL_BUFFER, new Interpreter.Options().setUseXNNPACK(true)); + float[] oneD = {1.23f, 6.54f, 7.81f}; + float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD}; + float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD}; + float[][][][] fourD = {threeD, threeD}; + float[][][][] parsedOutputs = new float[2][8][8][3]; + interpreter.run(fourD, parsedOutputs); + float[] outputOneD = parsedOutputs[0][0][0]; + float[] expected = {3.69f, 19.62f, 23.43f}; + assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder(); + interpreter.close(); + } + + @Test + public void testResizeWithEnhancedCpuKernels() throws Exception { + Interpreter interpreter = + new Interpreter(MODEL_BUFFER, new Interpreter.Options().setUseXNNPACK(true)); + float[] input = {1.f}; + float[] output = new float[1]; + interpreter.run(input, output); + assertThat(output).usingTolerance(0.1f).containsExactly(new float[] {3.f}).inOrder(); + + // The new input shape should trigger a resize. Inference should still work properly. + float[] input2 = {1.f, 2.f}; + float[] output2 = new float[2]; + interpreter.run(input2, output2); + assertThat(output2).usingTolerance(0.1f).containsExactly(new float[] {3.f, 6.f}).inOrder(); + } + @Test public void testRedundantClose() throws Exception { Interpreter interpreter = new Interpreter(MODEL_BUFFER); From 076f1474faa95cb70277d96000c2803ea1f8852b Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Thu, 18 Jun 2020 00:24:43 +0700 Subject: [PATCH 0407/1390] Return instead of using out parameter --- tensorflow/c/env.cc | 4 ++-- tensorflow/c/env.h | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 6d8528bc42f..ce715c43acb 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -147,8 +147,8 @@ TF_StringStream* TF_GetLocalTempDirectories() { return list; } -void TF_GetTempFileName(const char* extension, char** name) { - *name = strdup(::tensorflow::io::GetTempFilename(extension).c_str()); +char* TF_GetTempFileName(const char* extension) { + return strdup(::tensorflow::io::GetTempFilename(extension).c_str()); } TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) { diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index 29ec417a75e..7dc7ac32f08 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -154,8 +154,7 @@ TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void); // Creates a temporary file name with an extension. // The caller is responsible for freeing the returned pointer. -TF_CAPI_EXPORT extern void TF_GetTempFileName(const char* extension, - char** name); +TF_CAPI_EXPORT extern char* TF_GetTempFileName(const char* extension); // Returns the number of nanoseconds since the Unix epoch. 
TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void); From c1ae0ef7ce398891118797be3de3e8437f306c7b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 10:22:03 -0700 Subject: [PATCH 0408/1390] Qualify uses of std::string PiperOrigin-RevId: 316914367 Change-Id: Iae32a48b4db10d313f9e9b72f56eb6a6ac64c55f --- tensorflow/lite/toco/python/toco_python_api.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc index 3f3d301a40d..0f21d0854ae 100644 --- a/tensorflow/lite/toco/python/toco_python_api.cc +++ b/tensorflow/lite/toco/python/toco_python_api.cc @@ -47,9 +47,9 @@ namespace toco { void PopulateConversionLogHelper(const toco::ModelFlags& model_flags, toco::TocoFlags* toco_flags, - const string& input_contents_txt, - const string& output_file_contents_txt, - const string& error_message, + const std::string& input_contents_txt, + const std::string& output_file_contents_txt, + const std::string& error_message, GraphVizDumpOptions* dump_options) { // Make sure the graphviz file will be dumped under the same folder. dump_options->dump_graphviz = toco_flags->conversion_summary_dir(); @@ -167,7 +167,7 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw, dump_options.dump_graphviz_video = toco_flags.dump_graphviz_include_video(); } - string output_file_contents_txt; + std::string output_file_contents_txt; tensorflow::Status status; int64 arithmetic_ops_count; @@ -221,7 +221,7 @@ PyObject* TocoGetPotentiallySupportedOps() { std::vector supported_ops = toco::GetPotentiallySupportedOps(); PyObject* list = PyList_New(supported_ops.size()); for (size_t i = 0; i < supported_ops.size(); ++i) { - const string& op = supported_ops[i]; + const std::string& op = supported_ops[i]; PyObject* op_dict = PyDict_New(); PyDict_SetItemString(op_dict, "op", PyUnicode_FromString(op.c_str())); PyList_SetItem(list, i, op_dict); From 4747be646ba3a7cfe979afeb9fcaafdca5f1c800 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Wed, 17 Jun 2020 10:46:23 -0700 Subject: [PATCH 0409/1390] [tf.data service] Increase the number of client-side uncompress threads. This is necessary to prevent uncompression from becoming the bottleneck. The change required updating the unit tests because now the `distribute` transformation may prefetch up to 16 elements. PiperOrigin-RevId: 316919714 Change-Id: I4e0c0b2985792a2a2a0f216de2143a645076b1c8 --- .../data/experimental/ops/data_service_ops.py | 5 +++- .../kernel_tests/data_service_ops_test.py | 27 +++++++++++-------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index 39790d843ba..01ec155a89c 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -240,8 +240,11 @@ def _distribute(processing_mode, task_refresh_interval_hint_ms=task_refresh_interval_hint_ms) # TODO(b/157105111): Make this an autotuned parallel map when we have a way # to limit memory usage. + # The value 16 is chosen based on experience with pipelines that require + # more than 8 parallel calls to prevent this stage from being a bottleneck. 
dataset = dataset.map( - lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec)) + lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec), + num_parallel_calls=16) # Disable autosharding for shared jobs. if job_name: diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index d316009ce0c..2356a866d6e 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -201,13 +201,18 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): self._new_worker = server_lib.WorkerServer( port=port, master_address=self._master._address, protocol=PROTOCOL) - # The dataset starts over now that we read from the new worker. - for i in range(num_elements): + # There may have been some elements prefetched from the first worker + # before it was stopped. + while True: + val = next(iterator).numpy() + if val == 0: + break + + # The dataset starts over now that we read from the new worker. + # TODO(b/157086991): Iterate until end of sequence when we support + # detecting lost workers. + for i in range(1, num_elements // 2): val = next(iterator).numpy() - if val == midpoint and i != midpoint: - # There may have been one last element prefetched from the first worker - # before it was stopped. - val = next(iterator).numpy() self.assertEqual(i, val) @combinations.generate(test_base.eager_only_combinations()) @@ -248,7 +253,7 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testSharedJobName(self): - num_elements = 10 + num_elements = 100 master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) ds1 = _make_distributed_dataset(ds, master_address, job_name="job_name") @@ -256,7 +261,7 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): iter1 = iter(ds1) iter2 = iter(ds2) results = [] - for _ in range(3): + for _ in range(num_elements // 5): results.append(next(iter1).numpy()) results.append(next(iter2).numpy()) for elem in iter1: @@ -291,7 +296,7 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testSharedJobNameRepeat(self): - num_elements = 10 + num_elements = 100 num_repetitions = 3 master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) @@ -302,9 +307,9 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): results = [] iter1 = iter(ds1) iter2 = iter(ds2) - for _ in range(((num_elements * num_repetitions) // 2) - 1): + for _ in range((num_elements * num_repetitions) // 5): results.append(next(iter1).numpy()) - for _ in range(((num_elements * num_repetitions) // 2) - 1): + for _ in range((num_elements * num_repetitions) // 5): results.append(next(iter2).numpy()) for elem in iter1: results.append(elem.numpy()) From 2ba59dab2c22a592cb47660ecdb12463e457139c Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 17 Jun 2020 10:56:38 -0700 Subject: [PATCH 0410/1390] Simplify Layer.add_udpate in v2 and update version_selector to use v1 inside a tf.compat.v1.wrap_function. No longer track unused Layer.updates in v2. 
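To make the behavior change concrete, here is a small sketch patterned after the normalization test updated in this patch, assuming a TF 2.x build with eager execution enabled: callable updates now run immediately as part of the layer call, while code inside a `tf.compat.v1.wrap_function` is still treated as v1 and keeps tracking `layer.updates`.

```python
import tensorflow as tf

def build_and_call_bn():
  layer = tf.keras.layers.BatchNormalization()
  y = layer(tf.ones((10, 1)), training=True)
  # Inside a v1 wrap_function the v1 Layer class is selected, so the
  # moving-mean/moving-variance update ops are still tracked here.
  assert len(layer.updates) == 2
  return y

# Eagerly: the updates run during the call itself; nothing needs to be tracked.
eager_layer = tf.keras.layers.BatchNormalization()
eager_layer(tf.ones((10, 1)), training=True)

# Wrapped as a v1-style function: the assertion above holds.
wrapped_fn = tf.compat.v1.wrap_function(build_and_call_bn, [])
wrapped_fn()
```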
PiperOrigin-RevId: 316921838 Change-Id: I4698a0c925528594f402f824705d66b8a1ae7b72 --- tensorflow/python/keras/engine/base_layer.py | 53 +++---------------- .../python/keras/engine/sequential_test.py | 33 +++++------- .../python/keras/layers/normalization_test.py | 19 ++----- .../python/keras/layers/wrappers_test.py | 3 -- .../python/keras/utils/version_utils.py | 33 ++++++++---- 5 files changed, 50 insertions(+), 91 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index a0ee25417c0..5ddce951491 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -1733,54 +1733,15 @@ class Layer(module.Module, version_utils.LayerVersionSelector): inputs: Deprecated, will be automatically inferred. """ call_context = base_layer_utils.call_context() - - if (ds_context.has_strategy() and - ds_context.in_cross_replica_context() and - # When saving the model, the distribution strategy context should be - # ignored, following the default path for adding updates. - not call_context.saving): - # Updates don't need to be run in a cross-replica context. + # No need to run updates during Functional API construction. + if call_context.in_keras_graph: return - updates = generic_utils.to_list(updates) - - # All updates can be run immediately in Eager or in a tf.function. - if base_layer_utils.is_in_eager_or_tf_function(): - if not call_context.frozen: - for update in updates: - if callable(update): - update() - return - - def process_update(x): - """Standardize update ops. - - Arguments: - x: Tensor, op, or callable. - - Returns: - An update op. - """ - if callable(x): - update = lambda: process_update(x()) - if not ops.executing_eagerly_outside_functions(): - # In V1 mode, call the callable right away and process. This is needed - # for TPU strategy. - return update() - elif isinstance(x, ops.Operation): - update = x - elif hasattr(x, 'op'): - update = x.op - else: - update = ops.convert_to_tensor_v2(x) - return update - - updates = [process_update(x) for x in updates] - # Non-callable Updates are run automatically inside `call` in V2, so - # they do not need to be tracked later. - if ops.executing_eagerly_outside_functions() and call_context.in_call: - updates = [u for u in updates if callable(u)] - self._updates.extend(updates) + # Callable updates are disabled by setting `trainable=False`. + if not call_context.frozen: + for update in nest.flatten(updates): + if callable(update): + update() def set_weights(self, weights): """Sets the weights of the layer, from Numpy arrays. 
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py index 9589d24fc57..773ce003656 100644 --- a/tensorflow/python/keras/engine/sequential_test.py +++ b/tensorflow/python/keras/engine/sequential_test.py @@ -231,33 +231,28 @@ class TestSequential(keras_parameterized.TestCase): inner_model.trainable = True self.assertEqual(len(model.trainable_weights), 4) + @keras_parameterized.run_all_keras_modes def test_sequential_update_disabling(self): val_a = np.random.random((10, 4)) val_out = np.random.random((10, 4)) - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.BatchNormalization(input_shape=(4,))) - assert model.updates + model = keras.models.Sequential() + model.add(keras.layers.BatchNormalization(input_shape=(4,))) - model.trainable = False - assert not model.updates + model.trainable = False + model.compile('sgd', 'mse') - model.compile('sgd', 'mse') - assert not model.updates + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) + model.trainable = True + model.compile('sgd', 'mse') - model.trainable = True - model.compile('sgd', 'mse') - assert model.updates - - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - assert np.abs(np.sum(x1 - x2)) > 1e-5 + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + assert np.abs(np.sum(x1 - x2)) > 1e-5 @keras_parameterized.run_all_keras_modes def test_sequential_deferred_build_serialization(self): diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py index ef43bcf5d22..39992f7580a 100644 --- a/tensorflow/python/keras/layers/normalization_test.py +++ b/tensorflow/python/keras/layers/normalization_test.py @@ -325,18 +325,18 @@ class BatchNormalizationV2Test(keras_parameterized.TestCase): norm(inp) def test_updates_in_wrap_function(self): - layer = normalization.BatchNormalization() def my_func(): + layer = normalization.BatchNormalization() x = array_ops.ones((10, 1)) - return layer(x, training=True) + y = layer(x, training=True) + # Updates should be tracked in a `wrap_function`. + self.assertLen(layer.updates, 2) + return y wrapped_fn = wrap_function.wrap_function(my_func, []) wrapped_fn() - # Updates should be tracked in a `wrap_function`. 
- self.assertLen(layer.updates, 2) - @keras_parameterized.run_all_keras_modes def test_basic_batchnorm_v2_none_shape_and_virtual_batch_size(self): # Test case for GitHub issue for 32380 @@ -392,15 +392,11 @@ class NormalizationLayersGraphModeOnlyTest( model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse') model.train_on_batch(x, x) - self.assertLen(bn.updates, 4) - # Test model-level reuse x3 = keras.layers.Input(shape=(10,)) y3 = model(x3) new_model = keras.models.Model(x3, y3, name='new_model') - self.assertLen(new_model.updates, 6) - self.assertLen(model.updates, 6) new_model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse') new_model.train_on_batch(x, x) @@ -415,10 +411,7 @@ class NormalizationLayersGraphModeOnlyTest( model = keras.models.Model(a, b) model.trainable = False - assert not model.updates - model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse') - assert not model.updates x1 = model.predict(val_a) model.train_on_batch(val_a, val_out) @@ -427,7 +420,6 @@ class NormalizationLayersGraphModeOnlyTest( model.trainable = True model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse') - assert model.updates model.train_on_batch(val_a, val_out) x2 = model.predict(val_a) @@ -435,7 +427,6 @@ class NormalizationLayersGraphModeOnlyTest( layer.trainable = False model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse') - assert not model.updates x1 = model.predict(val_a) model.train_on_batch(val_a, val_out) diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index a73177fff12..5ee794dd1ef 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -234,13 +234,10 @@ class TimeDistributedTest(keras_parameterized.TestCase): x = keras.layers.Input(shape=(3, 2)) layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization()) _ = layer(x) - self.assertEqual(len(layer.updates), 2) self.assertEqual(len(layer.trainable_weights), 2) layer.trainable = False - assert not layer.updates assert not layer.trainable_weights layer.trainable = True - assert len(layer.updates) == 2 assert len(layer.trainable_weights) == 2 def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self): diff --git a/tensorflow/python/keras/utils/version_utils.py b/tensorflow/python/keras/utils/version_utils.py index 551a07d2422..d3796dcbf92 100644 --- a/tensorflow/python/keras/utils/version_utils.py +++ b/tensorflow/python/keras/utils/version_utils.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.util import lazy_loader @@ -51,8 +52,8 @@ class ModelVersionSelector(object): """Chooses between Keras v1 and v2 Model class.""" def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - eager_enabled = ops.executing_eagerly_outside_functions() - cls = swap_class(cls, training.Model, training_v1.Model, eager_enabled) + use_v2 = should_use_v2() + cls = swap_class(cls, training.Model, training_v1.Model, use_v2) # pylint: disable=self-cls-assignment return super(ModelVersionSelector, cls).__new__(cls) @@ -60,8 +61,8 @@ class LayerVersionSelector(object): """Chooses between Keras v1 and v2 Layer class.""" def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - eager_enabled = ops.executing_eagerly_outside_functions() - 
cls = swap_class(cls, base_layer.Layer, base_layer_v1.Layer, eager_enabled) + use_v2 = should_use_v2() + cls = swap_class(cls, base_layer.Layer, base_layer_v1.Layer, use_v2) # pylint: disable=self-cls-assignment return super(LayerVersionSelector, cls).__new__(cls) @@ -69,10 +70,10 @@ class TensorBoardVersionSelector(object): """Chooses between Keras v1 and v2 TensorBoard callback class.""" def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - eager_enabled = ops.executing_eagerly_outside_functions() + use_v2 = should_use_v2() start_cls = cls cls = swap_class(start_cls, callbacks.TensorBoard, callbacks_v1.TensorBoard, - eager_enabled) + use_v2) if start_cls == callbacks_v1.TensorBoard and cls == callbacks.TensorBoard: # Since the v2 class is not a subclass of the v1 class, __init__ has to # be called manually. @@ -80,19 +81,33 @@ class TensorBoardVersionSelector(object): return super(TensorBoardVersionSelector, cls).__new__(cls) -def swap_class(cls, v2_cls, v1_cls, eager_enabled): +def should_use_v2(): + """Determine if v1 or v2 version should be used.""" + if context.executing_eagerly(): + return True + elif ops.executing_eagerly_outside_functions(): + # Check for a v1 `wrap_function` FuncGraph. + # Code inside a `wrap_function` is treated like v1 code. + graph = ops.get_default_graph() + if (getattr(graph, "name", False) and + graph.name.startswith("wrapped_function")): + return False + return True + + +def swap_class(cls, v2_cls, v1_cls, use_v2): """Swaps in v2_cls or v1_cls depending on graph mode.""" if cls == object: return cls if cls in (v2_cls, v1_cls): - if eager_enabled: + if use_v2: return v2_cls return v1_cls # Recursively search superclasses to swap in the right Keras class. cls.__bases__ = tuple( - swap_class(base, v2_cls, v1_cls, eager_enabled) for base in cls.__bases__) + swap_class(base, v2_cls, v1_cls, use_v2) for base in cls.__bases__) return cls From f8fd28e4a0e694ca808da57e10d1e3b40e4f2fb9 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 17 Jun 2020 10:58:32 -0700 Subject: [PATCH 0411/1390] Create the Java API doc section on tensorflow.org PiperOrigin-RevId: 316922276 Change-Id: I1fd4458017daecfd7256313480a4b3d1602a9310 --- tensorflow/lite/g3doc/_book.yaml | 2 ++ tensorflow/lite/g3doc/api_docs/index.md | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml index 0da3d152090..abb18870003 100644 --- a/tensorflow/lite/g3doc/_book.yaml +++ b/tensorflow/lite/g3doc/_book.yaml @@ -198,6 +198,8 @@ upper_tabs: - title: "Overview" status: external path: /api_docs/python/tf/lite + - heading: "Android (Java)" + - include: /lite/api_docs/java/_toc.yaml - heading: "C++" - title: Overview path: /lite/api_docs/cc/ diff --git a/tensorflow/lite/g3doc/api_docs/index.md b/tensorflow/lite/g3doc/api_docs/index.md index 533f5881eb4..5db55fb28a3 100644 --- a/tensorflow/lite/g3doc/api_docs/index.md +++ b/tensorflow/lite/g3doc/api_docs/index.md @@ -4,7 +4,7 @@ The API reference documentation provides detailed information for each of the classes and methods in the TensorFlow Lite library. Choose your preferred platform from the list below. 
-* [Python API reference](/api_docs/python/tf/lite) -* Android API reference (coming soon) +* [Python API reference](https://tensorflow.org/api_docs/python/tf/lite) +* [Android (Java) API reference](https://tensorflow.org/lite/api_docs/java/org/tensorflow/lite/package-summary) * iOS API reference (coming soon) -* [C++ API reference](/lite/api_docs/cc/) +* [C++ API reference](https://tensorflow.org/lite/api_docs/cc) From 2a8bbb92b74a5dae9dd7e0cc3aed9b79231a06c3 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 17 Jun 2020 10:59:06 -0700 Subject: [PATCH 0412/1390] Reduce TensorShape.__init__ overhead by 50%. TensorShape.__init__ is on the hotpath because a TensorShape is created the first time EagerTensor.shape is called. The TensorShape is created from EagerTensor._shape_tuple, which is a tuple of ints. This change optimizes the code for this common path. PiperOrigin-RevId: 316922384 Change-Id: I063ea393450123ea4150972e5c73647f03a29cf5 --- .../data/kernel_tests/from_generator_test.py | 4 ++-- tensorflow/python/eager/benchmarks_test.py | 14 ++++++++++++++ tensorflow/python/framework/tensor_shape.py | 12 +++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/data/kernel_tests/from_generator_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py index a08f54a7101..386108f0de7 100644 --- a/tensorflow/python/data/kernel_tests/from_generator_test.py +++ b/tensorflow/python/data/kernel_tests/from_generator_test.py @@ -465,8 +465,8 @@ class FromGeneratorTest(test_base.DatasetTestBase, parameterized.TestCase): for _ in range(10): yield [20] - with self.assertRaisesRegexp( - TypeError, r"Failed to convert '\[\[1\]\]' to a shape"): + with self.assertRaisesRegex(TypeError, + r"Dimension value must be integer or None"): dataset_ops.Dataset.from_generator( generator, output_types=(dtypes.int64), output_shapes=[[1]]) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index b7c8395790a..24e86c77a14 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -50,6 +50,7 @@ from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -1441,6 +1442,19 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) + def benchmark_tf_tensor_shape_creation_overhead(self): + # A `TensorShape` is created the first time `EagerTensor.shape` is + # called, which puts `TensorShape.__init__` on the hotpath. The + # `TensorShape` is created from `EagerTensor._shape_tuple`. + + x = array_ops.ones((1, 1)) + shape_tuple = x._shape_tuple() + + def fn(): + tensor_shape.TensorShape(shape_tuple) + + self._run(fn, 100000) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py index fd229b6691a..20508f37eb7 100644 --- a/tensorflow/python/framework/tensor_shape.py +++ b/tensorflow/python/framework/tensor_shape.py @@ -184,10 +184,14 @@ class Dimension(object): def __init__(self, value): """Creates a new Dimension with the given value.""" - if value is None: + if isinstance(value, int): # Most common case. 
+ if value < 0: + raise ValueError("Dimension %d must be >= 0" % value) + self._value = value + elif value is None: self._value = None elif isinstance(value, Dimension): - self._value = value + self._value = value._value else: try: # int(...) compensates for the int/long dichotomy on Python 2.X. @@ -748,7 +752,9 @@ class TensorShape(object): Raises: TypeError: If dims cannot be converted to a list of dimensions. """ - if dims is None: + if isinstance(dims, (tuple, list)): # Most common case. + self._dims = [Dimension(d) for d in dims] + elif dims is None: self._dims = None elif isinstance(dims, tensor_shape_pb2.TensorShapeProto): if dims.unknown_rank: From c8dd07ae28a5bdc291cf43adb128fd11a6b2d6f7 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Wed, 17 Jun 2020 11:00:38 -0700 Subject: [PATCH 0413/1390] Fix error check in TPUEmbedding to work when used in outside compilation. PiperOrigin-RevId: 316922750 Change-Id: Ie6b4c83e54f3e6d90fbe38fe8da0eea84312c382 --- tensorflow/python/tpu/tpu_embedding_v2.py | 20 ++++++++---- .../python/tpu/tpu_embedding_v2_test.py | 31 ++++++++++++++++--- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/tpu/tpu_embedding_v2.py b/tensorflow/python/tpu/tpu_embedding_v2.py index f7a383c440c..e5cfba7c587 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2.py +++ b/tensorflow/python/tpu/tpu_embedding_v2.py @@ -1024,11 +1024,8 @@ class TPUEmbedding(tracking.AutoTrackable): def _raise_error_for_inputs_not_on_cpu(self, features): """Checks all tensors in features to see are placed on the CPU.""" - # expand_composites here is important, we need to check the device of each - # underlying tensor. - for path, input_tensor in nest.flatten_with_joined_string_paths( - features, expand_composites=True): - spec = tf_device.DeviceSpec.from_string(input_tensor.device) + def check_device(path, device_string): + spec = tf_device.DeviceSpec.from_string(device_string) if spec.device_type == "TPU": raise ValueError( "Received input tensor {} which is on a TPU input device {}. Input " @@ -1037,7 +1034,18 @@ class TPUEmbedding(tracking.AutoTrackable): "setting the 'experimental_prefetch_to_device' option of the " "dataset distribution function. See the documentation of the " "enqueue method for an example.".format( - path, input_tensor.device)) + path, device_string)) + + # expand_composites here is important, we need to check the device of each + # underlying tensor. + for path, input_tensor in nest.flatten_with_joined_string_paths( + features, expand_composites=True): + if (input_tensor.op.type == "Identity" and + input_tensor.op.inputs[0].op.type == "TPUReplicatedInput"): + for tensor in input_tensor.op.inputs[0].op.inputs: + check_device(path, tensor.device) + else: + check_device(path, input_tensor.device) def enqueue(self, features, weights=None, training=True, name=None): """Enqueues id tensors for embedding lookup. 
diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py index ebaf2791055..ff09085f3f1 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2_test.py +++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py @@ -727,10 +727,33 @@ class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): def get_activations(): return mid_level_api.dequeue() - sparse_features = next(sparse_iter) - mid_level_api.enqueue(sparse_features, training=False) - sparse_activations = strategy.run(get_activations) - return sparse_activations + features = next(sparse_iter) + mid_level_api.enqueue(features, training=False) + activations = strategy.run(get_activations) + return activations + + with self.assertRaisesRegex(ValueError, 'which is on a TPU input device'): + test_fn() + + @parameterized.parameters([True, False]) + def test_enqueue_cpu_tensor_with_outside_compilation(self, use_mlir): + if use_mlir: + config.enable_mlir_bridge() + + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy) + sparse_iter = iter(strategy.experimental_distribute_datasets_from_function( + input_fn)) + + @def_function.function + def test_fn(): + def get_activations(features): + mid_level_api.enqueue(features, training=False) + return mid_level_api.dequeue() + + activations = strategy.run(get_activations, args=(next(sparse_iter),)) + return activations with self.assertRaisesRegex(ValueError, 'which is on a TPU input device'): test_fn() From ab7cb8336c780ffe02ec983e8910cf62250ec3e1 Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 17 Jun 2020 11:15:35 -0700 Subject: [PATCH 0414/1390] Use the separate matrix/vector scaling factor version of MatrixBatchVectorMultiplyAccumulate in projection too. PiperOrigin-RevId: 316926050 Change-Id: I64e9febce8590231c78a90a0a5aec5da11996195 --- tensorflow/lite/kernels/lstm_eval.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 65f68b34251..4ac9e538317 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -874,16 +874,13 @@ inline void LstmStepHybrid( tensor_utils::BatchQuantizeFloats( output_gate_scratch, n_batch, n_cell, quantized_cell_state_ptr, scaling_factors, zero_points, asymmetric_quantize_inputs); - for (int b = 0; b < n_batch; ++b) { - scaling_factors_scratch[b] = - scaling_factors[b] * projection_weights_scale; - } tensor_utils::MatrixBatchVectorMultiplyAccumulate( projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr, - scaling_factors_scratch, n_batch, output_state_ptr, + projection_weights_scale, scaling_factors, n_batch, output_state_ptr, /*per_channel_scale=*/nullptr, asymmetric_quantize_inputs ? zero_points : nullptr, accum_scratch_ptr, - projection_weights_row_sums, compute_row_sums, context); + projection_weights_row_sums, compute_row_sums, + scaling_factors_scratch, context); } if (params->proj_clip > 0.0) { tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, From 5e7fc9584a983c3e53e2c845017acded9b19f180 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Wed, 17 Jun 2020 11:21:20 -0700 Subject: [PATCH 0415/1390] Add a unit test demonstrating collective ops with different groups of devices. Collectives configured this way can be used to implement all-reduce for batch norm with a subset of all available devices. 
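The snippet below is an editorial sketch, not part of the patch: it illustrates how an all-reduce restricted to a two-device subset can be expressed by giving that subset its own group_size/group_key, using the same collective_ops.all_reduce call as the test added here. The helper name subset_all_reduce and the sample values are hypothetical, and the sketch assumes the process has already been split into multiple logical CPU devices (as the test does via set_logical_device_configuration).

    from tensorflow.python.eager import def_function
    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import collective_ops

    @def_function.function
    def subset_all_reduce(group_size=2, group_key=1, instance_key=1):
      # Each device in the subset contributes one tensor; the collective sums
      # them across exactly `group_size` members, leaving any remaining
      # logical CPUs out of this group.
      reduced = []
      for device_idx in range(group_size):
        with ops.device('/CPU:{}'.format(device_idx)):
          tensor = constant_op.constant([1.0, 2.0, 3.0, 4.0])
          reduced.append(
              collective_ops.all_reduce(tensor, group_size, group_key,
                                        instance_key, merge_op='Add',
                                        final_op='Id'))
      return reduced

Calling this with a different (group_size, group_key) pair creates an independent group, which is the property an all-reduce for batch norm over a subset of the available devices would rely on.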
PiperOrigin-RevId: 316927203 Change-Id: Ic288d01134776efbe0e49a83fd8030f721890725 --- tensorflow/python/ops/collective_ops_test.py | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py index 47c25fcafc0..9727593a1c5 100644 --- a/tensorflow/python/ops/collective_ops_test.py +++ b/tensorflow/python/ops/collective_ops_test.py @@ -581,6 +581,41 @@ class CollectiveOpTest(test.TestCase): results = sess.run(run_ops) self.assertEqual(results, [3., 3., 3., 3.]) + @test_util.run_v2_only + def testMultipleGroups(self): + context._reset_context() + cpus = config.list_physical_devices('CPU') + self.assertEqual(len(cpus), 1) + config.set_logical_device_configuration(cpus[0], [ + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration() + ]) + context.ensure_initialized() + num_elements = 4 + + @def_function.function + def run_all_reduce(group_size, group_key): + instance_key = group_key + input_value = [group_key for i in range(num_elements)] + collectives = [] + for device_idx in range(group_size): + with ops.device('/CPU:{}'.format(device_idx)): + input_tensor = constant_op.constant(input_value) + collectives.append(collective_ops.all_reduce( + input_tensor, group_size, group_key, instance_key, merge_op='Add', + final_op='Id')) + return collectives + + def run_and_assert(group_size, group_key): + for reduced_tensor in run_all_reduce(group_size, group_key): + self.assertAllEqual( + [group_key * group_size for i in range(num_elements)], + reduced_tensor.numpy()) + + run_and_assert(group_size=2, group_key=1) + run_and_assert(group_size=3, group_key=2) + if __name__ == '__main__': test.main() From c6a3ab159f0d19958198e315d2f4f0943725b840 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Wed, 17 Jun 2020 11:25:14 -0700 Subject: [PATCH 0416/1390] Simplify broadcast plus compare PiperOrigin-RevId: 316927924 Change-Id: If7f3ce209cbff3720c19a60d3e713167e1c6e8c6 --- .../xla/service/algebraic_simplifier.cc | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index ce2a801fccd..130661bf1cd 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -2815,6 +2815,28 @@ Status AlgebraicSimplifierVisitor::HandleCompare(HloInstruction* compare) { HloInstruction* lhs; HloInstruction* rhs; CHECK(Match(compare, m::Compare(m::Op(&lhs), m::Op(&rhs)))); + { + // compare(broadcast(a) + x, broadcast(b)) ==> + // compare(x, broadcast(b-a)) + HloInstruction *x, *a, *b; + if (Match(compare, + m::Compare( + m::AddAnyOrder(m::Op(&x), m::Broadcast(m::Op(&a).WithShape( + m::Shape().IsScalar()))), + m::Broadcast(m::Op(&b).WithShape(m::Shape().IsScalar()))))) { + if (ShapeUtil::ElementIsSigned(x->shape())) { + HloInstruction* sub = + computation_->AddInstruction(HloInstruction::CreateBinary( + b->shape(), HloOpcode::kSubtract, b, a)); + HloInstruction* broadcast = computation_->AddInstruction( + HloInstruction::CreateBroadcast(x->shape(), sub, {})); + HloInstruction* new_compare = computation_->AddInstruction( + HloInstruction::CreateCompare(compare->shape(), x, broadcast, + compare->comparison_direction())); + return ReplaceInstruction(compare, new_compare); + } + } + } if (compare->comparison_direction() == ComparisonDirection::kLt && lhs->opcode() == 
HloOpcode::kIota && IsAll(rhs, 0)) { From ef55a40b374d7310e4ce3149d86395d403403d0d Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 17 Jun 2020 18:38:24 +0000 Subject: [PATCH 0417/1390] Move tf.equal shape inference test to math_ops_test.cc also added additional shape inference test cases Signed-off-by: Yong Tang --- tensorflow/core/ops/math_ops_test.cc | 18 ++++++++++++++++++ .../python/autograph/operators/logical_test.py | 14 -------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc index a2837d88bde..2b65f88042c 100644 --- a/tensorflow/core/ops/math_ops_test.cc +++ b/tensorflow/core/ops/math_ops_test.cc @@ -604,4 +604,22 @@ TEST(MathOpsTest, SobolSample) { INFER_OK(op, "[];[];[]", "[?,?]"); } + +TEST(MathOpsTest, EqualOp) { + ShapeInferenceTestOp op("Equal"); + AddNodeAttr("incompatible_shape_error", true, &op.node_def); + + INFER_OK(op, "?;?", "?"); + INFER_OK(op, "[1,2];?", "?"); + INFER_OK(op, "?;[1,2]", "?"); + + INFER_OK(op, "[1,2,3];[1]", "[d0_0,d0_1,d0_2]"); + INFER_OK(op, "[?,2,1];[1,3]", "[d0_0,d0_1,d1_1]"); + INFER_OK(op, "[1,?,3];[3,1]", "[d0_0,d1_0,d0_2]"); + INFER_OK(op, "[1,2,3];[2,1,3]", "[d1_0,d0_1,d0_2]"); + + // Note: Test case for GitHub issue 40471 + INFER_OK(op, "[?,10,1];[?,1,4]", "[?,d0_1,d1_2]"); + INFER_OK(op, "[10,?,1];[1,?,4]", "[d0_0,?,d1_2]"); +} } // end namespace tensorflow diff --git a/tensorflow/python/autograph/operators/logical_test.py b/tensorflow/python/autograph/operators/logical_test.py index 0eab302a825..e22f39932d1 100644 --- a/tensorflow/python/autograph/operators/logical_test.py +++ b/tensorflow/python/autograph/operators/logical_test.py @@ -19,9 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.autograph.operators import logical -from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op -from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.platform import test @@ -85,18 +83,6 @@ class LogicalOperatorsTest(test.TestCase): t = logical.not_(self._tf_false()) self.assertEqual(self.evaluate(t), True) - # Test case for GitHub issue 40471 - def test_equal_output_shapes(self): - - @def_function.function(input_signature=[ - tensor_spec.TensorSpec([None, 10, 1]), - tensor_spec.TensorSpec([None, 1, 4])]) - def f(x, y): - z = x == y - return z - - self.assertAllEqual(f.get_concrete_function().output_shapes, [None, 10, 4]) - if __name__ == '__main__': test.main() From 653131dd38e9bbde2b9163d756ca4d9cfa69e1a5 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 17 Jun 2020 11:32:39 -0700 Subject: [PATCH 0418/1390] Remove automatic control dep wrapping from layers in v2. 
PiperOrigin-RevId: 316929712 Change-Id: Ic1a7d125776eeb0c7654e321dd6f2351c8656a16 --- tensorflow/python/keras/engine/base_layer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 5ddce951491..fbec5382a08 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -40,7 +40,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import execute from tensorflow.python.eager import function from tensorflow.python.eager import monitoring -from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -1105,17 +1104,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): try: with ops.enable_auto_cast_variables(self._compute_dtype_object): - # Add auto_control_deps in V2 when they are not already added by - # a `tf.function`. - if (ops.executing_eagerly_outside_functions() and - not base_layer_utils.is_in_eager_or_tf_function()): - with auto_control_deps.AutomaticControlDependencies() as acd: - outputs = call_fn(cast_inputs, *args, **kwargs) - # Wrap Tensors in `outputs` in `tf.identity` to avoid - # circular dependencies. - outputs = base_layer_utils.mark_as_return(outputs, acd) - else: - outputs = call_fn(cast_inputs, *args, **kwargs) + outputs = call_fn(cast_inputs, *args, **kwargs) except errors.OperatorNotAllowedInGraphError as e: raise TypeError('You are attempting to use Python control ' From f24487e6190eb4637f5e4988737cbe17e91a231a Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Wed, 17 Jun 2020 11:38:22 -0700 Subject: [PATCH 0419/1390] Disabling test until bug fix lands. PiperOrigin-RevId: 316930923 Change-Id: I8aa57e5627fa649474c469bd5c92a713c4d9bd75 --- tensorflow/c/experimental/saved_model/core/ops/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index b42e93c3716..aa909c692ca 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -77,6 +77,9 @@ tf_cc_test( srcs = [ "variable_ops_test.cc", ], + tags = [ + "no_windows", # TODO(b/159210739): Remove this tag after fixing the bug. + ], deps = [ ":owned_eager_context", ":owned_tensor", From 8d5171bad7374206cb5389ec1df83ed0189fdf6d Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Wed, 17 Jun 2020 12:35:43 -0700 Subject: [PATCH 0420/1390] Split Abstract interfaces into Abstract and ImmediateExecution interfaces. The Abstract interfaces are shared with tracing mode. Introduce an AbstractFunction which handles the conversion between MLIR function and FunctionDef and the runtime can query whichever representation is suitable. Right now this only supports GetFunctionDef but an API for fetching the MLIR function directly will be added in future changes. 
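Purely as an illustrative sketch (not part of this change), a consumer holding an AbstractFunction could ask for its FunctionDef form and then make the function callable through a context. The helper name RegisterTracedFunction is hypothetical, and the ownership comment reflects an assumption rather than a documented contract.

    #include "tensorflow/c/eager/abstract_context.h"
    #include "tensorflow/c/eager/abstract_function.h"
    #include "tensorflow/core/platform/errors.h"
    #include "tensorflow/core/platform/logging.h"

    namespace tensorflow {

    // Hypothetical helper: fetches the FunctionDef representation of a traced
    // function and registers the function with the given context so that it
    // can later be referenced by name there.
    Status RegisterTracedFunction(AbstractContext* ctx, AbstractFunction* func) {
      FunctionDef* fdef = nullptr;
      TF_RETURN_IF_ERROR(func->GetFunctionDef(&fdef));
      // `fdef` is assumed to stay owned by `func`; it is only inspected here.
      VLOG(1) << "Registering traced function " << fdef->signature().name();
      return ctx->RegisterFunction(func);
    }

    }  // namespace tensorflow

Either a tracing context or an immediate-execution context can sit behind the AbstractContext pointer, which is the point of the interface split described above.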
PiperOrigin-RevId: 316942774 Change-Id: I1abebbe853b98dd0048bab9fc092252f4caf3d1b --- tensorflow/c/c_api_experimental.cc | 2 +- tensorflow/c/eager/BUILD | 98 +++++++++++++++---- tensorflow/c/eager/abstract_context.h | 69 +++++++++++++ tensorflow/c/eager/abstract_function.h | 46 +++++++++ ...ation_interface.h => abstract_operation.h} | 47 ++++----- tensorflow/c/eager/abstract_tensor_handle.h | 45 +++++++++ tensorflow/c/eager/c_api.cc | 21 ++-- tensorflow/c/eager/c_api_experimental.cc | 2 +- ...erface.h => immediate_execution_context.h} | 36 +++---- .../c/eager/immediate_execution_operation.h | 53 ++++++++++ ....h => immediate_execution_tensor_handle.h} | 22 ++--- tensorflow/c/eager/tfe_context_internal.h | 4 +- tensorflow/c/eager/tfe_op_internal.h | 6 +- .../c/eager/tfe_tensorhandle_internal.h | 6 +- .../c/experimental/saved_model/core/BUILD | 4 +- .../saved_model/core/concrete_function.cc | 4 +- .../saved_model/core/concrete_function.h | 10 +- .../c/experimental/saved_model/core/ops/BUILD | 11 ++- .../core/ops/owned_eager_context.h | 10 +- .../saved_model/core/ops/owned_eager_op.h | 10 +- .../core/ops/owned_tensor_handle.h | 6 +- .../saved_model/core/ops/variable_ops.cc | 33 ++++--- .../saved_model/core/ops/variable_ops.h | 20 ++-- .../c/experimental/saved_model/internal/BUILD | 4 +- .../saved_model/internal/tensorhandle_list.cc | 2 +- .../internal/tensorhandle_list_type.h | 4 +- tensorflow/core/common_runtime/eager/BUILD | 18 ++-- .../core/common_runtime/eager/context.cc | 2 - .../core/common_runtime/eager/context.h | 16 +-- tensorflow/core/common_runtime/eager/core.cc | 23 +++-- .../common_runtime/eager/eager_operation.cc | 16 +-- .../common_runtime/eager/eager_operation.h | 18 ++-- .../common_runtime/eager/tensor_handle.cc | 2 +- .../core/common_runtime/eager/tensor_handle.h | 12 +-- tensorflow/python/eager/pywrap_tfe_src.cc | 4 +- 35 files changed, 489 insertions(+), 197 deletions(-) create mode 100644 tensorflow/c/eager/abstract_context.h create mode 100644 tensorflow/c/eager/abstract_function.h rename tensorflow/c/eager/{operation_interface.h => abstract_operation.h} (80%) create mode 100644 tensorflow/c/eager/abstract_tensor_handle.h rename tensorflow/c/eager/{context_interface.h => immediate_execution_context.h} (78%) create mode 100644 tensorflow/c/eager/immediate_execution_operation.h rename tensorflow/c/eager/{tensor_handle_interface.h => immediate_execution_tensor_handle.h} (74%) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index e9e6d470c68..831c6a0ad40 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -624,7 +624,7 @@ void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, const int num_inputs = input_shapes->num_items; NodeDef node_def; - tensorflow::AbstractOperationInterface* op = tensorflow::unwrap(tfe_op); + tensorflow::ImmediateExecutionOperation* op = tensorflow::unwrap(tfe_op); node_def.set_name(op->Name()); node_def.set_op(op->Name()); for (int i = 0; i < num_inputs; ++i) { diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 9d3c79e0ae7..5f7ab4a1f59 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -38,9 +38,10 @@ tf_cuda_library( "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ - ":context_interface", - ":operation_interface", - ":tensor_handle_interface", + ":immediate_execution_context", + ":immediate_execution_operation", + ":immediate_execution_tensor_handle", + ":abstract_tensor_handle", 
":tfe_context_internal", ":tfe_cancellation_manager_internal", ":tfe_executor_internal", @@ -101,13 +102,17 @@ tf_cuda_library( filegroup( name = "pywrap_required_hdrs", srcs = [ + "abstract_context.h", + "abstract_function.h", + "abstract_operation.h", + "abstract_tensor_handle.h", "c_api_experimental.h", "c_api_internal.h", "c_api_unified_experimental.h", - "context_interface.h", "dlpack.h", - "operation_interface.h", - "tensor_handle_interface.h", + "immediate_execution_context.h", + "immediate_execution_operation.h", + "immediate_execution_tensor_handle.h", "tfe_cancellation_manager_internal.h", "tfe_executor_internal.h", "tfe_monitoring_internal.h", @@ -163,12 +168,22 @@ cc_library( ) cc_library( - name = "tensor_handle_interface", - hdrs = ["tensor_handle_interface.h"], + name = "abstract_tensor_handle", + hdrs = ["abstract_tensor_handle.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [], +) + +cc_library( + name = "immediate_execution_tensor_handle", + hdrs = ["immediate_execution_tensor_handle.h"], visibility = [ "//tensorflow:internal", ], deps = [ + ":abstract_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -177,13 +192,13 @@ cc_library( ) cc_library( - name = "operation_interface", - hdrs = ["operation_interface.h"], + name = "abstract_operation", + hdrs = ["abstract_operation.h"], visibility = [ "//tensorflow:internal", ], deps = [ - ":tensor_handle_interface", + ":abstract_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -193,14 +208,58 @@ cc_library( ) cc_library( - name = "context_interface", - hdrs = ["context_interface.h"], + name = "immediate_execution_operation", + hdrs = ["immediate_execution_operation.h"], visibility = [ "//tensorflow:internal", ], deps = [ - ":operation_interface", - ":tensor_handle_interface", + ":abstract_operation", + ":abstract_tensor_handle", + ":immediate_execution_tensor_handle", + "//tensorflow/c:tensor_interface", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "abstract_context", + hdrs = ["abstract_context.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_function", + ":abstract_operation", + ], +) + +cc_library( + name = "abstract_function", + hdrs = ["abstract_function.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c:c_api", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:status", + ], +) + +cc_library( + name = "immediate_execution_context", + hdrs = ["immediate_execution_context.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_context", + ":immediate_execution_operation", + ":immediate_execution_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -217,7 +276,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":context_interface", + ":immediate_execution_context", "//tensorflow/c:conversion_macros", ], ) @@ -277,7 +336,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":operation_interface", + ":immediate_execution_operation", "//tensorflow/c:conversion_macros", ], ) @@ -300,7 +359,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":tensor_handle_interface", + ":immediate_execution_tensor_handle", "//tensorflow/c:conversion_macros", ], ) @@ -480,6 +539,9 @@ tf_cuda_library( ":tfe_context_internal", 
":tfe_op_internal", ":tfe_tensorhandle_internal", + ":abstract_operation", + ":abstract_context", + ":abstract_tensor_handle", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu", diff --git a/tensorflow/c/eager/abstract_context.h b/tensorflow/c/eager/abstract_context.h new file mode 100644 index 00000000000..59c726349ac --- /dev/null +++ b/tensorflow/c/eager/abstract_context.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ + +#include + +#include "tensorflow/c/eager/abstract_function.h" +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { + +// Abstract interface to a context. +// +// This serves as a factory for creating `AbstractOperation`s and for +// registering traced functions. +// Operations creation within a context can only be executed in that context +// (for now at least). +// Implementations of the context may contain some state e.g. an execution +// environment, a traced representation etc. +class AbstractContext { + protected: + enum AbstractContextKind { kTracing, kImmediateExecution }; + explicit AbstractContext(AbstractContextKind kind) : kind_(kind) {} + virtual ~AbstractContext() {} + + public: + AbstractContextKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus clients MUST call Release() in order to + // destroy an instance of this class. + virtual void Release() = 0; + + // Creates an operation builder and ties it to this context. + // The returned object can be used for setting operation's attributes, + // adding inputs and finally executing (immediately or lazily as in tracing) + // it in this context. + virtual AbstractOperation* CreateOperation() = 0; + + // Registers a function with this context, after this the function is + // available to be called/referenced by its name in this context. + virtual Status RegisterFunction(AbstractFunction*) = 0; + // Remove a function. 'func' argument is the name of a previously added + // FunctionDef. The name is in fdef.signature.name. + virtual Status RemoveFunction(const string& func) = 0; + + private: + const AbstractContextKind kind_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ diff --git a/tensorflow/c/eager/abstract_function.h b/tensorflow/c/eager/abstract_function.h new file mode 100644 index 00000000000..e322b31f2b4 --- /dev/null +++ b/tensorflow/c/eager/abstract_function.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ + +#include "tensorflow/c/c_api.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// A traced function: this hides the complexity of converting the serialized +// representation between various supported formats e.g. FunctionDef and Mlir +// function. +class AbstractFunction { + protected: + enum AbstractFunctionKind { kGraphFunc, kMlirFunc }; + explicit AbstractFunction(AbstractFunctionKind kind) : kind_(kind) {} + + public: + // Returns which subclass is this instance of. + AbstractFunctionKind getKind() const { return kind_; } + virtual ~AbstractFunction() = default; + + // Returns the AbstractFunction as a FunctionDef. + virtual Status GetFunctionDef(FunctionDef**) = 0; + + private: + const AbstractFunctionKind kind_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ diff --git a/tensorflow/c/eager/operation_interface.h b/tensorflow/c/eager/abstract_operation.h similarity index 80% rename from tensorflow/c/eager/operation_interface.h rename to tensorflow/c/eager/abstract_operation.h index 844ba6c14bd..da4b6ecb75e 100644 --- a/tensorflow/c/eager/operation_interface.h +++ b/tensorflow/c/eager/abstract_operation.h @@ -12,24 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ #include "absl/types/span.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" -#include "tensorflow/core/framework/device_attributes.pb.h" -#include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" -struct TFE_Op; - namespace tensorflow { // Abstract interface to an operation. -class AbstractOperationInterface { +// This interface allows building and executing an operation in either +// tracing or immediate execution mode. +class AbstractOperation { + protected: + enum AbstractOperationKind { kTracing, kImmediateExecution }; + explicit AbstractOperation(AbstractOperationKind kind) : kind_(kind) {} + virtual ~AbstractOperation() {} + public: + AbstractOperationKind getKind() const { return kind_; } + // Release any underlying resources, including the interface object. 
// // WARNING: The destructor of this class is marked as protected to disallow @@ -38,7 +43,6 @@ class AbstractOperationInterface { // clients MUST call Release() in order to destroy an instance of this class. virtual void Release() = 0; - virtual void Clear() = 0; virtual Status Reset(const char* op, const char* raw_device_name) = 0; virtual const string& Name() const = 0; @@ -66,12 +70,10 @@ class AbstractOperationInterface { // existing and given constraints will be performed. virtual Status SetDeviceName(const char* name) = 0; - virtual Status AddInput(AbstractTensorHandleInterface* input) = 0; - virtual Status AddInputList( - absl::Span inputs) = 0; - virtual Status Execute(absl::Span retvals, + virtual Status AddInput(AbstractTensorHandle* input) = 0; + virtual Status AddInputList(absl::Span inputs) = 0; + virtual Status Execute(absl::Span retvals, int* num_retvals) = 0; - virtual const tensorflow::OpDef* OpDef() const = 0; virtual Status SetAttrString(const char* attr_name, const char* data, size_t length) = 0; @@ -82,7 +84,7 @@ class AbstractOperationInterface { virtual Status SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) = 0; virtual Status SetAttrFunction(const char* attr_name, - const AbstractOperationInterface* value) = 0; + const AbstractOperation* value) = 0; virtual Status SetAttrFunctionName(const char* attr_name, const char* value, size_t length) = 0; virtual Status SetAttrTensor(const char* attr_name, @@ -102,19 +104,12 @@ class AbstractOperationInterface { virtual Status SetAttrShapeList(const char* attr_name, const int64_t** dims, const int* num_dims, int num_values) = 0; virtual Status SetAttrFunctionList( - const char* attr_name, - absl::Span values) = 0; + const char* attr_name, absl::Span values) = 0; - virtual Status InputLength(const char* input_name, int* length) = 0; - virtual Status OutputLength(const char* output_name, int* length) = 0; - - // Experimental - virtual Status SetUseXla(bool enable) = 0; - - protected: - virtual ~AbstractOperationInterface() {} + private: + const AbstractOperationKind kind_; }; } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ diff --git a/tensorflow/c/eager/abstract_tensor_handle.h b/tensorflow/c/eager/abstract_tensor_handle.h new file mode 100644 index 00000000000..14acac29bb9 --- /dev/null +++ b/tensorflow/c/eager/abstract_tensor_handle.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ + +namespace tensorflow { + +// Abstract interface to a Tensor handle in either tracing or immediate +// execution mode. 
+class AbstractTensorHandle { + protected: + enum AbstractTensorHandleKind { kTracing, kImmediateExecution }; + explicit AbstractTensorHandle(AbstractTensorHandleKind kind) : kind_(kind) {} + virtual ~AbstractTensorHandle() {} + + public: + AbstractTensorHandleKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus this must be allocated on the heap and + // clients MUST call Release() in order to destroy an instance of this class. + virtual void Release() = 0; + + private: + const AbstractTensorHandleKind kind_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index fdc91675f8b..4be3cdd7c2d 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -21,6 +21,8 @@ limitations under the License. #include #include +#include "tensorflow/c/eager/abstract_tensor_handle.h" + // clang-format off #include "tensorflow/core/platform/platform.h" // clang-format on @@ -31,8 +33,8 @@ limitations under the License. #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/eager/tfe_context_internal.h" #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" @@ -1119,7 +1121,7 @@ size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h, TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { - tensorflow::AbstractOperationInterface* new_op = + tensorflow::ImmediateExecutionOperation* new_op = tensorflow::unwrap(ctx)->CreateOperation(); status->status = new_op->Reset(op_or_function_name, nullptr); if (!status->status.ok()) { @@ -1164,7 +1166,9 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status) { status->status = tensorflow::unwrap(op)->AddInputList( - {tensorflow::unwrap(inputs), static_cast(num_inputs)}); + {reinterpret_cast( + tensorflow::unwrap(inputs)), + static_cast(num_inputs)}); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, @@ -1324,7 +1328,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, const TFE_Op** value, int num_values) { auto s = tensorflow::unwrap(op)->SetAttrFunctionList( - attr_name, {tensorflow::unwrap(value), static_cast(num_values)}); + attr_name, {reinterpret_cast( + tensorflow::unwrap(value)), + static_cast(num_values)}); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1368,7 +1374,10 @@ TF_CAPI_EXPORT extern int TFE_OpGetOutputLength(TFE_Op* op, void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { status->status = tensorflow::unwrap(op)->Execute( - absl::MakeSpan(tensorflow::unwrap(retvals), *num_retvals), num_retvals); + absl::MakeSpan(reinterpret_cast( + 
tensorflow::unwrap(retvals)), + *num_retvals), + num_retvals); } TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 0d71b11531b..9937fd7551f 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -38,7 +38,7 @@ using tensorflow::string; void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, const char* raw_device_name, TF_Status* status) { if (op_to_reset) { - tensorflow::AbstractOperationInterface* op = + tensorflow::ImmediateExecutionOperation* op = tensorflow::unwrap(op_to_reset); op->Clear(); status->status = op->Reset(op_or_function_name, raw_device_name); diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/immediate_execution_context.h similarity index 78% rename from tensorflow/c/eager/context_interface.h rename to tensorflow/c/eager/immediate_execution_context.h index e5a770a6826..0e3fe8cd4e1 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ #include #include "absl/types/optional.h" #include "absl/types/span.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/numeric_types.h" @@ -34,16 +35,9 @@ namespace tensorflow { // // A context is responsible for creating key objects such as Tensors, // TensorHandles & Operations. -class AbstractContextInterface { +class ImmediateExecutionContext : public AbstractContext { public: - // Release any underlying resources, including the interface object. - // - // WARNING: The destructor of this class is marked as protected to disallow - // clients from directly destroying this object since it may manage it's own - // lifetime through ref counting. Thus clients MUST call Release() in order to - // destroy an instance of this class. - virtual void Release() = 0; - + static constexpr AbstractContextKind kKind = kImmediateExecution; // Optimized scalar creation functions virtual AbstractTensorInterface* CreateInt64Scalar(int64 value) = 0; virtual AbstractTensorInterface* CreateUint64Scalar(uint64 value) = 0; @@ -74,15 +68,15 @@ class AbstractContextInterface { void* memory_releaser_arg) = 0; // Create a handle to wrap and manage a Tensor - virtual AbstractTensorHandleInterface* CreateLocalHandle( + virtual ImmediateExecutionTensorHandle* CreateLocalHandle( AbstractTensorInterface* t) = 0; // Copy the handle to another device. 
- virtual AbstractTensorHandleInterface* CopyTensorHandleToDevice( - AbstractTensorHandleInterface* handle, const char* device_name, + virtual ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, Status* status) = 0; // Create an operation to perform op execution - virtual AbstractOperationInterface* CreateOperation() = 0; + ImmediateExecutionOperation* CreateOperation() override = 0; // Returns whether the runtime is backed by TFRT or the legacy TF Eager // Runtime. This is necessary to decouple runtime-dependent @@ -107,14 +101,12 @@ class AbstractContextInterface { // be executed as an op. Return error if the function with the same name // already exists. virtual Status AddFunctionDef(const FunctionDef& fdef) = 0; - // Remove a function. 'func' argument is the name of a previously added - // FunctionDef. The name is in fdef.signature.name. - virtual Status RemoveFunction(const string& func) = 0; protected: - virtual ~AbstractContextInterface() {} + ImmediateExecutionContext() : AbstractContext(kKind) {} + ~ImmediateExecutionContext() override {} }; } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ diff --git a/tensorflow/c/eager/immediate_execution_operation.h b/tensorflow/c/eager/immediate_execution_operation.h new file mode 100644 index 00000000000..31413b5b4b9 --- /dev/null +++ b/tensorflow/c/eager/immediate_execution_operation.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/status.h" + +struct TFE_Op; + +namespace tensorflow { + +// Abstract interface to an operation. 
+class ImmediateExecutionOperation : public AbstractOperation { + public: + static constexpr AbstractOperationKind kKind = kImmediateExecution; + virtual void Clear() = 0; + + virtual const tensorflow::OpDef* OpDef() const = 0; + + virtual Status InputLength(const char* input_name, int* length) = 0; + virtual Status OutputLength(const char* output_name, int* length) = 0; + + // Experimental + virtual Status SetUseXla(bool enable) = 0; + + protected: + ImmediateExecutionOperation() : AbstractOperation(kKind) {} + ~ImmediateExecutionOperation() override {} +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ diff --git a/tensorflow/c/eager/tensor_handle_interface.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h similarity index 74% rename from tensorflow/c/eager/tensor_handle_interface.h rename to tensorflow/c/eager/immediate_execution_tensor_handle.h index 1ca40daec41..1f5a77e54ee 100644 --- a/tensorflow/c/eager/tensor_handle_interface.h +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" @@ -30,15 +31,9 @@ namespace tensorflow { // files. The interface lists the common functionality that must be provided by // any concrete implementation. However, in cases where the true concrete class // is needed a static_cast can be applied. -class AbstractTensorHandleInterface { +class ImmediateExecutionTensorHandle : public AbstractTensorHandle { public: - // Release any underlying resources, including the interface object. - // - // WARNING: The destructor of this class is marked as protected to disallow - // clients from directly destroying this object since it may manage it's own - // lifetime through ref counting. Thus this must be allocated on the heap and - // clients MUST call Release() in order to destroy an instance of this class. - virtual void Release() = 0; + static constexpr AbstractTensorHandleKind kKind = kImmediateExecution; // Returns tensor dtype. virtual tensorflow::DataType DataType() const = 0; @@ -57,12 +52,13 @@ class AbstractTensorHandleInterface { virtual AbstractTensorInterface* Resolve(Status* status) = 0; // Return a copy of the handle. 
- virtual AbstractTensorHandleInterface* Copy() = 0; + virtual ImmediateExecutionTensorHandle* Copy() = 0; protected: - virtual ~AbstractTensorHandleInterface() {} + ImmediateExecutionTensorHandle() : AbstractTensorHandle(kKind) {} + ~ImmediateExecutionTensorHandle() override {} }; } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/eager/tfe_context_internal.h b/tensorflow/c/eager/tfe_context_internal.h index 1d29bee9ee3..1f2035317fa 100644 --- a/tensorflow/c/eager/tfe_context_internal.h +++ b/tensorflow/c/eager/tfe_context_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/immediate_execution_context.h" // Wraps a pointer to a context implementation. // @@ -28,7 +28,7 @@ typedef struct TFE_Context TFE_Context; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractContextInterface, TFE_Context); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionContext, TFE_Context); } // namespace tensorflow diff --git a/tensorflow/c/eager/tfe_op_internal.h b/tensorflow/c/eager/tfe_op_internal.h index 6ca7f741d16..3fe94d358b6 100644 --- a/tensorflow/c/eager/tfe_op_internal.h +++ b/tensorflow/c/eager/tfe_op_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/operation_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" // Wraps a pointer to an operation implementation. // @@ -28,8 +28,8 @@ typedef struct TFE_Op TFE_Op; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface, TFE_Op); -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface*, TFE_Op*); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation, TFE_Op); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation*, TFE_Op*); } // namespace tensorflow diff --git a/tensorflow/c/eager/tfe_tensorhandle_internal.h b/tensorflow/c/eager/tfe_tensorhandle_internal.h index 543e5f1d932..308e8c24e2c 100644 --- a/tensorflow/c/eager/tfe_tensorhandle_internal.h +++ b/tensorflow/c/eager/tfe_tensorhandle_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" // Wraps a pointer to a tensor handle implementation. 
// @@ -28,9 +28,9 @@ typedef struct TFE_TensorHandle TFE_TensorHandle; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface, +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle, TFE_TensorHandle); -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface*, +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle*, TFE_TensorHandle*); } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 2e817ed02e0..dbe1b6d656c 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -23,8 +23,8 @@ cc_library( ], deps = [ ":function_metadata", - "//tensorflow/c/eager:operation_interface", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:protos_all_cc", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.cc b/tensorflow/c/experimental/saved_model/core/concrete_function.cc index d5da2ca9bf4..41bae4352fc 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.cc @@ -15,12 +15,12 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" namespace tensorflow { -const std::vector& +const std::vector& ConcreteFunction::GetCaptures() const { return captures_; } diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index 6f8a5375277..22535641ef5 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/core/framework/function.pb.h" @@ -38,15 +38,15 @@ class ConcreteFunction { virtual ~ConcreteFunction() = 0; // This method returns the "Call" Op used to execute the function. 
- virtual AbstractOperationInterface* GetCallOp() = 0; + virtual ImmediateExecutionOperation* GetCallOp() = 0; - const std::vector& GetCaptures() + const std::vector& GetCaptures() const; const FunctionMetadata& GetFunctionMetadata() const; private: FunctionMetadata metadata_; - std::vector captures_; + std::vector captures_; FunctionDef* function_; }; diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index aa909c692ca..8c4c41c6d75 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -20,7 +20,7 @@ cc_library( "owned_eager_op.h", ], deps = [ - "//tensorflow/c/eager:operation_interface", + "//tensorflow/c/eager:immediate_execution_operation", ], ) @@ -30,7 +30,7 @@ cc_library( "owned_tensor_handle.h", ], deps = [ - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core/common_runtime/eager:tensor_handle", ], ) @@ -39,7 +39,7 @@ cc_library( name = "owned_eager_context", hdrs = ["owned_eager_context.h"], deps = [ - "//tensorflow/c/eager:context_interface", + "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/core/common_runtime/eager:context", ], ) @@ -63,8 +63,9 @@ cc_library( deps = [ ":owned_eager_op", ":owned_tensor_handle", - "//tensorflow/c/eager:context_interface", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h index 300059cd069..d944fcb51a2 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h @@ -18,14 +18,14 @@ limitations under the License. #include -#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/core/common_runtime/eager/context.h" namespace tensorflow { namespace internal { -struct AbstractContextInterfaceDeleter { - void operator()(AbstractContextInterface* p) const { +struct ImmediateExecutionContextDeleter { + void operator()(ImmediateExecutionContext* p) const { if (p != nullptr) { p->Release(); } @@ -43,8 +43,8 @@ struct EagerContextDeleter { } // namespace internal using AbstractContextPtr = - std::unique_ptr; + std::unique_ptr; using EagerContextPtr = std::unique_ptr; diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h index c6b21578820..b3a08334a97 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h @@ -18,13 +18,13 @@ limitations under the License. 
#include -#include "tensorflow/c/eager/operation_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" namespace tensorflow { namespace internal { -struct AbstractOperationInterfaceDeleter { - void operator()(AbstractOperationInterface* p) const { +struct ImmediateExecutionOperationDeleter { + void operator()(ImmediateExecutionOperation* p) const { if (p != nullptr) { p->Release(); } @@ -34,8 +34,8 @@ struct AbstractOperationInterfaceDeleter { } // namespace internal using AbstractOpPtr = - std::unique_ptr; + std::unique_ptr; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h index e98d6554afb..c52ebaa2479 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h +++ b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" namespace tensorflow { @@ -33,7 +33,7 @@ struct TensorHandleDeleter { }; struct AbstractTensorHandleDeleter { - void operator()(AbstractTensorHandleInterface* p) const { + void operator()(ImmediateExecutionTensorHandle* p) const { if (p != nullptr) { p->Release(); } @@ -46,7 +46,7 @@ using TensorHandlePtr = std::unique_ptr; using AbstractTensorHandlePtr = - std::unique_ptr; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index a3b3ace7be9..eb06662722e 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -16,7 +16,8 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h" #include "absl/types/span.h" -#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h" #include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -32,7 +33,7 @@ namespace internal { static const char kNoSharingResourceID[] = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; -Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, +Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, AbstractTensorHandlePtr* handle) { AbstractOpPtr varhandle_op = AbstractOpPtr(ctx->CreateOperation()); @@ -50,17 +51,20 @@ Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, TF_RETURN_IF_ERROR(varhandle_op->SetAttrString( "shared_name", kNoSharingResourceID, strlen(kNoSharingResourceID))); - AbstractTensorHandleInterface* var_handle = nullptr; + AbstractTensorHandle* var_handle = nullptr; int num_retvals = 1; TF_RETURN_IF_ERROR(varhandle_op->Execute( absl::MakeSpan(&var_handle, num_retvals), &num_retvals)); - handle->reset(var_handle); + if (var_handle->getKind() != ImmediateExecutionTensorHandle::kKind) { + return errors::Internal("Unexpected tensor handle kind."); + } + handle->reset(reinterpret_cast(var_handle)); return Status(); } -Status AssignVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandleInterface* value) { +Status AssignVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateExecutionTensorHandle* value) { AbstractOpPtr assign_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(assign_op->Reset("AssignVariableOp", nullptr)); TF_RETURN_IF_ERROR(assign_op->SetAttrType("dtype", dtype)); @@ -72,24 +76,27 @@ Status AssignVariable(AbstractContextInterface* ctx, return Status(); } -Status ReadVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, +Status ReadVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, DataType dtype, AbstractTensorHandlePtr* output) { AbstractOpPtr read_op = AbstractOpPtr(ctx->CreateOperation()); TF_RETURN_IF_ERROR(read_op->Reset("ReadVariableOp", nullptr)); TF_RETURN_IF_ERROR(read_op->SetAttrType("dtype", dtype)); TF_RETURN_IF_ERROR(read_op->AddInput(variable_handle)); - AbstractTensorHandleInterface* value = nullptr; + AbstractTensorHandle* value = nullptr; int num_retvals = 1; TF_RETURN_IF_ERROR( read_op->Execute(absl::MakeSpan(&value, num_retvals), &num_retvals)); - output->reset(value); + if (value->getKind() != ImmediateExecutionTensorHandle::kKind) { + return errors::Internal("Unexpected tensor handle kind."); + } + output->reset(reinterpret_cast(value)); return Status(); } -Status DestroyResource(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* handle) { +Status DestroyResource(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* handle) { AbstractOpPtr destroy_op = AbstractOpPtr(ctx->CreateOperation()); TF_RETURN_IF_ERROR(destroy_op->Reset("DestroyResourceOp", nullptr)); TF_RETURN_IF_ERROR(destroy_op->SetAttrBool("ignore_lookup_error", true)); diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h 
b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h index 8a410328b9e..038b2c3d62a 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H -#include "tensorflow/c/eager/context_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" @@ -30,7 +30,7 @@ namespace internal { // TensorHandle associated with the variable. This is equivalent to creating an // unitialized TF2 tf.Variable. // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 -Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, +Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, AbstractTensorHandlePtr* handle); @@ -39,22 +39,22 @@ Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, // underlying variable for `variable_handle`. Note that it is illegal to assign // a variable to a Tensor with a different dtype than what the variable was // created with. -Status AssignVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandleInterface* value); +Status AssignVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateExecutionTensorHandle* value); // Executes a ReadVariableOp using `ctx`. This reads the underlying variable // value of `variable_handle` and copies the value to `output`. `dtype` must be // the dtype of the variable associated with `variable_handle`. -Status ReadVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, +Status ReadVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, DataType dtype, AbstractTensorHandlePtr* output); // Executes DestroyResourceOp on `handle`, using `ctx`. 
This is equivalent to // the cleanup that occurs in a tf.Variable's EagerResourceDeleter: // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L289-L290 -Status DestroyResource(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* handle); +Status DestroyResource(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* handle); } // namespace internal } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 72474940c16..888c284bb12 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -178,7 +178,7 @@ cc_library( ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/eager:c_api", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/c/eager:tfe_tensorhandle_internal", ], ) @@ -190,7 +190,7 @@ cc_library( ], deps = [ "//tensorflow/c:conversion_macros", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", ], ) diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc index 7d018658101..c8f00c1f7c0 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h index 8cbec2806a8..566417df025 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -19,7 +19,7 @@ limitations under the License. #include #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" // Internal structures used by the SavedModel C API. These are likely to // change and should not be depended on. 
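Two small idioms recur in the variable-ops code above: a raw handle coming back from Execute() is checked with getKind() against ImmediateExecutionTensorHandle::kKind before being cast down, and ownership is held in std::unique_ptr aliases whose deleters call Release() rather than delete (the owned_eager_*.h headers). A self-contained sketch of both, using simplified stand-in types rather than the real TensorFlow classes:

```cpp
#include <cstdio>
#include <memory>

// Simplified stand-ins; only the idiom is shown, not the real interfaces.
enum HandleKind { kImmediateExecution, kGraph };

class BaseHandle {
 public:
  explicit BaseHandle(HandleKind kind) : kind_(kind) {}
  virtual ~BaseHandle() = default;
  HandleKind getKind() const { return kind_; }

 private:
  HandleKind kind_;
};

class EagerHandle : public BaseHandle {
 public:
  static constexpr HandleKind kKind = kImmediateExecution;
  EagerHandle() : BaseHandle(kKind) {}
  // Clients release handles instead of deleting them; a real implementation
  // would drop a reference count here.
  void Release() { delete this; }
};

// unique_ptr alias whose deleter calls Release() instead of delete.
struct EagerHandleDeleter {
  void operator()(EagerHandle* p) const {
    if (p != nullptr) p->Release();
  }
};
using EagerHandlePtr = std::unique_ptr<EagerHandle, EagerHandleDeleter>;

int main() {
  BaseHandle* result = new EagerHandle();  // what an abstract Execute() hands back
  if (result->getKind() != EagerHandle::kKind) {
    std::printf("unexpected handle kind\n");
    return 1;
  }
  // The kind check makes the downcast safe; ownership then goes into the
  // Release()-aware smart pointer.
  EagerHandlePtr owned(static_cast<EagerHandle*>(result));
  std::printf("kind ok\n");
}
```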
@@ -29,7 +29,7 @@ typedef struct TF_TensorHandleList TF_TensorHandleList; namespace tensorflow { DEFINE_CONVERSION_FUNCTIONS( - std::vector, + std::vector, TF_TensorHandleList) } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index e4f4c483209..fb69bcb7ab5 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -32,6 +32,8 @@ tf_cuda_library( ":tensor_handle", "//tensorflow/c:c_api_internal", "//tensorflow/c:tf_tensor_internal", + "//tensorflow/c/eager:abstract_function", + "//tensorflow/core/platform:errors", ], alwayslink = 1, ) @@ -74,9 +76,9 @@ tf_cuda_library( ":kernel_and_device", "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_tensor_internal", - "//tensorflow/c/eager:context_interface", - "//tensorflow/c/eager:tensor_handle_interface", - "//tensorflow/c/eager:operation_interface", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface", "//tensorflow/core/distributed_runtime:worker_env", ] + select({ @@ -137,8 +139,10 @@ tf_cuda_library( "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", "//tensorflow/c:tf_tensor_internal", - "//tensorflow/c/eager:operation_interface", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core:lib", @@ -211,7 +215,7 @@ tf_cuda_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:variant", "//tensorflow/c:tf_tensor_internal", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -496,6 +500,8 @@ cc_library( "//tensorflow/c:tf_tensor_internal", "//tensorflow/compiler/jit:common", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/c/eager:abstract_function", + "//tensorflow/core/platform:errors", ] + select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 970c2bcbb89..6dc0a3a8200 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -30,8 +30,6 @@ limitations under the License. #include "tensorflow/c/tf_tensor.h" #include "tensorflow/c/tf_tensor_internal.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" #include "tensorflow/core/common_runtime/collective_executor_mgr.h" #include "tensorflow/core/common_runtime/collective_param_resolver_local.h" #include "tensorflow/core/common_runtime/colocation_graph.h" diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index cb6d09f8f1d..141327c08cb 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -33,7 +33,7 @@ limitations under the License. 
#include "absl/types/optional.h" #include "absl/container/flat_hash_map.h" -#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -135,7 +135,7 @@ class CustomDevice { // TensorHandles may be placed either on custom or physical devices. using VariantDevice = absl::variant; -class EagerContext : public AbstractContextInterface, public core::RefCounted { +class EagerContext : public ImmediateExecutionContext, public core::RefCounted { public: static constexpr uint64 kInvalidContextId = 0; @@ -178,12 +178,14 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { MemoryReleaser memory_releaser, void* memory_releaser_arg) override; - AbstractTensorHandleInterface* CreateLocalHandle( + ImmediateExecutionTensorHandle* CreateLocalHandle( AbstractTensorInterface* t) override; - AbstractTensorHandleInterface* CopyTensorHandleToDevice( - AbstractTensorHandleInterface* handle, const char* device_name, + ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, Status* status) override; - AbstractOperationInterface* CreateOperation() override; + ImmediateExecutionOperation* CreateOperation() override; + + Status RegisterFunction(AbstractFunction* f) override; bool UsesTFRT() override; @@ -716,7 +718,7 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { std::function resource_deallocator_ = nullptr; }; -inline EagerContext* ContextFromInterface(AbstractContextInterface* context) { +inline EagerContext* ContextFromInterface(ImmediateExecutionContext* context) { return down_cast(context); } diff --git a/tensorflow/core/common_runtime/eager/core.cc b/tensorflow/core/common_runtime/eager/core.cc index e342f6ae6cd..3d37250a4fe 100644 --- a/tensorflow/core/common_runtime/eager/core.cc +++ b/tensorflow/core/common_runtime/eager/core.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/abstract_function.h" #include "tensorflow/c/tf_tensor_internal.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/execute.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/platform/errors.h" namespace { @@ -112,8 +114,8 @@ AbstractTensorInterface* TensorHandle::Resolve(Status* status) { } } -AbstractTensorHandleInterface* EagerContext::CopyTensorHandleToDevice( - AbstractTensorHandleInterface* handle, const char* device_name, +ImmediateExecutionTensorHandle* EagerContext::CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, Status* status) { TensorHandle* input = TensorHandleFromInterface(handle); TensorHandle* result = nullptr; @@ -158,7 +160,7 @@ AbstractTensorHandleInterface* EagerContext::CopyTensorHandleToDevice( // here to a circular BUILD dep issue. 
If we move this to context.cc, then we
 // will have the circular dependency of:
 // context -> tensor_handle -> remote_tensor_handle_data -> context
-AbstractTensorHandleInterface* EagerContext::CreateLocalHandle(
+ImmediateExecutionTensorHandle* EagerContext::CreateLocalHandle(
     AbstractTensorInterface* t) {
   Tensor tensor = TensorFromInterface(t);
   return TensorHandle::CreateLocalHandle(std::move(tensor), /*d=*/HostCPU(),
@@ -168,14 +170,23 @@ AbstractTensorHandleInterface* EagerContext::CreateLocalHandle(
 // TODO(b/152902651): We have to keep this function here since EagerOperation
 // depends on EagerContext. Thus, the context build target can't depend on
 // EagerOperation.
-AbstractOperationInterface* EagerContext::CreateOperation() {
+ImmediateExecutionOperation* EagerContext::CreateOperation() {
   return new EagerOperation(this);
 }
 
+Status EagerContext::RegisterFunction(AbstractFunction* f) {
+  FunctionDef* fdef;
+  TF_RETURN_IF_ERROR(f->GetFunctionDef(&fdef));
+  if (!fdef) {
+    return errors::InvalidArgument("GetFunctionDef returned nullptr.");
+  }
+  return AddFunctionDef(*fdef);
+}
+
 // TODO(b/152902651): Once we move many execute.cc functions into
 // eager_operation.cc we can avoid a circular dependency between them.
-Status EagerOperation::Execute(
-    absl::Span<AbstractTensorHandleInterface*> retvals, int* num_retvals) {
+Status EagerOperation::Execute(absl::Span<AbstractTensorHandle*> retvals,
+                               int* num_retvals) {
   return EagerExecute(
       this, reinterpret_cast<tensorflow::TensorHandle**>(retvals.data()),
       num_retvals);
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
index 090bfef46bd..073095e64d1 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -15,7 +15,9 @@ limitations under the License.
#include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "absl/types/span.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/tf_tensor_internal.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/input_colocation_exemption_registry.h" @@ -91,8 +93,8 @@ Status EagerOperation::SetAttrShape(const char* attr_name, const int64_t* dims, return Status::OK(); } -Status EagerOperation::SetAttrFunction( - const char* attr_name, const AbstractOperationInterface* value) { +Status EagerOperation::SetAttrFunction(const char* attr_name, + const AbstractOperation* value) { AttrValue attr_value; NameAttrList* func = attr_value.mutable_func(); func->set_name(value->Name()); @@ -194,8 +196,7 @@ Status EagerOperation::SetAttrShapeList(const char* attr_name, } Status EagerOperation::SetAttrFunctionList( - const char* attr_name, - absl::Span values) { + const char* attr_name, absl::Span values) { size_t num_values = values.size(); std::unique_ptr funcs(new NameAttrList[num_values]); for (int i = 0; i < num_values; i++) { @@ -253,14 +254,13 @@ Status EagerOperation::OutputLength(const char* output_name, int* length) { return Status::OK(); } -Status EagerOperation::AddInput(AbstractTensorHandleInterface* input) { +Status EagerOperation::AddInput(AbstractTensorHandle* input) { TensorHandle* h = TensorHandleFromInterface(input); AddTensorHandle(h); return MaybeInferSingleInputAttrs(h); } -Status EagerOperation::AddInputList( - absl::Span inputs) { +Status EagerOperation::AddInputList(absl::Span inputs) { for (auto& input : inputs) { TensorHandle* h = TensorHandleFromInterface(input); AddTensorHandle(h); diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index 14268ef2630..963aed25733 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -19,6 +19,7 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/span.h" #include "absl/types/variant.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" @@ -31,7 +32,7 @@ limitations under the License. namespace tensorflow { -class EagerOperation : public AbstractOperationInterface { +class EagerOperation : public ImmediateExecutionOperation { public: explicit EagerOperation(tensorflow::EagerContext* ctx) : ctx_(*ctx) {} ~EagerOperation() override { @@ -56,7 +57,7 @@ class EagerOperation : public AbstractOperationInterface { } // Replaces the previous device name with the given one (see - // AbstractOperationInterface::SetDeviceName for more details). + // AbstractOperation::SetDeviceName for more details). 
// // This also resets the internal device pointer, unless the given name refers // to a known custom device, in which case the internal device pointer is @@ -76,10 +77,9 @@ class EagerOperation : public AbstractOperationInterface { Status SetAttrValue(const char* attr_name, const AttrValue& value); - Status AddInput(AbstractTensorHandleInterface* input) override; - Status AddInputList( - absl::Span inputs) override; - Status Execute(absl::Span retvals, + Status AddInput(AbstractTensorHandle* input) override; + Status AddInputList(absl::Span inputs) override; + Status Execute(absl::Span retvals, int* num_retvals) override; const tensorflow::OpDef* OpDef() const override { return op_def_; }; @@ -92,7 +92,7 @@ class EagerOperation : public AbstractOperationInterface { Status SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) override; Status SetAttrFunction(const char* attr_name, - const AbstractOperationInterface* value) override; + const AbstractOperation* value) override; Status SetAttrFunctionName(const char* attr_name, const char* data, size_t length) override; Status SetAttrTensor(const char* attr_name, @@ -111,7 +111,7 @@ class EagerOperation : public AbstractOperationInterface { const int* num_dims, int num_values) override; Status SetAttrFunctionList( const char* attr_name, - absl::Span values) override; + absl::Span values) override; Status InputLength(const char* input_name, int* length) override; Status OutputLength(const char* output_name, int* length) override; @@ -235,7 +235,7 @@ inline void EagerOperation::UpdateInput(int i, TensorHandle* h) { } inline EagerOperation* OperationFromInterface( - AbstractOperationInterface* operation) { + ImmediateExecutionOperation* operation) { return down_cast(operation); } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 9b82c556cd0..9e607c97683 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -1071,7 +1071,7 @@ const char* TensorHandle::BackingDeviceName(Status* status) const { } } -tensorflow::AbstractTensorHandleInterface* TensorHandle::Copy() { +tensorflow::ImmediateExecutionTensorHandle* TensorHandle::Copy() { Ref(); return this; } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 5e7638ae03c..a14df475e0f 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -31,7 +31,7 @@ limitations under the License. // clang-format on #include "absl/types/variant.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" #include "tensorflow/core/common_runtime/eager/tensor_handle_data.h" @@ -53,7 +53,7 @@ class EagerContext; // Associates a Tensor and a Device, used in the eager runtime. Internal version // of the TFE_TensorHandle struct and the python EagerTensor class // (unrelated to python TensorHandle). 
-class TensorHandle : public AbstractTensorHandleInterface, +class TensorHandle : public ImmediateExecutionTensorHandle, public core::RefCounted { // TensorHandle for dtype != DT_RESOURCE TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, @@ -121,7 +121,7 @@ class TensorHandle : public AbstractTensorHandleInterface, const char* BackingDeviceName(Status* status) const override; AbstractTensorInterface* Resolve(Status* status) override; - AbstractTensorHandleInterface* Copy() override; + ImmediateExecutionTensorHandle* Copy() override; // Return the Tensor from the default device. Status Tensor(const tensorflow::Tensor** t) const; @@ -372,12 +372,12 @@ const VariantDevice kVariantDeviceNull = static_cast(nullptr); // Returns the device backing the resource. Else, returns nullptr. Device* GetResourceDevice(const ResourceHandle& handle, EagerContext* ctx); -class TensorHandleInterface : public AbstractTensorHandleInterface { +class TensorHandleInterface : public ImmediateExecutionTensorHandle { public: }; -inline TensorHandle* TensorHandleFromInterface( - AbstractTensorHandleInterface* handle) { +template +inline TensorHandle* TensorHandleFromInterface(T* handle) { return down_cast(handle); } diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 639f623bd1a..b9ff474caab 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -2008,7 +2008,7 @@ bool ListContainsNone(PyObject* list) { static PyTapeTensor TapeTensorFromTensor(PyObject* tensor) { if (EagerTensor_CheckExact(tensor)) { - tensorflow::AbstractTensorHandleInterface* handle = + tensorflow::ImmediateExecutionTensorHandle* handle = tensorflow::unwrap(EagerTensor_Handle(tensor)); tensorflow::int64 id = PyEagerTensor_ID(tensor); tensorflow::DataType dtype = @@ -3869,7 +3869,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, bool include_tensor_ranks_only, EncodeResult* result) { if (EagerTensor_CheckExact(arg)) { - tensorflow::AbstractTensorHandleInterface* handle = + tensorflow::ImmediateExecutionTensorHandle* handle = tensorflow::unwrap(EagerTensor_Handle(arg)); absl::StrAppend(&result->str, kDType, From cb61c42938010f3ff33b55ef4b41651c22fb7652 Mon Sep 17 00:00:00 2001 From: Matt Conley Date: Mon, 15 Jun 2020 11:22:30 -0700 Subject: [PATCH 0421/1390] [TFTRT] Add Dynamic Shape Tests for ConvertSquare Co-authored-by: Tamas Feher - Modify ConvertSquare tests to use newer TFTRT testing API - Add INT32 as a supported dtype for TFTRT ConvertSquare --- .../tf2tensorrt/convert/convert_nodes.cc | 5 ++ .../tf2tensorrt/convert/convert_nodes_test.cc | 64 +++++++------------ 2 files changed, 28 insertions(+), 41 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 20ee5ffd8f8..28b27959afc 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -4424,8 +4424,13 @@ Status ConvertSquare(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); +#if IS_TRT_VERSION_GE(6, 0, 1, 0) + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); +#else TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); +#endif if (params->validation_only) return Status::OK(); // Constant 
2 with same rank as input diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 450831910f6..1192b563e57 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -2754,58 +2754,40 @@ TEST_F(OpConverterTest, ConvertQuantize) { } } -template -void TestConvertSquare(OpConverterTest* test) { - test->Reset(); - typedef typename EnumToDataType::Type CType; - - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto square = ops::Square(s.WithOpName("my_square"), input); - NodeDef node_def = square.operation.node()->def(); - - test->AddTestTensor("input", {1, 20}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions()); - - const int num_inputs = 20; - std::vector inputs(num_inputs); - std::vector expected_outputs(num_inputs); - for (int i = 0; i < num_inputs; ++i) { - const CType value = CType(i - 9); - inputs[i] = value; - expected_outputs[i] = value * value; - } - const DataVec input_data{{"input", test->AsTensor(inputs)}}; - // Engine outputs are converted to FP16 automatically if we set FP16 mode in - // the builder. - DataVec output_data{{"my_square", test->ConstructTensor(num_inputs)}}; - TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); - ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); -} - -TEST_F(OpConverterTest, ConvertSquare) { +TEST_P(OpConverterTest2, ConvertSquare) { { // Input is weights, should fail. Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Square must be a tensor, at my_square"); } - // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't - // test DT_INT32 type here. - TestConvertSquare(this); - TestConvertSquare(this); + Reset(); + + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + + const int num_inputs = 20; + std::vector inputs(num_inputs); + std::vector expected_outputs(num_inputs); + + for (int i = 0; i < num_inputs; ++i) { + const float value = (i - 9); + inputs[i] = value; + expected_outputs[i] = value * value; + } + AddTestTensor("input", {1, 1, 20}, tf_type, inputs); + + TestOpConverter("my_square", node_def, {1, 1, 20}, Status::OK(), Status::OK(), + ArrayFloatNear(expected_outputs, 0)); } #if IS_TRT_VERSION_GE(5, 1, 0, 0) From d5ca984c5314d2e683eca87539d03ee4e35e3ee6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 13:05:59 -0700 Subject: [PATCH 0422/1390] Allow the compiler to vectorize the loop. 
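The Col2im change in this patch (below) marks the output pointer __restrict and drops the old TODO about vectorizing the inner loop. The point is aliasing: with two unqualified pointers the compiler has to assume the accumulation target may overlap the source buffer, which blocks auto-vectorization. A minimal illustration of the annotation, assuming the non-standard __restrict spelling accepted by GCC/Clang/MSVC (C99 spells it restrict); this is not the actual kernel:

```cpp
#include <cstdio>

// With plain float* parameters the compiler must assume `im` might alias
// `col`; declaring `im` __restrict promises the buffers do not overlap, so the
// accumulation loop becomes a safe candidate for SIMD code generation.
void accumulate(const float* col, float* __restrict im, int depth) {
  for (int i = 0; i < depth; ++i) {
    im[i] += col[i];
  }
}

int main() {
  float col[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float im[8] = {};
  accumulate(col, im, 8);
  std::printf("%f %f\n", im[0], im[7]);
}
```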
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` name old time/op new time/op delta BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_SAME 18.6ms ± 5% 18.5ms ±13% ~ (p=0.912 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_SAME 12.7ms ±12% 12.7ms ±17% ~ (p=0.684 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_SAME 12.2ms ± 8% 11.2ms ± 4% -8.21% (p=0.001 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_VALID 18.7ms ±20% 18.6ms ±23% ~ (p=0.278 n=9+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_VALID 12.5ms ±15% 11.4ms ± 2% -8.98% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_VALID 11.6ms ± 8% 11.1ms ± 2% -4.22% (p=0.011 n=9+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x64_s1x1_SAME 4.57ms ± 3% 4.34ms ± 1% -5.04% (p=0.000 n=8+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x256_s1x1_SAME 12.0ms ± 4% 11.5ms ± 2% -4.32% (p=0.000 n=8+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x256_f1x1x64_s1x1_SAME 20.0ms ±31% 20.6ms ±17% ~ (p=0.912 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f3x3x64_s1x1_SAME 36.5ms ±21% 32.0ms ± 1% -12.30% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x128_s1x1_SAME 3.71ms ±17% 3.33ms ± 1% -10.47% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x512_s1x1_SAME 11.8ms ±16% 10.5ms ± 1% -11.37% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f1x1x128_s1x1_SAME 13.1ms ±13% 11.4ms ± 2% -13.36% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f3x3x128_s1x1_SAME 142ms ±12% 124ms ± 1% -13.22% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x256_s1x1_SAME 3.51ms ±14% 3.18ms ±20% -9.43% (p=0.009 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x1024_s1x1_SAME 14.0ms ±18% 12.0ms ± 4% -13.80% (p=0.012 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x1024_f1x1x256_s1x1_SAME 12.8ms ±18% 11.1ms ± 2% -13.57% (p=0.001 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f3x3x256_s1x1_SAME 23.0ms ±18% 19.9ms ± 4% -13.38% (p=0.004 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_SAME 45.3ms ± 9% 40.5ms ± 4% -10.74% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_SAME 33.2ms ±13% 28.8ms ± 2% -13.11% (p=0.001 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_SAME 31.5ms ±15% 26.7ms ± 2% -15.13% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_VALID 45.5ms ± 8% 41.3ms ± 9% -9.31% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_VALID 33.2ms ±12% 28.8ms ± 2% -13.38% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_VALID 31.6ms ±14% 26.7ms ± 1% -15.53% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x64_s1x1_SAME 11.1ms ±15% 9.4ms ± 3% -15.29% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x256_s1x1_SAME 27.3ms ±13% 23.5ms ± 2% -13.97% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x256_f1x1x64_s1x1_SAME 46.2ms ± 8% 40.9ms ± 5% -11.54% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f3x3x64_s1x1_SAME 81.5ms ±17% 64.2ms ± 1% -21.17% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x128_s1x1_SAME 8.30ms ±16% 6.80ms ± 1% -18.02% (p=0.000 
n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x512_s1x1_SAME 27.7ms ±14% 23.3ms ± 2% -15.87% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f1x1x128_s1x1_SAME 34.9ms ±14% 28.8ms ± 2% -17.41% (p=0.000 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f3x3x128_s1x1_SAME 300ms ±12% 250ms ± 1% -16.80% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x256_s1x1_SAME 7.91ms ±15% 7.17ms ±21% -9.36% (p=0.029 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x1024_s1x1_SAME 35.6ms ±15% 33.4ms ±13% -6.21% (p=0.023 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x1024_f1x1x256_s1x1_SAME 28.0ms ± 3% 26.7ms ± 1% -4.81% (p=0.000 n=8+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f3x3x256_s1x1_SAME 41.4ms ± 3% 39.6ms ± 4% -4.43% (p=0.001 n=9+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_SAME 90.0ms ± 3% 85.5ms ±11% -5.00% (p=0.010 n=9+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_SAME 69.0ms ± 1% 65.8ms ± 3% -4.68% (p=0.000 n=9+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_SAME 57.8ms ± 2% 55.8ms ± 2% -3.54% (p=0.000 n=9+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_VALID 89.3ms ± 2% 86.1ms ± 5% -3.57% (p=0.006 n=9+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_VALID 68.7ms ± 2% 65.7ms ± 3% -4.31% (p=0.001 n=8+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_VALID 57.1ms ± 1% 55.6ms ± 1% -2.58% (p=0.000 n=8+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x64_s1x1_SAME 23.6ms ± 9% 21.9ms ± 1% -7.27% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x256_s1x1_SAME 51.0ms ± 3% 48.4ms ± 5% -5.13% (p=0.001 n=8+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x256_f1x1x64_s1x1_SAME 90.2ms ± 3% 85.4ms ± 5% -5.29% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f3x3x64_s1x1_SAME 143ms ± 7% 133ms ± 3% -6.48% (p=0.000 n=9+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x128_s1x1_SAME 14.5ms ± 4% 14.0ms ± 2% -3.82% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x512_s1x1_SAME 53.8ms ± 3% 51.5ms ± 2% -4.43% (p=0.000 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f1x1x128_s1x1_SAME 69.8ms ± 5% 66.9ms ± 8% -4.15% (p=0.010 n=10+9) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f3x3x128_s1x1_SAME 533ms ± 3% 508ms ± 0% -4.65% (p=0.000 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x256_s1x1_SAME 14.0ms ± 6% 14.9ms ±21% ~ (p=0.481 n=10+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x1024_s1x1_SAME 67.2ms ± 3% 71.0ms ±22% ~ (p=0.278 n=9+10) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x1024_f1x1x256_s1x1_SAME 57.5ms ± 4% 56.6ms ± 5% ~ (p=0.055 n=10+8) BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f3x3x256_s1x1_SAME 78.1ms ± 5% 75.5ms ± 3% -3.32% (p=0.011 n=9+8) ``` PiperOrigin-RevId: 316949219 Change-Id: I1a1fb517a5c28d489da9762b650577b61bf4e0de --- tensorflow/core/kernels/conv_grad_input_ops.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index fd2f569a8b8..2dd63d1f4d0 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -76,7 +76,7 @@ template void Col2im(const T* col_data, const int depth, const int height, const int width, const int filter_h, const int 
filter_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r, - const int stride_h, const int stride_w, T* im_data) { + const int stride_h, const int stride_w, T* __restrict im_data) { int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; int h_pad = -pad_t; @@ -87,7 +87,6 @@ void Col2im(const T* col_data, const int depth, const int height, for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { if (ih >= 0 && ih < height && iw >= 0 && iw < width) { - // TODO(andydavis) Vectorize this loop (if compiler does not). for (int i = 0; i < depth; ++i) { im_patch_data[i] += col_data[i]; } From 9426d35abc2ea3a5e790a928641749c5783e4b65 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 17 Jun 2020 13:07:01 -0700 Subject: [PATCH 0423/1390] Add the vdot op to tf-numpy. PiperOrigin-RevId: 316949446 Change-Id: I9b0718ec108486096e032729481ec9129863b429 --- tensorflow/python/ops/numpy_ops/np_math_ops.py | 10 ++++++++++ tensorflow/python/ops/numpy_ops/np_math_ops_test.py | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py index b32f78bee5a..abfd9087ffd 100644 --- a/tensorflow/python/ops/numpy_ops/np_math_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py @@ -309,6 +309,16 @@ def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None): # pylint: disable=mis return _bin_op(f, a, b) +@np_utils.np_doc_only(np.vdot) +def vdot(a, b): # pylint: disable=missing-docstring + a, b = np_array_ops._promote_dtype(a, b) + a = np_array_ops.reshape(a, [-1]) + b = np_array_ops.reshape(b, [-1]) + if a.dtype == np_dtypes.complex128 or a.dtype == np_dtypes.complex64: + a = conj(a) + return dot(a, b) + + @np_utils.np_doc(np.power) def power(x1, x2): return _bin_op(math_ops.pow, x1, x2) diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops_test.py b/tensorflow/python/ops/numpy_ops/np_math_ops_test.py index a3dfbb6c871..cb5326bcded 100644 --- a/tensorflow/python/ops/numpy_ops/np_math_ops_test.py +++ b/tensorflow/python/ops/numpy_ops/np_math_ops_test.py @@ -124,6 +124,12 @@ class MathTest(test.TestCase, parameterized.TestCase): np_math_ops.matmul( np_array_ops.ones([2, 3], np.int32), np_array_ops.ones([], np.int32)) + def testVDot(self): + operands = [([[1, 2], [3, 4]], [[3, 4], [6, 7]]), + ([[1, 2], [3, 4]], [3, 4, 6, 7])] + return self._testBinaryOp( + np_math_ops.vdot, np.vdot, 'vdot', operands=operands) + def _testUnaryOp(self, math_fun, np_fun, name): def run_test(a): From cca9b615b264651f5c70d604bb29146ddfbadede Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 17 Jun 2020 13:12:33 -0700 Subject: [PATCH 0424/1390] [tfdbg2] Fork local_cli_wrapper_test for keras related tests. 
PiperOrigin-RevId: 316950499 Change-Id: I428273592694426f72427e6236c68bdfb4e95eba --- tensorflow/python/debug/BUILD | 2 - .../debug/wrappers/local_cli_wrapper_test.py | 39 +------------------ 2 files changed, 1 insertion(+), 40 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 1ef0504ecb8..bb3bbbd87c4 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -788,7 +788,6 @@ cuda_py_test( "//tensorflow/python:training", "//tensorflow/python:variables", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/keras", "//third_party/py/numpy", ], ) @@ -1400,7 +1399,6 @@ py_test( "//tensorflow/python:state_ops", "//tensorflow/python:training", "//tensorflow/python:variables", - "//tensorflow/python/keras", "//third_party/py/numpy", ], ) diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py index 30bb99387b2..ab33a4af030 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py @@ -22,10 +22,10 @@ import tempfile import numpy as np -from tensorflow.python.debug.cli import cli_config from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session +from tensorflow.python.debug.cli import cli_config from tensorflow.python.debug.cli import cli_shared from tensorflow.python.debug.cli import debugger_cli_common from tensorflow.python.debug.cli import ui_factory @@ -36,9 +36,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.lib.io import file_io -from tensorflow.python.keras import backend -from tensorflow.python.keras.engine import sequential -from tensorflow.python.keras.layers import core from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -832,40 +829,6 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase): run_output = wrapped_sess.run([]) self.assertEqual([], run_output) - def testDebuggingKerasFitWithSkippedRunsWorks(self): - wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( - [["run"], ["run"], ["run", "-t", "10"]], self.sess) - - backend.set_session(wrapped_sess) - - model = sequential.Sequential() - model.add(core.Dense(4, input_shape=[2], activation="relu")) - model.add(core.Dense(1)) - model.compile(loss="mse", optimizer="sgd") - - x = np.zeros([8, 2]) - y = np.zeros([8, 1]) - model.fit(x, y, epochs=2) - - self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"])) - - def testDebuggingKerasFitWithProfilingWorks(self): - wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( - [["run", "-p"]] * 10, self.sess) - - backend.set_session(wrapped_sess) - - model = sequential.Sequential() - model.add(core.Dense(4, input_shape=[2], activation="relu")) - model.add(core.Dense(1)) - model.compile(loss="mse", optimizer="sgd") - - x = np.zeros([8, 2]) - y = np.zeros([8, 1]) - model.fit(x, y, epochs=2) - - self.assertEqual(0, len(wrapped_sess.observers["debug_dumps"])) - def testRunsWithEmptyNestedFetchWorks(self): wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( [["run"]], self.sess, dump_root="") From f8195170f88259829ab11538bd5d4b068dfa44a8 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 17 Jun 2020 13:23:53 -0700 Subject: [PATCH 0425/1390] Enable a disabled testcase, since it is fixed. PiperOrigin-RevId: 316952692 Change-Id: I70d295634012685097b2324b156b2552218d62d7 --- .../keras/layers/preprocessing/text_vectorization_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index 992f47efc85..e7f61e94724 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -1503,7 +1503,7 @@ class TextVectorizationSavingTest( loaded_model = keras.models.load_model(output_path) self.assertAllEqual(loaded_model.predict(input_array), expected_output) - def DISABLE_test_saving_with_tfidf(self): + def test_saving_with_tfidf(self): vocab_data = ["earth", "wind", "and", "fire"] tfidf_data = [.5, .25, .2, .125] input_array = np.array([["earth", "wind", "and", "earth"], From 56e71dd0e77507fc5200a650de5c736271df7f8a Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 17 Jun 2020 13:42:03 -0700 Subject: [PATCH 0426/1390] Removing identity backtracking from entropy losses. PiperOrigin-RevId: 316956157 Change-Id: I91130052e29e69ae131fe8aad0bbd1d4d42b00f1 --- tensorflow/python/keras/backend.py | 65 ++++++++----------- .../keras/tests/add_loss_correctness_test.py | 14 ++++ 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 391c695b18f..9330425272f 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -4637,12 +4637,6 @@ def softsign(x): return nn.softsign(x) -def _backtrack_identity(tensor): - while tensor.op.type == 'Identity': - tensor = tensor.op.inputs[0] - return tensor - - @keras_export('keras.backend.categorical_crossentropy') @dispatch.add_dispatch_support def categorical_crossentropy(target, output, from_logits=False, axis=-1): @@ -4695,17 +4689,16 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1): return nn.softmax_cross_entropy_with_logits_v2( labels=target, logits=output, axis=axis) - if not isinstance(output, (ops.EagerTensor, variables_module.Variable)): - output = _backtrack_identity(output) - if output.op.type == 'Softmax': - # When softmax activation function is used for output operation, we - # use logits from the softmax function directly to compute loss in order - # to prevent collapsing zero when training. - # See b/117284466 - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - return nn.softmax_cross_entropy_with_logits_v2( - labels=target, logits=output, axis=axis) + if (not isinstance(output, (ops.EagerTensor, variables_module.Variable)) and + output.op.type == 'Softmax'): + # When softmax activation function is used for output operation, we + # use logits from the softmax function directly to compute loss in order + # to prevent collapsing zero when training. 
+ # See b/117284466 + assert len(output.op.inputs) == 1 + output = output.op.inputs[0] + return nn.softmax_cross_entropy_with_logits_v2( + labels=target, logits=output, axis=axis) # scale preds so that the class probas of each sample sum to 1 output = output / math_ops.reduce_sum(output, axis, True) @@ -4740,17 +4733,16 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): target = ops.convert_to_tensor_v2(target) output = ops.convert_to_tensor_v2(output) - if not from_logits and not isinstance( - output, (ops.EagerTensor, variables_module.Variable)): - output = _backtrack_identity(output) - if output.op.type == 'Softmax': - # When softmax activation function is used for output operation, we - # use logits from the softmax function directly to compute loss in order - # to prevent collapsing zero when training. - # See b/117284466 - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - from_logits = True + if (not from_logits and + not isinstance(output, (ops.EagerTensor, variables_module.Variable)) and + output.op.type == 'Softmax'): + # When softmax activation function is used for output operation, we + # use logits from the softmax function directly to compute loss in order + # to prevent collapsing zero when training. + # See b/117284466 + assert len(output.op.inputs) == 1 + output = output.op.inputs[0] + from_logits = True if not from_logits: epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) @@ -4821,15 +4813,14 @@ def binary_crossentropy(target, output, from_logits=False): if from_logits: return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) - if not isinstance(output, (ops.EagerTensor, variables_module.Variable)): - output = _backtrack_identity(output) - if output.op.type == 'Sigmoid': - # When sigmoid activation function is used for output operation, we - # use logits from the sigmoid function directly to compute loss in order - # to prevent collapsing zero when training. - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) + if (not isinstance(output, (ops.EagerTensor, variables_module.Variable)) and + output.op.type == 'Sigmoid'): + # When sigmoid activation function is used for output operation, we + # use logits from the sigmoid function directly to compute loss in order + # to prevent collapsing zero when training. + assert len(output.op.inputs) == 1 + output = output.op.inputs[0] + return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) output = clip_ops.clip_by_value(output, epsilon_, 1. 
- epsilon_) diff --git a/tensorflow/python/keras/tests/add_loss_correctness_test.py b/tensorflow/python/keras/tests/add_loss_correctness_test.py index a19eec75ffb..f99b285489d 100644 --- a/tensorflow/python/keras/tests/add_loss_correctness_test.py +++ b/tensorflow/python/keras/tests/add_loss_correctness_test.py @@ -34,6 +34,7 @@ from tensorflow.python.keras import testing_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.rmsprop import RMSPropOptimizer MAE = losses.MeanAbsoluteError @@ -450,6 +451,19 @@ class TestAddLossCorrectness(keras_parameterized.TestCase): 'Expected a symbolic Tensors or a callable for the loss value'): model.add_loss(model.weights[0]) + @keras_parameterized.run_all_keras_modes + def test_add_entropy_loss_on_functional_model(self): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + outputs = testing_utils.Bias()(inputs) + model = Model([inputs, targets], outputs) + model.add_loss(losses.binary_crossentropy(targets, outputs)) + model.compile('sgd', run_eagerly=testing_utils.should_run_eagerly()) + with test.mock.patch.object(logging, 'warning') as mock_log: + model.fit([self.x, self.y], batch_size=3, epochs=5) + self.assertNotIn('Gradients do not exist for variables', + str(mock_log.call_args)) + if __name__ == '__main__': test.main() From 8e7be6f71aaa97f8c2ef806da8945a9d11f99830 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 17 Jun 2020 13:56:34 -0700 Subject: [PATCH 0427/1390] Populate side-effect decorators for HashTable ops. PiperOrigin-RevId: 316958908 Change-Id: I8e75fe538bb3dfc4b6d0742f63072fe87f133a4d --- tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td | 6 +++--- tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td | 3 +++ tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index d403462e6a6..7ed5a215ab8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3862,7 +3862,7 @@ table will be immutable. ); let results = (outs - TF_ResourceTensor:$table_handle + Res:$table_handle ); } @@ -4814,7 +4814,7 @@ table. It must also be of the same type as the table values. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$default_value ); @@ -4838,7 +4838,7 @@ The tensor `values` must be of the type of the table values. 
}]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$values ); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 17424b54fc2..aac03061718 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -91,14 +91,17 @@ class TF_ResourceBase : def TF_VariableResource : TF_ResourceBase<"Variable">; def TF_StackResource : TF_ResourceBase<"Stack">; def TF_TensorArrayResource : TF_ResourceBase<"TensorArray">; +def TF_TableResource : TF_ResourceBase<"Table">; def TF_VariableRead : MemRead; def TF_StackRead : MemRead; def TF_TensorArrayRead : MemRead; +def TF_TableRead : MemRead; def TF_VariableWrite : MemWrite; def TF_StackWrite : MemWrite; def TF_TensorArrayWrite : MemWrite; +def TF_TableWrite: MemWrite; //===----------------------------------------------------------------------===// // TensorFlow op definitions diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h index 9be61b1db39..ab1d7935bad 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -35,6 +35,10 @@ struct TensorArray : ::mlir::SideEffects::Resource::Base { StringRef getName() final { return "TensorArray"; } }; +struct Table : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Table"; } +}; + } // namespace ResourceEffects } // namespace TF } // namespace mlir From cbf8f57413d8a14c32515e88292012324a0990f0 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 17 Jun 2020 14:16:51 -0700 Subject: [PATCH 0428/1390] Import initialization graph in SignatureDef SavedModels as an MLIR function in TF saved model dialect. 
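On the TF 1.x side, the initializer function being imported is the SavedModel's legacy `main_op` (typically `tf.tables_initializer()` when hash tables are involved), which the new `tf_saved_model.session_initializer` op records as a function symbol. A rough sketch of producing such a model with the public v1 builder API, in the spirit of the `hash_table_v1.py` test added below (the path, vocab, and signature key are made up for illustration):

```python
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=["earth", "wind", "and", "fire"],
        values=[0, 1, 2, 3],
        value_dtype=tf.int64),
    default_value=-1)

x = tf.placeholder(tf.string, shape=[None], name="x")
y = table.lookup(x)

builder = tf.saved_model.builder.SavedModelBuilder("/tmp/hash_table_model")
with tf.Session() as sess:
  sess.run(tf.tables_initializer())
  sig = tf.saved_model.predict_signature_def(inputs={"x": x}, outputs={"y": y})
  builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING],
      signature_def_map={"serving_default": sig},
      # This main_op is what the importer turns into the
      # tf_saved_model.session_initializer function.
      main_op=tf.tables_initializer(),
      strip_default_attrs=True)
  builder.save()
```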
PiperOrigin-RevId: 316963067 Change-Id: I6c49ba4cfa3832f136bec36700aea695ad116737 --- tensorflow/compiler/mlir/tensorflow/BUILD | 4 +- .../mlir/tensorflow/ir/tf_saved_model.cc | 25 +++ .../mlir/tensorflow/ir/tf_saved_model_ops.td | 24 +++ .../tests/tf_saved_model/common_v1.py | 1 + .../tests/tf_saved_model/hash_table_v1.py | 92 +++++++++++ .../tensorflow/tests/tf_saved_model_ops.mlir | 5 + .../tests/tf_saved_model_ops_invalid.mlir | 33 ++++ .../mlir/tensorflow/translate/import_model.cc | 149 +++++++++++++----- 8 files changed, 290 insertions(+), 43 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 904ccb7e820..17ed0e36a28 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -661,7 +661,9 @@ cc_library( ":tensorflow_types", ":translate_utils", "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", @@ -673,6 +675,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler/utils:transitive_fanin", + "//tensorflow/core/platform:protobuf_internal", "//tensorflow/core/platform:types", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", @@ -682,7 +685,6 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 140a778770c..6af70158e14 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -76,6 +76,23 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) { return success(); } +static LogicalResult Verify(SessionInitializerOp session_initializer) { + mlir::SymbolTable symbol_table( + session_initializer.getParentOfType()); + + auto init_func_op = + symbol_table.lookup(session_initializer.initializer()); + if (!init_func_op) + return session_initializer.emitOpError() + << "the initializer function does not exist"; + + if (!init_func_op.getType().getResults().empty()) + return session_initializer.emitOpError() + << "the initializer function should have no output"; + + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" @@ -220,6 +237,14 @@ static LogicalResult VerifySavedModelModule( } } } + + auto session_initializers = module.getOps(); + if (std::distance(session_initializers.begin(), session_initializers.end()) > + 1) { + return (*++session_initializers.begin()).emitError() + << "there must be no more than one session_initializer op"; + } + SymbolTable symbol_table(module); auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion()); if (!symbol_uses.hasValue()) { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index 4431a160edf..497f4d90cb9 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -128,4 +128,28 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> { let verifier = [{ return Verify(*this); }]; } +def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> { + let summary = "Initializes TensorFlow session state."; + let description = [{ + Represents a session initializer function initializes TensorFlow session + state. It is used to initialize resources in the saved model before calling + any exported functions. There must be no more than one session initializer + in a saved model. + + The `initializer` represents the initialization function. The function have + no output and this function should be only called once. + + This is used, for example, to initialize hash tables stored in resources and + accessed by resource name (rather than as resource handles or bound inputs + which is how `global_tensor`s are referenced) + }]; + + let arguments = (ins + FlatSymbolRefAttr:$initializer + ); + + + let verifier = [{ return Verify(*this); }]; +} + #endif // SAVED_MODEL_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py index 7171f63bb05..51ccbeb1fbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py @@ -84,6 +84,7 @@ def do_test(signature_def_map, show_debug_info=False): builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map, + main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py new file mode 100644 index 00000000000..64847434b82 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py @@ -0,0 +1,92 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/hash_table_v1 | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 + +# Verify that the tf.versions attribute exists. It is difficult to enforce +# contents, since the version numbers change over time. The conversion logic +# itself is verified in the common graphdef converter, so here just assert +# it is being invoked. 
+# CHECK: module +# CHECK-SAME: tf.versions +# CHECK-SAME: bad_consumers +# CHECK-SAME: min_consumer +# CHECK-SAME: producer + +# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> () +# CHECK: "tf_saved_model.global_tensor"() + +# CHECK: func {{@[a-zA-Z_0-9]+}}( +# CHECK-SAME: [[ARG0:%.*]]: tensor +# CHECK-SAME: [[ARG1:%.*]]: tensor () + // Representation for constants: (immutable) global tensor. // CHECK: tf_saved_model.global_tensor "tf_saved_model.global_tensor"() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index c055c6c9f56..544600cf6b8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -258,3 +258,36 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{'type' attribute for immutable 'tf_saved_model.global_tensor' should have a static shape}} "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () } + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function does not exist}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function should have no output}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + // expected-error@+1 {{there must be no more than one session_initializer op}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 820d0ce31fb..3cff4217215 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -60,6 +60,8 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" @@ -99,6 +101,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" @@ -116,6 +119,7 @@ using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; +using mlir::tf_saved_model::SessionInitializerOp; using stream_executor::port::StatusOr; namespace { @@ -2955,6 +2959,13 @@ void SortSavedModelModule(mlir::ModuleOp module) { named_global_tensor.global_tensor.getOperation()->moveBefore( &module.getBody()->front()); } + + auto initializers = module.getOps(); + if (!initializers.empty()) { + (*initializers.begin()) + .getOperation() + ->moveBefore(&module.getBody()->front()); + } } Status CreateSavedModelIR( @@ -3241,17 +3252,29 @@ class SavedModelSignatureDefImporter { absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + flib_def_(OpRegistry::Global(), graph_def().library()), + debug_info_(), exported_names_(exported_names), - module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} + module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) { + // debug_info might not be loaded with loader_lite. + if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info; + } // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function // for each signature. StatusOr ConvertSignatures(); - Status ConvertSignature(const GraphDef& graphdef, - const std::string& sig_def_key, - const SignatureDef& signature_def, - const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def); + Status ConvertSignature(const std::string& sig_def_key, + const SignatureDef& signature_def); + + // Converts the initialization graph in the SavedModel to an MLIR function. + Status ConvertInitializer(); + + // Converts a graph with feeds and fetches to an MLIR function. + StatusOr ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs); // Creates GlobalTensorOp for each variable and moves each VarHandle op to // the enclosing function's arguments. 
@@ -3273,18 +3296,62 @@ class SavedModelSignatureDefImporter { GraphImportConfig::InputArrays ParseInputArrays( const std::vector>& inputs); + const GraphDef& graph_def() const { + return bundle_.meta_graph_def.graph_def(); + } + const FunctionLibraryDefinition& flib_def() const { return flib_def_; } + const GraphDebugInfo& debug_info() const { return debug_info_; } + const SavedModelBundle& bundle_; + FunctionLibraryDefinition flib_def_; + GraphDebugInfo debug_info_; absl::Span exported_names_; mlir::OwningModuleRef module_; }; +Status SavedModelSignatureDefImporter::ConvertInitializer() { + std::vector asset_file_defs; + TF_RETURN_IF_ERROR( + internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs)); + + if (!asset_file_defs.empty()) + return errors::Unimplemented( + absl::StrCat("Assets are not supported in signaturedef importer")); + + std::string init_node_name; + TF_RETURN_IF_ERROR( + internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name)); + + if (init_node_name.empty()) return Status::OK(); + + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(init_node_name, {}, {}, {init_node_name})); + + mlir::SymbolTable symbol_table(*sub_module); + + auto init_func_op = symbol_table.lookup(init_node_name); + + init_func_op.removeAttr("tf.entry_function"); + + mlir::OpBuilder builder(module_->getBodyRegion()); + + builder.create( + module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName())); + + // Move the converted functions to top level MLIR module. + auto* block = module_->getBody(); + auto* sub_block = sub_module->getBody(); + block->getOperations().splice( + mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(), + sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator())); + + return Status::OK(); +} + StatusOr SavedModelSignatureDefImporter::ConvertSignatures() { const auto& signatures = bundle_.GetSignatures(); - const auto& graphdef = bundle_.meta_graph_def.graph_def(); - PopulateTfVersions(module_.get(), graphdef.versions()); - - FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library()); + PopulateTfVersions(module_.get(), graph_def().versions()); // debug_info might not be loaded with loader_lite. GraphDebugInfo debug_info; @@ -3307,9 +3374,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { continue; } - TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, - debug_info, flib_def)); + TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def)); } + + TF_RETURN_IF_ERROR(ConvertInitializer()); TF_RETURN_IF_ERROR(LiftVariables()); mlir::OpBuilder builder(module_->getBodyRegion()); @@ -3320,10 +3388,32 @@ SavedModelSignatureDefImporter::ConvertSignatures() { return std::move(module_); } +StatusOr SavedModelSignatureDefImporter::ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs) { + GraphImportConfig specs; + specs.prune_unused_nodes = true; + specs.inputs = ParseInputArrays(inputs); + for (auto& output : outputs) specs.outputs.push_back(output.second.name()); + specs.control_outputs = control_outputs; + + // Convert sub-graphdef to sub-graph. 
+ GraphConstructorOptions options; + options.allow_internal_ops = true; + options.add_default_attributes = true; + Graph graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph)); + + // Convert sub-graph to MLIR module.true + return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(), + flib_def(), specs, name); +} + Status SavedModelSignatureDefImporter::ConvertSignature( - const GraphDef& graphdef, const std::string& sig_def_key, - const SignatureDef& signature_def, const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def) { + const std::string& sig_def_key, const SignatureDef& signature_def) { // Create local vectors for the input and output and sort them to be // deterministic. We don't want anyone to really depend on the order, client // should lookup argument/result mapping by attribute name. @@ -3339,34 +3429,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature( return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first; }); - GraphImportConfig specs; - specs.prune_unused_nodes = true; - specs.inputs = ParseInputArrays(inputs); - for (auto& output : outputs) specs.outputs.push_back(output.second.name()); - - // Remove unused nodes and create sub-graphdef. - GraphDef sub_graph_def; - TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph( - graphdef, &sub_graph_def, - /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); - - // Set the function library definitions in the pruned graphdef. - *sub_graph_def.mutable_library() = flib_def.ToProto(); - - // Convert sub-graphdef to sub-graph. - GraphConstructorOptions options; - options.allow_internal_ops = true; - options.add_default_attributes = true; - Graph sub_graph(OpRegistry::Global()); - - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph)); - // Convert sub-graph to MLIR module. - TF_ASSIGN_OR_RETURN( - auto sub_module, - GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info, - flib_def, specs, sig_def_key)); + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(sig_def_key, inputs, outputs, {})); mlir::OpBuilder builder(sub_module->getBodyRegion()); // Find the FuncOp which corresponds to current SignatureDef. 
From 5b5ab0034ae0f12731943177b275e35cb9f92bb1 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 17 Jun 2020 23:26:29 +0200 Subject: [PATCH 0429/1390] Fix distributed autocast variable assign --- .../experimental/autocast_variable.py | 104 +++++++++--------- .../experimental/autocast_variable_test.py | 30 +++-- 2 files changed, 77 insertions(+), 57 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 7d0abe30581..ca6420f0c0b 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -188,61 +188,88 @@ class AutoCastVariable(variables.Variable, core.Tensor): def constraint(self): return self._variable.constraint + def _apply_assign_update( + self, update_fn, value, use_locking=None, name=None, read_value=True): + if not read_value: + return update_fn(value, use_locking, name, read_value) + + if context.executing_eagerly() or ops.inside_function(): + assign_op = update_fn(value, use_locking, name, False) + with ops.control_dependencies([assign_op]): + return self + + # Fallback to wrapping the returned variable in graph mode if possible + assign_var = update_fn(value, use_locking, name, read_value) + if resource_variable_ops.is_resource_variable(assign_var): + return create_autocast_variable(assign_var) + return assign_var + + def _apply_update(self, update_fn, *args, **kwargs): + update_var = update_fn(*args, **kwargs) + if context.executing_eagerly() or ops.inside_function(): + with ops.control_dependencies([update_var]): + return self + + # Fallback to wrapping the returned variable in graph mode if possible + if resource_variable_ops.is_resource_variable(update_var): + return create_autocast_variable(update_var) + return update_var + def assign(self, value, use_locking=None, name=None, read_value=True): - assign_op = self._variable.assign(value, use_locking, name, read_value) - return _maybe_wrap(assign_op, wrap=read_value) + return self._apply_assign_update( + self._variable.assign, value, use_locking, name, read_value) def assign_add(self, delta, use_locking=None, name=None, read_value=True): - assign_op = self._variable.assign_add(delta, use_locking, name, read_value) - return _maybe_wrap(assign_op, wrap=read_value) + return self._apply_assign_update( + self._variable.assign_add, delta, use_locking, name, read_value) def assign_sub(self, delta, use_locking=None, name=None, read_value=True): - assign_op = self._variable.assign_sub(delta, use_locking, name, read_value) - return _maybe_wrap(assign_op, wrap=read_value) + return self._apply_assign_update( + self._variable.assign_sub, delta, use_locking, name, read_value) def scatter_sub(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_sub(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_sub, sparse_delta, use_locking, name) def scatter_add(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_add(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_add, sparse_delta, use_locking, name) def scatter_max(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_max(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_max, sparse_delta, 
use_locking, name) def scatter_min(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_min(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_min, sparse_delta, use_locking, name) def scatter_mul(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_mul(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_mul, sparse_delta, use_locking, name) def scatter_div(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_div(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_div, sparse_delta, use_locking, name) def scatter_update(self, sparse_delta, use_locking=False, name=None): - var = self._variable.scatter_update(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_update, sparse_delta, use_locking, name) def batch_scatter_update(self, sparse_delta, use_locking=False, name=None): - var = self._variable.batch_scatter_update(sparse_delta, use_locking, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.batch_scatter_update, sparse_delta, use_locking, name) def scatter_nd_sub(self, indices, updates, name=None): - var = self._variable.scatter_nd_sub(indices, updates, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_nd_sub, indices, updates, name) def scatter_nd_add(self, indices, updates, name=None): - var = self._variable.scatter_nd_add(indices, updates, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_nd_add, indices, updates, name) def scatter_nd_update(self, indices, updates, name=None): - var = self._variable.scatter_nd_update(indices, updates, name) - return _maybe_wrap(var) + return self._apply_update( + self._variable.scatter_nd_update, indices, updates, name) def load(self, value, session=None): return self._variable.load(value, session) @@ -462,24 +489,3 @@ def create_autocast_variable(variable): # pylint: enable=missing-format-attribute return AutoCastDistributedVariable(variable) - - -def _maybe_wrap(variable, wrap=True): - """Creates an AutoCastVariable that wraps another variable if applicable. - - This function is used to wrap the return value of AutoCastVariable.assign. - Unfortunately MirroredVariable.assign will (incorrectly) return a Mirrored - value instead of a MirroredVariable. So we cannot properly wrap it in an - AutoCastVariable. We return the original variable in that case. - - Args: - variable: A tf.Variable or op. - wrap: A boolean to define whether to wrap the variable in an - AutoCastVariable or not. - - Returns: - An AutoCastVariable if wrap is True and variable is a resource variable. 
- """ - if wrap and resource_variable_ops.is_resource_variable(variable): - return create_autocast_variable(variable) - return variable diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index c45015b644e..940bd07c813 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -304,8 +304,8 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllClose(3., self.evaluate(x.assign_sub(3.))) # Assign multiple times - # This currently only works if no strategy is used - if not ds_context.has_strategy(): + # This currently doesn't work in graph mode + if context.executing_eagerly() or ops.inside_function(): assign = x.assign(1.) self.assertAllClose(1., self.evaluate(assign)) self.assertAllClose(0., self.evaluate(assign.assign(0.))) @@ -343,6 +343,20 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): # assign still expect float32 value even if in float16 scope run_and_check() + @combinations.generate(maybe_distribute) + def test_assign_tf_function(self, distribution): + with distribution.scope(): + x = get_var(0., dtypes.float32) + x = autocast_variable.create_autocast_variable(x) + + @def_function.function + def run_assign(): + return x.assign(1.).assign_add(3.).assign_add(3.).assign_sub(2.) + + with ops.get_default_graph()._enable_auto_casting_variables( + dtypes.float16): + self.assertAllClose(5., self.evaluate(run_assign())) + @combinations.generate(maybe_distribute) def test_assign_stays_in_true_dtype(self, distribution): with distribution.scope(): @@ -357,18 +371,18 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): dtypes.float16): # Variable should be increased, despite it appearing to be the same # float16 value. - self.assertEqual(1. + small_val, - self.evaluate(x.assign(1. + small_tensor))) + self.evaluate(x.assign(1. + small_tensor)) + self.assertEqual(1. + small_val, self.evaluate(x._variable)) self.assertEqual(1., self.evaluate(x.value())) - self.assertEqual(1. + small_val, self.evaluate(x.value())) + self.assertEqual(1. + small_val, self.evaluate(x)) self.evaluate(x.assign(1.)) with ops.get_default_graph()._enable_auto_casting_variables( dtypes.float16): - self.assertEqual(1. + small_val, - self.evaluate(x.assign_add(small_tensor))) + self.evaluate(x.assign_add(small_tensor)) + self.assertEqual(1. + small_val, self.evaluate(x._variable)) self.assertEqual(1., self.evaluate(x.value())) - self.assertEqual(1. + small_val, self.evaluate(x.value())) + self.assertEqual(1. + small_val, self.evaluate(x)) @combinations.generate(maybe_distribute) def test_checkpoint(self, distribution): From 92d68dd5f10ce8ef40852f5bb207258ce4126edf Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Wed, 17 Jun 2020 14:30:33 -0700 Subject: [PATCH 0430/1390] Avoid deprecation warning related to the use of `collections.Sequence`. It will stop working in Python 3.8. 
PiperOrigin-RevId: 316965801 Change-Id: Ia44313b1920653a0dd0a94d404ac914b08239c43 --- tensorflow/python/keras/engine/data_adapter.py | 18 ------------------ tensorflow/python/keras/layers/recurrent.py | 9 ++++++--- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index 469355dd722..29a99137982 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import abc -import collections import contextlib import functools import itertools @@ -57,7 +56,6 @@ try: from scipy import sparse as scipy_sparse # pylint: disable=g-import-not-at-top except ImportError: scipy_sparse = None - try: import pandas as pd # pylint: disable=g-import-not-at-top except ImportError: @@ -786,7 +784,6 @@ class GeneratorDataAdapter(DataAdapter): # Since we have to know the dtype of the python generator when we build the # dataset, we have to look at a batch to infer the structure. peek, x = self._peek_and_restore(x) - assert_not_namedtuple(peek) peek = self._standardize_batch(peek) peek = _process_tensorlike(peek) @@ -1070,21 +1067,6 @@ def broadcast_sample_weight_modes(target_structure, sample_weight_modes): return sample_weight_modes -def assert_not_namedtuple(x): - if (isinstance(x, tuple) and - # TODO(b/144192902): Use a namedtuple checking utility. - hasattr(x, "_fields") and - isinstance(x._fields, collections.Sequence) and - all(isinstance(f, six.string_types) for f in x._fields)): - raise ValueError( - "Received namedtuple ({}) with fields `{}` as input. namedtuples " - "cannot, in general, be unambiguously resolved into `x`, `y`, " - "and `sample_weight`. For this reason Keras has elected not to " - "support them. If you would like the value to be unpacked, " - "please explicitly convert it to a tuple before passing it to " - "Keras.".format(x.__class__, x._fields)) - - class DataHandler(object): """Handles iterating over epoch-level `tf.data.Iterator` objects.""" diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py index 0ce17c6101e..78a4a33a533 100644 --- a/tensorflow/python/keras/layers/recurrent.py +++ b/tensorflow/python/keras/layers/recurrent.py @@ -19,8 +19,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections - import numpy as np from tensorflow.python.distribute import distribution_strategy_context as ds_context @@ -49,6 +47,11 @@ from tensorflow.python.util import nest from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls +try: + from collections import abc as collections_abc # pylint: disable=g-import-not-at-top +except ImportError: # For Python 2 + import collections as collections_abc # pylint: disable=g-import-not-at-top + RECURRENT_DROPOUT_WARNING_MSG = ( 'RNN `implementation=2` is not supported when `recurrent_dropout` is set. ' @@ -828,7 +831,7 @@ class RNN(Layer): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. - if (isinstance(inputs, collections.Sequence) + if (isinstance(inputs, collections_abc.Sequence) and not isinstance(inputs, tuple)): # get initial_state from full input spec # as they could be copied to multiple GPU. 
From b920ae92628e58362a3ef5fd55b3ef6e1a2ff617 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Wed, 17 Jun 2020 14:31:35 -0700 Subject: [PATCH 0431/1390] Internal change PiperOrigin-RevId: 316965990 Change-Id: Ic5ee6b8b3fc04c7d5ca1548f8b638ba9c12a32d6 --- third_party/mlir/BUILD | 1 - third_party/mlir/test.BUILD | 1 - 2 files changed, 2 deletions(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index cb0b2f9dc8e..476b8566265 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -3286,7 +3286,6 @@ cc_library( ":LinalgTransforms", ":Pass", ":StandardOps", - ":StandardOpsTransforms", ":Support", ":Transforms", ":VectorToLLVM", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 14c2ba7778e..23287ce28d6 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -166,7 +166,6 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", "@llvm-project//mlir:TargetROCDLIR", From 69e36409c0eb96379f8a4e8d5219f64382aaf75e Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Wed, 17 Jun 2020 14:32:39 -0700 Subject: [PATCH 0432/1390] Update documentation for FC op in hexagon delegate. FC now supports relu activation function PiperOrigin-RevId: 316966229 Change-Id: If5a42ecca8aa9b6e94474e75614153f50ca8ae3b --- tensorflow/lite/delegates/hexagon/README.md | 2 +- tensorflow/lite/delegates/hexagon/builders/matmul_builder.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/hexagon/README.md b/tensorflow/lite/delegates/hexagon/README.md index 226f3a61077..deff36e80b8 100644 --- a/tensorflow/lite/delegates/hexagon/README.md +++ b/tensorflow/lite/delegates/hexagon/README.md @@ -74,7 +74,7 @@ are verified in `IsNodeSupportedByHexagon`: - depth_multiplier == 1 - dilation only supported when stride == 1 - Otherwise, stride height/width <= 3 -* FullyConnected (without any activation) +* FullyConnected * Hardswish * L2Normalization (without any activation) * Logistic (aka Sigmoid) diff --git a/tensorflow/lite/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/delegates/hexagon/builders/matmul_builder.cc index 0757ea6180e..6189294c3a1 100644 --- a/tensorflow/lite/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/delegates/hexagon/builders/matmul_builder.cc @@ -151,7 +151,6 @@ TfLiteStatus AddFullyConnectedHelper(const TfLiteIntArray* inputs, // Data (8-bit), Weights (const, 8-bit) => MatMul => MatMul out (int32) // MatMul out (int32), Bias (int32) => QuantizedBiasAdd => BiasAdd out (int32) // BiasAdd out (int32) => Requantize_32to8 => Output (8-bit) -// TODO(b/129276536): Add activation support. TfLiteStatus MatMulWithConstWeightsOpBuilder::PopulateSubGraph( const TfLiteIntArray* inputs, const TfLiteIntArray* outputs, TfLiteContext* context) { From b6ff68822a59578f942e4fb8076757da8db278ae Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 17 Jun 2020 14:32:53 -0700 Subject: [PATCH 0433/1390] Manually define `GpuAtomic{Max,Min}` for `long long`. CUDA only defines `atomic{Max,Min}` for `long long` for compute capability 3.5 or higher. Hence, to prevent compile errors due to failed instantiation when compiling on `sm30` (or other similar ones), we manually define the corresponding wrappers. 
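`GpuAtomicCasHelper` realizes these wrappers with the usual compare-and-swap retry loop. The real implementation is CUDA C++, so the following is only a toy Python model of that loop (the `ToyAtomicCell` class stands in for a 64-bit memory word with a hardware `atomicCAS`; none of this is TensorFlow API):

```python
import threading


class ToyAtomicCell(object):
  """Stand-in for a memory word that supports compare-and-swap."""

  def __init__(self, value):
    self.value = value
    self._lock = threading.Lock()

  def compare_and_swap(self, expected, desired):
    # Returns the previous value; the swap happened iff it equals `expected`,
    # mirroring the contract of CUDA's atomicCAS.
    with self._lock:
      old = self.value
      if old == expected:
        self.value = desired
      return old


def atomic_max(cell, value):
  old = cell.value
  while True:
    assumed = old
    old = cell.compare_and_swap(assumed, max(assumed, value))
    if old == assumed:  # Either our max was installed or it was already >= value.
      return old


cell = ToyAtomicCell(5)
atomic_max(cell, 9)
print(cell.value)  # 9
```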
PiperOrigin-RevId: 316966290 Change-Id: I813e331309d12bdc1c06f7a74cb45c20c1833a41 --- tensorflow/core/util/gpu_device_functions.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h index de55b5c33c4..d4e09a7fc98 100644 --- a/tensorflow/core/util/gpu_device_functions.h +++ b/tensorflow/core/util/gpu_device_functions.h @@ -820,6 +820,11 @@ __device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr, return detail::GpuAtomicCasHelper( ptr, [value](tensorflow::uint64 a) { return max(a, value); }); } + +__device__ inline int64 GpuAtomicMax(int64* ptr, int64 value) { + return detail::GpuAtomicCasHelper(ptr, + [value](int64 a) { return max(a, value); }); +} #endif CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax); @@ -885,6 +890,11 @@ __device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr, return detail::GpuAtomicCasHelper( ptr, [value](tensorflow::uint64 a) { return min(a, value); }); } + +__device__ inline int64 GpuAtomicMin(int64* ptr, int64 value) { + return detail::GpuAtomicCasHelper(ptr, + [value](int64 a) { return min(a, value); }); +} #endif CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMin, CudaAtomicMin); From ed2b3d6e1ec7999cd44e84c62d3c99a4b8f047b8 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 17 Jun 2020 14:47:39 -0700 Subject: [PATCH 0434/1390] Fix msan test in tf_saved_model_lift_variables test Now, all the tenors, generated from lift_variables_test.h, has initialized values. PiperOrigin-RevId: 316969327 Change-Id: If7f68aacf0c931430804dfaa1d73c0ea4be70c75 --- .../transforms/lift_variables_test_pass.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h index faecdf04368..0e6d844bed3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h @@ -96,15 +96,19 @@ class FakeSession : public tensorflow::Session { for (const std::string& output_name : output_names) { Tensor output; if (output_name == "dense/bias") { - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({50}))); + Tensor t = Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({50})); + t.flat().setZero(); + outputs->push_back(t); } else if (output_name == "dense/kernel") { - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({100, 50}))); + Tensor t = + Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({100, 50})); + t.flat().setZero(); + outputs->push_back(t); } else { // Create a scalar float tensor. - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({}))); + Tensor t = Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({})); + t.flat()(0) = 1.0f; + outputs->push_back(t); } } return Status::OK(); From 7a928592460e51c453ebaaecf965ef9ec963890b Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Wed, 17 Jun 2020 15:06:41 -0700 Subject: [PATCH 0435/1390] Add SaveOptions/CheckpointOptions to keras.Models.save_weights and keras_call_backs.ModelCheckpoint. 
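The practical effect is that checkpoint/SavedModel I/O options can be threaded through Keras saving. A hedged usage sketch against the API added in this patch (the options are left at their defaults; `experimental_io_device` is mentioned only as an example of what one might set):

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile("sgd", "mse")

# With save_weights_only=True the callback expects tf.train.CheckpointOptions;
# with save_weights_only=False it expects tf.saved_model.SaveOptions instead.
ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
    "/tmp/weights-{epoch:02d}.ckpt",
    save_weights_only=True,
    options=tf.train.CheckpointOptions())  # e.g. CheckpointOptions(experimental_io_device=...)

x = np.random.rand(8, 4).astype("float32")
y = np.random.rand(8, 1).astype("float32")
model.fit(x, y, epochs=2, callbacks=[ckpt_cb], verbose=0)

# The same kind of options object can be passed straight to save_weights:
model.save_weights("/tmp/final.ckpt", options=tf.train.CheckpointOptions())
```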
PiperOrigin-RevId: 316973333 Change-Id: I43f5b59ece4b862db41ab0e99f3c8df0a0d3b901 --- tensorflow/python/keras/callbacks.py | 30 +++++++++++++--- tensorflow/python/keras/callbacks_test.py | 34 +++++++++++++++++++ tensorflow/python/keras/engine/training.py | 10 ++++-- .../golden/v1/tensorflow.keras.-model.pbtxt | 2 +- .../v1/tensorflow.keras.-sequential.pbtxt | 2 +- ...ow.keras.callbacks.-model-checkpoint.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 2 +- ....keras.experimental.-wide-deep-model.pbtxt | 2 +- .../v1/tensorflow.keras.models.-model.pbtxt | 2 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 2 +- .../v2/tensorflow.keras.-sequential.pbtxt | 2 +- ...ow.keras.callbacks.-model-checkpoint.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 2 +- ....keras.experimental.-wide-deep-model.pbtxt | 2 +- .../v2/tensorflow.keras.models.-model.pbtxt | 2 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- 17 files changed, 82 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 1bca5419774..1fae5abd84b 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -54,7 +54,9 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.platform import tf_logging as logging from tensorflow.python.profiler import profiler_v2 as profiler +from tensorflow.python.saved_model import save_options as save_options_lib from tensorflow.python.training import checkpoint_management +from tensorflow.python.training.saving import checkpoint_options as checkpoint_options_lib from tensorflow.python.util import nest from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.tf_export import keras_export @@ -1115,6 +1117,9 @@ class ModelCheckpoint(Callback): epochs, the monitored metric may potentially be less reliable (it could reflect as little as 1 batch, since the metrics get reset every epoch). Defaults to `'epoch'`. + options: Optional `tf.train.CheckpointOptions` object if + `save_weights_only` is true or optional `tf.saved_model.SavedOptions` + object if `save_weights_only` is false. **kwargs: Additional arguments for backwards compatibility. Possible key is `period`. """ @@ -1127,6 +1132,7 @@ class ModelCheckpoint(Callback): save_weights_only=False, mode='auto', save_freq='epoch', + options=None, **kwargs): super(ModelCheckpoint, self).__init__() self._supports_tf_logs = True @@ -1140,6 +1146,20 @@ class ModelCheckpoint(Callback): self._batches_seen_since_last_saving = 0 self._last_batch_seen = 0 + if save_weights_only: + if options is None or isinstance( + options, checkpoint_options_lib.CheckpointOptions): + self._options = options or checkpoint_options_lib.CheckpointOptions() + else: + raise TypeError('If save_weights_only is True, then `options` must be' + 'either None or a tf.train.CheckpointOptions') + else: + if options is None or isinstance(options, save_options_lib.SaveOptions): + self._options = options or save_options_lib.SaveOptions() + else: + raise TypeError('If save_weights_only is False, then `options` must be' + 'either None or a tf.saved_model.SaveOptions') + # Deprecated field `load_weights_on_restart` is for loading the checkpoint # file from `filepath` at the start of `model.fit()` # TODO(rchao): Remove the arg during next breaking release. 
@@ -1269,9 +1289,10 @@ class ModelCheckpoint(Callback): self.best, current, filepath)) self.best = current if self.save_weights_only: - self.model.save_weights(filepath, overwrite=True) + self.model.save_weights( + filepath, overwrite=True, options=self._options) else: - self.model.save(filepath, overwrite=True) + self.model.save(filepath, overwrite=True, options=self._options) else: if self.verbose > 0: print('\nEpoch %05d: %s did not improve from %0.5f' % @@ -1280,9 +1301,10 @@ class ModelCheckpoint(Callback): if self.verbose > 0: print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath)) if self.save_weights_only: - self.model.save_weights(filepath, overwrite=True) + self.model.save_weights( + filepath, overwrite=True, options=self._options) else: - self.model.save(filepath, overwrite=True) + self.model.save(filepath, overwrite=True, options=self._options) self._maybe_remove_file() except IOError as e: diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 28f85304688..d180e85a1d9 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -49,9 +49,11 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.saved_model import save_options as save_options_lib from tensorflow.python.summary import summary_iterator from tensorflow.python.training import adam from tensorflow.python.training import checkpoint_management +from tensorflow.python.training.saving import checkpoint_options as checkpoint_options_lib try: import h5py # pylint:disable=g-import-not-at-top @@ -666,6 +668,38 @@ class KerasCallbacksTest(keras_parameterized.TestCase): mode=mode, save_freq=3) + # Case 9: `ModelCheckpoint` with valid and invalid `options` argument. + with self.assertRaisesRegexp(TypeError, 'tf.train.CheckpointOptions'): + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=True, + mode=mode, + options=save_options_lib.SaveOptions()) + with self.assertRaisesRegexp(TypeError, 'tf.saved_model.SaveOptions'): + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=False, + mode=mode, + options=checkpoint_options_lib.CheckpointOptions()) + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=True, + mode=mode, + options=checkpoint_options_lib.CheckpointOptions()) + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=False, + mode=mode, + options=save_options_lib.SaveOptions()) + def _get_dummy_resource_for_model_checkpoint_testing(self): def get_input_datasets(): diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 5567e1733a7..ccd184a8bc4 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -1979,7 +1979,11 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): save.save_model(self, filepath, overwrite, include_optimizer, save_format, signatures, options) - def save_weights(self, filepath, overwrite=True, save_format=None): + def save_weights(self, + filepath, + overwrite=True, + save_format=None, + options=None): """Saves all layer weights. 
Either saves in HDF5 or in TensorFlow format based on the `save_format` @@ -2032,6 +2036,8 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or '.keras' will default to HDF5 if `save_format` is `None`. Otherwise `None` defaults to 'tf'. + options: Optional `tf.train.CheckpointOptions` object that specifies + options for saving weights. Raises: ImportError: If h5py is not available when attempting to save in HDF5 @@ -2093,7 +2099,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): 'the TensorFlow format the optimizer\'s state will not be ' 'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.') % (optimizer,)) - self._trackable_saver.save(filepath, session=session) + self._trackable_saver.save(filepath, session=session, options=options) # Record this checkpoint so it's visible from tf.train.latest_checkpoint. checkpoint_management.update_checkpoint_state_internal( save_dir=os.path.dirname(filepath), diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index b62814e81cb..6318e577087 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -302,7 +302,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 7485a0b3c62..9b7b7736746 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -320,7 +320,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt index 5fb646e1c63..e6cc7aee5a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'filepath\', \'monitor\', \'verbose\', \'save_best_only\', \'save_weights_only\', \'mode\', \'save_freq\'], varargs=None, keywords=kwargs, defaults=[\'val_loss\', \'0\', \'False\', \'False\', \'auto\', \'epoch\'], " + argspec: "args=[\'self\', \'filepath\', \'monitor\', \'verbose\', \'save_best_only\', \'save_weights_only\', \'mode\', \'save_freq\', \'options\'], varargs=None, keywords=kwargs, defaults=[\'val_loss\', \'0\', \'False\', \'False\', \'auto\', \'epoch\', \'None\'], " } member_method { name: "on_batch_begin" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index bf980e5d116..976eb49d4c8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -303,7 +303,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index c214a5c3419..500aa28eae7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -303,7 +303,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 86868c9d17f..ad0edc64606 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -302,7 +302,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 05aa19a915a..b38c669df0f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -320,7 +320,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index b62814e81cb..6318e577087 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -302,7 +302,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], 
varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 7485a0b3c62..9b7b7736746 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -320,7 +320,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt index 5fb646e1c63..e6cc7aee5a8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'filepath\', \'monitor\', \'verbose\', \'save_best_only\', \'save_weights_only\', \'mode\', \'save_freq\'], varargs=None, keywords=kwargs, defaults=[\'val_loss\', \'0\', \'False\', \'False\', \'auto\', \'epoch\'], " + argspec: "args=[\'self\', \'filepath\', \'monitor\', \'verbose\', \'save_best_only\', \'save_weights_only\', \'mode\', \'save_freq\', \'options\'], varargs=None, keywords=kwargs, defaults=[\'val_loss\', \'0\', \'False\', \'False\', \'auto\', \'epoch\', \'None\'], " } member_method { name: "on_batch_begin" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index bf980e5d116..976eb49d4c8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -303,7 +303,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index c214a5c3419..500aa28eae7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -303,7 +303,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, 
defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 86868c9d17f..ad0edc64606 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -302,7 +302,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 05aa19a915a..b38c669df0f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -320,7 +320,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\', \'options\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\'], " } member_method { name: "set_weights" From b186ba0334e98ebbee5eacf7fc1c66897eaa19bc Mon Sep 17 00:00:00 2001 From: Li Lao Date: Wed, 17 Jun 2020 15:21:23 -0700 Subject: [PATCH 0436/1390] Add TraceMeProducer/Consumer for SharedBatchScheduler. PiperOrigin-RevId: 316976310 Change-Id: I1cd2a03390aedd7e8e85b2826ab3aadd096bafdd --- tensorflow/core/kernels/batching_util/BUILD | 2 ++ .../kernels/batching_util/batch_scheduler.h | 21 ++++++++++++++++++- .../batching_util/shared_batch_scheduler.h | 18 +++++++++++----- .../core/profiler/lib/connected_traceme.h | 1 + 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index 370c96c6e7f..803eb2e9048 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -70,6 +70,7 @@ cc_library( ":batch_scheduler_hdrs", ":periodic_function_dynamic", "//tensorflow/core:framework_headers_lib", + "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", ], ) @@ -81,6 +82,7 @@ cc_library( ":batch_scheduler", ":periodic_function_dynamic", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", ], alwayslink = 1, diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h index e418f8acbb1..d0e1d20bed4 100644 --- a/tensorflow/core/kernels/batching_util/batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h @@ -77,7 +77,8 @@ class BatchTask { template class Batch { public: - Batch() = default; + Batch(); + explicit Batch(uint64 traceme_context_id); virtual ~Batch(); // Blocks until the batch is closed. // Appends 'task' to the batch. After calling AddTask(), the newly-added task @@ -113,6 +114,9 @@ class Batch { // Marks the batch as closed. Dies if called more than once. 
void Close(); + // Returns the TraceMe context id of this batch. + uint64 traceme_context_id() const; + private: mutable mutex mu_; @@ -125,6 +129,9 @@ class Batch { // Whether the batch has been closed. Notification closed_; + // The TracMe context id. + const uint64 traceme_context_id_; + TF_DISALLOW_COPY_AND_ASSIGN(Batch); }; @@ -187,6 +194,13 @@ class BatchScheduler { ////////// // Implementation details follow. API users need not read. +template +Batch::Batch() : Batch(0) {} + +template +Batch::Batch(uint64 traceme_context_id) + : traceme_context_id_(traceme_context_id) {} + template Batch::~Batch() { WaitUntilClosed(); @@ -275,6 +289,11 @@ void Batch::Close() { closed_.Notify(); } +template +uint64 Batch::traceme_context_id() const { + return traceme_context_id_; +} + } // namespace serving } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index 66bdff933d8..e47e069eff5 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" namespace tensorflow { @@ -311,6 +312,9 @@ class Queue { // The enqueued batches. See the invariants in the class comments above. std::deque>> batches_ TF_GUARDED_BY(mu_); + // The counter of the TraceMe context ids. + uint64 traceme_context_id_counter_ TF_GUARDED_BY(mu_) = 0; + // The time at which the first task was added to the open (back-most) batch // in 'batches_'. Valid iff that batch contains at least one task. 
uint64 open_batch_start_time_micros_ TF_GUARDED_BY(mu_); @@ -529,8 +533,6 @@ Queue::~Queue() { template Status Queue::Schedule(std::unique_ptr* task) { - profiler::TraceMe trace_me( - [task] { return strings::StrCat("Schedule:", (*task)->size()); }); if ((*task)->size() > options_.max_batch_size) { return errors::InvalidArgument("Task size ", (*task)->size(), " is larger than maximum batch size ", @@ -554,6 +556,10 @@ Status Queue::Schedule(std::unique_ptr* task) { if (batches_.back()->empty()) { open_batch_start_time_micros_ = env_->NowMicros(); } + profiler::TraceMeProducer trace_me( + [&] { return strings::StrCat("Schedule:", (*task)->size()); }, + profiler::ContextType::kSharedBatchScheduler, + batches_.back()->traceme_context_id()); batches_.back()->AddTask(std::move(*task)); if (!schedulable_batch_) { @@ -621,8 +627,10 @@ std::unique_ptr> Queue::ScheduleBatch() { template void Queue::ProcessBatch(std::unique_ptr> batch) { - profiler::TraceMe trace_me( - [&batch] { return strings::StrCat("ProcessBatch:", batch->size()); }); + profiler::TraceMeConsumer trace_me( + [&batch] { return strings::StrCat("ProcessBatch:", batch->size()); }, + profiler::ContextType::kSharedBatchScheduler, + batch->traceme_context_id()); process_batch_callback_(std::move(batch)); { @@ -665,7 +673,7 @@ bool Queue::IsEmptyInternal() const { template void Queue::StartNewBatch() { batches_.back()->Close(); - batches_.emplace_back(new Batch); + batches_.emplace_back(new Batch(++traceme_context_id_counter_)); } template diff --git a/tensorflow/core/profiler/lib/connected_traceme.h b/tensorflow/core/profiler/lib/connected_traceme.h index cbc610af407..ed8b4ac1ad2 100644 --- a/tensorflow/core/profiler/lib/connected_traceme.h +++ b/tensorflow/core/profiler/lib/connected_traceme.h @@ -28,6 +28,7 @@ namespace profiler { enum class ContextType : int { kGeneric, kTfExecutor, + kSharedBatchScheduler, }; /* From 19c51afbf1fd2f724cabc42313ce84e31a4defcc Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 17 Jun 2020 15:42:14 -0700 Subject: [PATCH 0437/1390] Do not emit op errors on TF -> TFL legalization Applying legalization patterns will emit unwanted, transient errors when the replaced TFLite ops do not meet the sanity checks. In order to ignore the transient errors, the following lines override a diagnostic handler with an no-op handler only while this pass runs. PiperOrigin-RevId: 316980278 Change-Id: Idef14e13f36ff0ee3c4bb1a401f92ba217042dbb --- .../compiler/mlir/lite/converter_gen.cc | 18 +++++----------- .../mlir/lite/ir/tfl_op_interfaces.td | 3 +-- .../compiler/mlir/lite/tests/legalize-tf.mlir | 7 +++++++ .../mlir/lite/transforms/legalize_tf.cc | 21 ++++++++++++++++--- .../mlir/lite/transforms/runtime_verify.cc | 3 +-- 5 files changed, 32 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc index 6df569a8031..edead2037a3 100644 --- a/tensorflow/compiler/mlir/lite/converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -446,7 +446,7 @@ static void GenOperandResultVerifier(raw_ostream &os, auto desc = definit->getDef()->getValueAsString("tflRuntimeTypeDescription"); - // Emit a loop to check all the dynamic values in the pack. + // Emit a loop to check all operands. 
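+  // The emitted verifier is roughly of the form (sketch, not verbatim output;
+  // shown for the operand case):
+  //   for (Value v : top.getODSOperands(N)) {
+  //     if (!(<type predicate on v.getType()>))
+  //       return op->emitOpError("operand #") << index
+  //              << " must be <description>, but got " << v.getType();
+  //     ++index;
+  //   }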
os << formatv(" for (Value v : top.getODS{0}{1}s({2})) {{\n", // Capitalize the first letter to match the function name valueKind.substr(0, 1).upper(), valueKind.substr(1), @@ -455,14 +455,10 @@ static void GenOperandResultVerifier(raw_ostream &os, os << " (void)v;\n" << " if (!(" << tgfmt(pred.getCondition(), &fctx.withSelf("v.getType()")) << ")) {\n" - << " if (failure_on_operand_type_mismatch) {\n" << formatv( - " return op->emitOpError(\"{0} #\") << index " + " return op->emitOpError(\"{0} #\") << index " "<< \" must be {1}, but got \" << v.getType();\n", valueKind, desc) - << " } else {\n" - << " return ::mlir::LogicalResult::Failure;\n" - << " }\n" << " }\n" // if << " ++index;\n" << " }\n"; // for @@ -487,8 +483,7 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { mlir::tblgen::FmtContext verify_ctx; os << "::mlir::LogicalResult " << op.getCppClassName() - << "::VerifyTflRuntimeConstraints(::mlir::Operation *op, bool " - "failure_on_operand_type_mismatch) {\n"; + << "::VerifyTflRuntimeConstraints(::mlir::Operation *op) {\n"; os << " auto top = cast<" << op.getCppClassName() << ">(op); (void)top;\n"; verify_ctx.withOp("top"); @@ -529,11 +524,8 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { mlir::tblgen::Pred pred(dyn_cast(val->getValue())); os << tgfmt( - " if (!($0)) {\n " - " if (failure_on_operand_type_mismatch) {\n" - " return top.emitOpError(\"failed to verify that $1\");\n" - " } else {\n" - " return ::mlir::LogicalResult::Failure;\n }\n }\n", + " if (!($0))\n" + " return top.emitOpError(\"failed to verify that $1\");\n", &verify_ctx, tgfmt(pred.getCondition(), &verify_ctx), desc); } os << " return top.verify();\n}\n"; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index 23101113a6f..a79d79b5970 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -94,8 +94,7 @@ def TFL_RuntimeVerification : OpInterface<"TflRuntimeVerifyOpInterface"> { let methods = [ StaticInterfaceMethod< [{Returns whether the op's operands/results are supported by runtime.}], - "LogicalResult", "VerifyTflRuntimeConstraints", - (ins "Operation*":$op, "bool":$failure_on_operand_type_mismatch) + "LogicalResult", "VerifyTflRuntimeConstraints", (ins "Operation*":$op) >, ]; } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 1ae789f5468..5756fa6dec2 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -990,6 +990,13 @@ func @batch_to_space_nd(%arg0: tensor<4x2x2x3xf32>, %arg1: tensor<2xi32>, %arg2: // CHECK: "tfl.batch_to_space_nd"(%arg0, %arg1, %arg2) : (tensor<4x2x2x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor } +func @batch_to_space_nd_unsupported(%arg0: tensor, %arg1: tensor<3xi32>, %arg2: tensor<3x2xi32>) -> tensor { + %0 = "tf.BatchToSpaceND"(%arg0, %arg1, %arg2) : (tensor, tensor<3xi32>, tensor<3x2xi32>) -> tensor + return %0 : tensor + // CHECK-LABEL: batch_to_space_nd_unsupported + // CHECK: "tf.BatchToSpaceND" +} + func @space_to_batch_nd(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<2x2xi32>) -> tensor { %0 = "tf.SpaceToBatchND"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc 
b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 46ed134d7ee..1328a2baf5d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -28,9 +28,11 @@ limitations under the License. #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Threading.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -767,13 +769,26 @@ void LegalizeTF::runOnFunction() { [](Operation* op) { auto tfl_op = dyn_cast_or_null(op); if (!tfl_op) return false; - return succeeded(tfl_op.VerifyTflRuntimeConstraints( - tfl_op.getOperation(), - /*failure_on_operand_type_mismatch=*/false)); + return succeeded(tfl_op.VerifyTflRuntimeConstraints(op)); })); } else { target.addLegalDialect(); } + + // Ignore transient errors by registering an no-op handler. + // Applying legalization patterns will emit unwanted, transient errors when + // the replaced TFLite ops do not meet the sanity checks. In order to ignore + // the transient errors, the following lines override a diagnostic handler + // with an no-op handler only while this pass runs. + uint64_t current_thread_id = llvm::get_threadid(); + ScopedDiagnosticHandler scoped_diag_handler( + context, [¤t_thread_id](Diagnostic&) -> LogicalResult { + // Consume only errors that are coming from the same thread in order not + // to ignore errors from other passes that are running. Things running + // in the pass manager can be multi-threaded. + return success(current_thread_id == llvm::get_threadid()); + }); + // Keep trying to convert. // TODO(karimnosseir): This is similar to what apply greedy patterns does. // Look if there is a function that tries until it converge. diff --git a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc index 3268329b1c1..cc2e691180e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc +++ b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc @@ -34,8 +34,7 @@ class RuntimeVerifyPass void RuntimeVerifyPass::runOnFunction() { getFunction().walk([&](TflRuntimeVerifyOpInterface op) { - if (failed(op.VerifyTflRuntimeConstraints( - op.getOperation(), /*failure_on_operand_type_mismatch=*/true))) + if (failed(op.VerifyTflRuntimeConstraints(op.getOperation()))) signalPassFailure(); }); } From db3640aaf1e1b520fa59cc134f822cbb7a2ed270 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 15:43:59 -0700 Subject: [PATCH 0438/1390] tf.numpy Add a README.md PiperOrigin-RevId: 316980572 Change-Id: I441a3d9dc0d36b663a47721f2335b4178a71db65 --- tensorflow/python/ops/numpy_ops/README.md | 94 +++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tensorflow/python/ops/numpy_ops/README.md diff --git a/tensorflow/python/ops/numpy_ops/README.md b/tensorflow/python/ops/numpy_ops/README.md new file mode 100644 index 00000000000..3dc37423d26 --- /dev/null +++ b/tensorflow/python/ops/numpy_ops/README.md @@ -0,0 +1,94 @@ +# tf.experimental.numpy + +This module provides a subset of numpy API, built on top of TensorFlow +operations. 
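+
+As a minimal usage sketch (assuming the module is exposed as
+`tf.experimental.numpy`; the exact import path may differ):
+
+```python
+import tensorflow as tf
+
+tnp = tf.experimental.numpy  # assumed public alias for this module
+
+x = tnp.ones([2, 3])    # ndarray backed by an immutable tf.Tensor
+y = tnp.sum(x * 2.0)    # numpy-style ops are implemented with TF ops
+z = tf.reduce_sum(x)    # ndarrays can be passed directly to TF APIs
+```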
+APIs are based on numpy version 1.16.
+
+The set of supported APIs may be expanded over time. Also, future releases may
+change the baseline version of the numpy API being supported. Some
+systematic differences with numpy are listed later in the "Differences with
+Numpy" section.
+
+## Types
+
+The module provides an `ndarray` class which wraps an immutable `tf.Tensor`.
+Additional functions are provided which accept array-like objects. Here,
+array-like objects include `ndarrays` as defined by this module, as well as
+`tf.Tensor`, in addition to types accepted by `numpy`.
+
+A subset of `numpy` dtypes is supported, along with `tf.bfloat16`.
+Additionally, support is provided for selecting the default float type
+(`np.float32` vs `np.float64`) given that some applications may prefer lower
+precision.
+
+## Device Support
+
+Given that `ndarray` and functions wrap TensorFlow constructs, the code will
+have GPU and TPU support on par with TensorFlow. Also, the code can be wrapped
+with `tf.function` and compiled with XLA. Device placement can be controlled by
+using `with tf.device` scopes.
+
+## Graph and Eager Modes
+
+Eager mode execution should typically match numpy semantics of executing
+op-by-op. However, the same code can be executed in graph mode by putting it
+inside a `tf.function`. This can change the behavior of certain operations since
+symbolic execution may not have information that is only computed at runtime.
+
+Some differences are:
+
+ * Shapes can be incomplete or unknown. This means that `ndarray.shape`,
+   `ndarray.size` and `ndarray.ndim` can return `ndarray` objects instead of
+   returning integer (or tuple of integer) values.
+ * Python control flow based on `ndarray` values may not work and may have to
+   be rewritten to use `tf.cond` or `tf.while_loop`. Note that autograph
+   conversion as part of `tf.function` should still work.
+ * `__len__`, `__iter__` and `__index__` properties of `ndarray` may similarly
+   not work in graph mode.
+
+## Mutation and Variables
+
+`ndarrays` currently wrap immutable `tf.Tensor`. Also, mutation operations such
+as slice assignment are currently not supported. This may change in the future.
+
+There is currently no explicit construct on par with `tf.Variable`. However, one
+can directly construct a `tf.Variable` and use that with the numpy APIs in this
+module. See the section on Interop.
+
+## Interop
+
+The numpy API calls can be interleaved with TensorFlow calls without incurring
+Tensor data copies. This is true even if the `ndarray` or `tf.Tensor` is placed
+on a non-CPU device.
+
+Additionally, one could put these calls in a `with tf.GradientTape()` context to
+compute gradients through the numpy API calls. Similarly, code vectorization can
+be done using `tf.vectorized_map()`.
+
+In general, the expected behavior should be on par with that of code involving
+`tf.Tensor` and running stateless TensorFlow functions on them.
+
+## Array Interface
+
+The `ndarray` class implements the `__array__` interface. This should allow
+these objects to be passed into contexts that expect a `numpy` or array-like
+object (e.g. matplotlib).
+
+
+## Differences with Numpy
+
+Here is a non-exhaustive list of differences:
+
+ * Not all dtypes are currently supported, e.g. `np.float96` and `np.float128`.
+   The `np.object`, `np.str`, and `np.recarray` types are not supported.
+ * `ndarray` storage is in C order only. Fortran order, views, and stride_tricks
+   are not supported.
+ * Only a subset of functions and modules is supported. This set will be
+   expanded over time. For supported functions, some arguments or argument
+   values may not be supported. These differences are listed in the function
+   comments.
+ * Buffer mutation is currently not supported. `ndarrays` wrap immutable
+   tensors. This means that output buffer arguments (e.g. `out` in ufuncs) are
+   not supported.
+ * Full `ufunc` support is not provided.
+ * The numpy C API is not supported. Numpy's Cython and SWIG integration are not
+   supported.
From eb0d46affb1eccfb4de1463ef9c70fdedc227d22 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 15:44:12 -0700 Subject: [PATCH 0439/1390] Import initialization graph in SignatureDef SavedModels as an MLIR function in TF saved model dialect. PiperOrigin-RevId: 316980623 Change-Id: I3dafe9303ad65192ca0c137943be3cd1ea0e74e7 --- tensorflow/compiler/mlir/tensorflow/BUILD | 4 +- .../mlir/tensorflow/ir/tf_saved_model.cc | 25 --- .../mlir/tensorflow/ir/tf_saved_model_ops.td | 24 --- .../tests/tf_saved_model/common_v1.py | 1 - .../tests/tf_saved_model/hash_table_v1.py | 92 ----------- .../tensorflow/tests/tf_saved_model_ops.mlir | 5 - .../tests/tf_saved_model_ops_invalid.mlir | 33 ---- .../mlir/tensorflow/translate/import_model.cc | 149 +++++------------- 8 files changed, 43 insertions(+), 290 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 17ed0e36a28..904ccb7e820 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -661,9 +661,7 @@ cc_library( ":tensorflow_types", ":translate_utils", "//tensorflow/cc/saved_model:bundle_v2", - "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader_lite", - "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", @@ -675,7 +673,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler/utils:transitive_fanin", - "//tensorflow/core/platform:protobuf_internal", "//tensorflow/core/platform:types", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", @@ -685,6 +682,7 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps",
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 6af70158e14..140a778770c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -76,23 +76,6 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) { return success(); } -static LogicalResult Verify(SessionInitializerOp session_initializer) { - mlir::SymbolTable symbol_table( - session_initializer.getParentOfType()); - - auto init_func_op = - symbol_table.lookup(session_initializer.initializer()); - if (!init_func_op) - return session_initializer.emitOpError() - << "the initializer function does not exist"; - - if (!init_func_op.getType().getResults().empty()) - return session_initializer.emitOpError() - << "the initializer function should have no output"; - - return success(); -} - #define GET_OP_CLASSES #include
"tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" @@ -237,14 +220,6 @@ static LogicalResult VerifySavedModelModule( } } } - - auto session_initializers = module.getOps(); - if (std::distance(session_initializers.begin(), session_initializers.end()) > - 1) { - return (*++session_initializers.begin()).emitError() - << "there must be no more than one session_initializer op"; - } - SymbolTable symbol_table(module); auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion()); if (!symbol_uses.hasValue()) { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index 497f4d90cb9..4431a160edf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -128,28 +128,4 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> { let verifier = [{ return Verify(*this); }]; } -def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> { - let summary = "Initializes TensorFlow session state."; - let description = [{ - Represents a session initializer function initializes TensorFlow session - state. It is used to initialize resources in the saved model before calling - any exported functions. There must be no more than one session initializer - in a saved model. - - The `initializer` represents the initialization function. The function have - no output and this function should be only called once. - - This is used, for example, to initialize hash tables stored in resources and - accessed by resource name (rather than as resource handles or bound inputs - which is how `global_tensor`s are referenced) - }]; - - let arguments = (ins - FlatSymbolRefAttr:$initializer - ); - - - let verifier = [{ return Verify(*this); }]; -} - #endif // SAVED_MODEL_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py index 51ccbeb1fbd..7171f63bb05 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py @@ -84,7 +84,6 @@ def do_test(signature_def_map, show_debug_info=False): builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map, - main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py deleted file mode 100644 index 64847434b82..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -# RUN: %p/hash_table_v1 | FileCheck %s - -# pylint: disable=missing-docstring,line-too-long -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow.compat.v1 as tf -from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 - -# Verify that the tf.versions attribute exists. It is difficult to enforce -# contents, since the version numbers change over time. The conversion logic -# itself is verified in the common graphdef converter, so here just assert -# it is being invoked. -# CHECK: module -# CHECK-SAME: tf.versions -# CHECK-SAME: bad_consumers -# CHECK-SAME: min_consumer -# CHECK-SAME: producer - -# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> () -# CHECK: "tf_saved_model.global_tensor"() - -# CHECK: func {{@[a-zA-Z_0-9]+}}( -# CHECK-SAME: [[ARG0:%.*]]: tensor -# CHECK-SAME: [[ARG1:%.*]]: tensor () - // Representation for constants: (immutable) global tensor. // CHECK: tf_saved_model.global_tensor "tf_saved_model.global_tensor"() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index 544600cf6b8..c055c6c9f56 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -258,36 +258,3 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{'type' attribute for immutable 'tf_saved_model.global_tensor' should have a static shape}} "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () } - -// ----- - -module attributes {tf_saved_model.semantics} { - - // expected-error@+1 {{the initializer function does not exist}} - "tf_saved_model.session_initializer"() { initializer = @init } : () -> () -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // expected-error@+1 {{the initializer function should have no output}} - "tf_saved_model.session_initializer"() { initializer = @init } : () -> () - func @init() -> tensor<1xf32> { - %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> - return %0 : tensor<1xf32> - } -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - "tf_saved_model.session_initializer"() { initializer = @init } : () -> () - // expected-error@+1 {{there must be no more than one session_initializer op}} - "tf_saved_model.session_initializer"() { initializer = @init } : () -> () - func @init() -> tensor<1xf32> { - %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> - return %0 : tensor<1xf32> - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 3cff4217215..820d0ce31fb 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -60,8 +60,6 @@ limitations under the License. 
#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "tensorflow/cc/saved_model/constants.h" -#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" @@ -101,7 +99,6 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" @@ -119,7 +116,6 @@ using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; -using mlir::tf_saved_model::SessionInitializerOp; using stream_executor::port::StatusOr; namespace { @@ -2959,13 +2955,6 @@ void SortSavedModelModule(mlir::ModuleOp module) { named_global_tensor.global_tensor.getOperation()->moveBefore( &module.getBody()->front()); } - - auto initializers = module.getOps(); - if (!initializers.empty()) { - (*initializers.begin()) - .getOperation() - ->moveBefore(&module.getBody()->front()); - } } Status CreateSavedModelIR( @@ -3252,29 +3241,17 @@ class SavedModelSignatureDefImporter { absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), - flib_def_(OpRegistry::Global(), graph_def().library()), - debug_info_(), exported_names_(exported_names), - module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) { - // debug_info might not be loaded with loader_lite. - if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info; - } + module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function // for each signature. StatusOr ConvertSignatures(); - Status ConvertSignature(const std::string& sig_def_key, - const SignatureDef& signature_def); - - // Converts the initialization graph in the SavedModel to an MLIR function. - Status ConvertInitializer(); - - // Converts a graph with feeds and fetches to an MLIR function. - StatusOr ConvertGraph( - const std::string& name, - const std::vector>& inputs, - const std::vector>& outputs, - const std::vector control_outputs); + Status ConvertSignature(const GraphDef& graphdef, + const std::string& sig_def_key, + const SignatureDef& signature_def, + const GraphDebugInfo& debug_info, + const FunctionLibraryDefinition& flib_def); // Creates GlobalTensorOp for each variable and moves each VarHandle op to // the enclosing function's arguments. 
@@ -3296,62 +3273,18 @@ class SavedModelSignatureDefImporter { GraphImportConfig::InputArrays ParseInputArrays( const std::vector>& inputs); - const GraphDef& graph_def() const { - return bundle_.meta_graph_def.graph_def(); - } - const FunctionLibraryDefinition& flib_def() const { return flib_def_; } - const GraphDebugInfo& debug_info() const { return debug_info_; } - const SavedModelBundle& bundle_; - FunctionLibraryDefinition flib_def_; - GraphDebugInfo debug_info_; absl::Span exported_names_; mlir::OwningModuleRef module_; }; -Status SavedModelSignatureDefImporter::ConvertInitializer() { - std::vector asset_file_defs; - TF_RETURN_IF_ERROR( - internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs)); - - if (!asset_file_defs.empty()) - return errors::Unimplemented( - absl::StrCat("Assets are not supported in signaturedef importer")); - - std::string init_node_name; - TF_RETURN_IF_ERROR( - internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name)); - - if (init_node_name.empty()) return Status::OK(); - - TF_ASSIGN_OR_RETURN(auto sub_module, - ConvertGraph(init_node_name, {}, {}, {init_node_name})); - - mlir::SymbolTable symbol_table(*sub_module); - - auto init_func_op = symbol_table.lookup(init_node_name); - - init_func_op.removeAttr("tf.entry_function"); - - mlir::OpBuilder builder(module_->getBodyRegion()); - - builder.create( - module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName())); - - // Move the converted functions to top level MLIR module. - auto* block = module_->getBody(); - auto* sub_block = sub_module->getBody(); - block->getOperations().splice( - mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(), - sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator())); - - return Status::OK(); -} - StatusOr SavedModelSignatureDefImporter::ConvertSignatures() { const auto& signatures = bundle_.GetSignatures(); - PopulateTfVersions(module_.get(), graph_def().versions()); + const auto& graphdef = bundle_.meta_graph_def.graph_def(); + PopulateTfVersions(module_.get(), graphdef.versions()); + + FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library()); // debug_info might not be loaded with loader_lite. GraphDebugInfo debug_info; @@ -3374,10 +3307,9 @@ SavedModelSignatureDefImporter::ConvertSignatures() { continue; } - TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def)); + TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, + debug_info, flib_def)); } - - TF_RETURN_IF_ERROR(ConvertInitializer()); TF_RETURN_IF_ERROR(LiftVariables()); mlir::OpBuilder builder(module_->getBodyRegion()); @@ -3388,32 +3320,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { return std::move(module_); } -StatusOr SavedModelSignatureDefImporter::ConvertGraph( - const std::string& name, - const std::vector>& inputs, - const std::vector>& outputs, - const std::vector control_outputs) { - GraphImportConfig specs; - specs.prune_unused_nodes = true; - specs.inputs = ParseInputArrays(inputs); - for (auto& output : outputs) specs.outputs.push_back(output.second.name()); - specs.control_outputs = control_outputs; - - // Convert sub-graphdef to sub-graph. 
- GraphConstructorOptions options; - options.allow_internal_ops = true; - options.add_default_attributes = true; - Graph graph(OpRegistry::Global()); - - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph)); - - // Convert sub-graph to MLIR module.true - return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(), - flib_def(), specs, name); -} - Status SavedModelSignatureDefImporter::ConvertSignature( - const std::string& sig_def_key, const SignatureDef& signature_def) { + const GraphDef& graphdef, const std::string& sig_def_key, + const SignatureDef& signature_def, const GraphDebugInfo& debug_info, + const FunctionLibraryDefinition& flib_def) { // Create local vectors for the input and output and sort them to be // deterministic. We don't want anyone to really depend on the order, client // should lookup argument/result mapping by attribute name. @@ -3429,9 +3339,34 @@ Status SavedModelSignatureDefImporter::ConvertSignature( return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first; }); + GraphImportConfig specs; + specs.prune_unused_nodes = true; + specs.inputs = ParseInputArrays(inputs); + for (auto& output : outputs) specs.outputs.push_back(output.second.name()); + + // Remove unused nodes and create sub-graphdef. + GraphDef sub_graph_def; + TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph( + graphdef, &sub_graph_def, + /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); + + // Set the function library definitions in the pruned graphdef. + *sub_graph_def.mutable_library() = flib_def.ToProto(); + + // Convert sub-graphdef to sub-graph. + GraphConstructorOptions options; + options.allow_internal_ops = true; + options.add_default_attributes = true; + Graph sub_graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph)); + // Convert sub-graph to MLIR module. - TF_ASSIGN_OR_RETURN(auto sub_module, - ConvertGraph(sig_def_key, inputs, outputs, {})); + TF_ASSIGN_OR_RETURN( + auto sub_module, + GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info, + flib_def, specs, sig_def_key)); mlir::OpBuilder builder(sub_module->getBodyRegion()); // Find the FuncOp which corresponds to current SignatureDef. From 02be9b5ef8b3faac54feea8ee99ff14c62d40704 Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 17 Jun 2020 15:46:30 -0700 Subject: [PATCH 0440/1390] Remove erroneous end-of-namespace comment at the end of a function. PiperOrigin-RevId: 316981028 Change-Id: I06c1063ba6206fe301f8fd6b89e3ac435ec56ba6 --- tensorflow/lite/kernels/lstm_eval.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 4ac9e538317..2e7f300f9a9 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -893,7 +893,7 @@ inline void LstmStepHybrid( std::copy_n(output_state_ptr + b * n_output, n_output, output_ptr + b * output_batch_leading_dim); } -} // namespace +} // Fully quantized lstm kernel for 16 bit gate matmul output. // From 09bdeb13ebb20f12c76e3d0bd852f42b2f63ffef Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 17 Jun 2020 15:46:55 -0700 Subject: [PATCH 0441/1390] Fix updating results of TPU cluster with parallel_execute results when TPU cluster results are not perfectly forwarded. 
After TPUExtractHeadTailOutsideCompilation some results left of the TPU cluster may not be used by an optional tf_device.replicate op if there is data parallelism. Instead, all results should be remapped if they are used outside of parallel_execute. PiperOrigin-RevId: 316981114 Change-Id: I5529074857e06cfe26a7141c262a6229fe848be6 --- .../tpu_extract_outside_compilation.mlir | 22 +++++++++++++++++- .../tpu_extract_outside_compilation.cc | 23 ++++++++----------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 6bb8e99d796..d88489f5da0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -262,7 +262,6 @@ func @single_outside_compiled_input_output_single_outside_compilation(%arg0: ten return %1 : tensor } - // Tests extraction of a single outside compiled cluster with multiple input/output. // CHECK-LABEL: func @multiple_outside_compiled_input_output_single_outside_compilation @@ -439,3 +438,24 @@ func @multiple_outside_compiled_inputs_single_outside_compilation(%arg0: tensor< return %1 : tensor } + +// Tests only directly used results of tpu cluster are remapped with +// parallel_execute. + +// CHECK-LABEL: func @remapped_results +func @remapped_results(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]]#1 : tensor + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2:2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5:2 = "tf.C"(%4) : (tensor) -> (tensor, tensor) + tf_device.return %5#0, %5#1 : tensor, tensor + }) {cluster_attr = "cluster_attr"} : () -> (tensor, tensor) + tf_device.return %2#1 : tensor + } + return %1 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 54600faca4b..a2a19108326 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -108,18 +108,6 @@ tf_device::LaunchOp CreateLaunchOpForOutsideCluster( return launch_op; } -// Propagates the return from `parallel_execute_op` to parent replicate -// op if it exists. -void PropagateParallelExecuteReturnToReplicate( - tf_device::ParallelExecuteOp parallel_execute_op) { - // Update the return for the parallel_execute op parent. - auto replicate = llvm::dyn_cast_or_null( - parallel_execute_op.getParentOp()); - if (replicate) - replicate.GetBody().getTerminator()->setOperands( - parallel_execute_op.execute_outputs()); -} - // Extracts all externally provided operands of `cluster_ops`. 
llvm::SmallSetVector GetExternalOperands( llvm::ArrayRef cluster_ops) { @@ -305,7 +293,16 @@ void CreateParallelExecuteFromOutsideClusters( tpu_cluster.getOperation()->moveBefore( parallel_execute_tpu_block.getTerminator()); - PropagateParallelExecuteReturnToReplicate(parallel_execute_op); + // Remap cluster results with parallel_execute results if user is outside of + // parallel_execute. + for (auto result : + llvm::zip(tpu_cluster.getResults(), parallel_execute_op.getResults())) { + Value tpu_cluster_result = std::get<0>(result); + Value parallel_execute_result = std::get<1>(result); + for (auto& use : llvm::make_early_inc_range(tpu_cluster_result.getUses())) + if (!parallel_execute_op.getOperation()->isProperAncestor(use.getOwner())) + use.set(parallel_execute_result); + } } void TPUExtractOutsideCompilation::runOnFunction() { From 259ce6bd35c487ca39b375f16dc91d3636a9e838 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 15:47:01 -0700 Subject: [PATCH 0442/1390] Move the ScatterExpander pass after the DynamicPadder pass on cpu. DynamicPadder will pad the indices of scatter(operand 1) if indices is of dynamic size. If ScatterExpander pass is run first, scatter will be rewrite to a sequence of other ops and this pad will not be performed and give incorrect answer. PiperOrigin-RevId: 316981132 Change-Id: I2662d783e73a0467ebd4f1af21f7ac6cf93faef5 --- tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b2416ac2799..31b9fe1c920 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -277,12 +277,12 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); - pipeline.AddPass(); pipeline.AddPass( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(target_machine_features); { From 2053b238d0d717aef4dfcf4be03daa43d7eeede2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 15:50:24 -0700 Subject: [PATCH 0443/1390] add two internal stats. PiperOrigin-RevId: 316981769 Change-Id: Ia53c128b82fa750942acb5441ae6c8dc31478b07 --- tensorflow/core/profiler/utils/xplane_schema.cc | 5 ++++- tensorflow/core/profiler/utils/xplane_schema.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 197dab75d3b..be53dcbdc01 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -166,6 +166,8 @@ const StatTypeMap& GetStatTypeMap() { {"is_eager", kIsEager}, {"tf_function_call", kTfFunctionCall}, {"tracing_count", kTfFunctionTracingCount}, + {"flops", kFlops}, + {"bytes_accessed", kBytesAccessed}, // Performance counter related. 
{"Raw Value", kRawValue}, {"Scaled Value", kScaledValue}, @@ -227,7 +229,8 @@ bool IsInternalStat(absl::optional stat_type) { StatType::kKernelDetails, StatType::kLevel0, StatType::kProducerType, StatType::kProducerId, StatType::kConsumerType, StatType::kConsumerId, - StatType::kIsRoot, StatType::kIsAsync}; + StatType::kIsRoot, StatType::kIsAsync, + StatType::kFlops, StatType::kBytesAccessed}; return stat_type.has_value() && kInternalStats->contains(*stat_type); } diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 8b999dc6f9f..a31814cef06 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -153,6 +153,8 @@ enum StatType { kIsEager, kTfFunctionCall, kTfFunctionTracingCount, + kFlops, + kBytesAccessed, // Performance counter related. kRawValue, kScaledValue, From 97afd248868c7f28c197abde87cf610b550bdef9 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 17 Jun 2020 10:00:23 -0700 Subject: [PATCH 0444/1390] Relu grad GPU uses 8 float16 element vector --- tensorflow/core/kernels/relu_op_gpu.cu.cc | 89 +++++++++++++++++++++-- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 27fd5f64249..983cc127863 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,6 +35,7 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; +static constexpr int VectorSizeElements = 8; namespace functor { // This kernel computes ReluGrad by processing one half2, two fp16, at a time. @@ -93,6 +94,66 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient, } } +__global__ void ReluGradHalfKernelVector( + const Eigen::half* __restrict__ gradient, + const Eigen::half* __restrict__ feature, + Eigen::half* __restrict__ backprop, int32 count) { + int32 half8_count = count / VectorSizeElements; + int32 index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < half8_count) { + // Cast to xx_h8 for vector load and store. + float4 gradient_h8 = reinterpret_cast(gradient)[index]; + float4 feature_h8 = reinterpret_cast(feature)[index]; + float4* p_backprop_h8 = reinterpret_cast(backprop) + index; + + half2 *gradient_h2 = reinterpret_cast(&gradient_h8); + half2 *feature_h2 = reinterpret_cast(&feature_h8); + float4 backprop_h8; + half2* p_backprop_h2 = reinterpret_cast(&backprop_h8); + + // Fast path, when half2 primitives are available. +#if __CUDA_ARCH__ >= 530 + const half2 kZeroH2 = __float2half2_rn(0.f); +#endif + for (int i = 0; i < VectorSizeElements / 2; i++) { +#if __CUDA_ARCH__ >= 530 + // mask = (feature > 0) + half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2); + // backprop = mask * gradient + half2 backprop_h2 = __hmul2(mask_h2, gradient_h2[i]); +#else + // Fall back: convert half2 to float2 for processing. + float2 feature_f2 = __half22float2(feature_h2[i]); + float2 gradient_f2 = __half22float2(gradient_h2[i]); + float2 backprop_f2 = + make_float2((feature_f2.x > 0.0f) ? float(gradient_f2.x) : 0.0f, + (feature_f2.y > 0.0f) ? float(gradient_f2.y) : 0.0f); + // Convert back to half2. + half2 backprop_h2 = __float22half2_rn(backprop_f2); +#endif + p_backprop_h2[i] = backprop_h2; + } + // Write back the result. + *p_backprop_h8 = backprop_h8; + } + + int remaining_count = (count % VectorSizeElements); + + if (index < remaining_count) { + // Use first threads to process the remaining elements. 
+ Eigen::half grad_h = gradient[half8_count * VectorSizeElements + index]; + Eigen::half feature_h = feature[half8_count * VectorSizeElements + index]; + + float grad_f = static_cast(grad_h); + float feature_f = static_cast(feature_h); + float backprop_f = (feature_f > 0) ? grad_f : 0; + + Eigen::half backprop_h(backprop_f); + backprop[half8_count * VectorSizeElements + index] = backprop_h; + } +} + template struct ReluGrad { // Computes ReluGrad backprop. @@ -108,15 +169,29 @@ struct ReluGrad { // NOTE: When the activation is exactly zero, we do not propagate the // associated gradient value. This allows the output of the Relu to be used, // as well as its input. + auto gradient_ptr = reinterpret_cast(gradient.data()); + auto feature_ptr = reinterpret_cast(feature.data()); + auto backprop_ptr = reinterpret_cast(backprop.data()); + bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 && + backprop_ptr % 16 == 0; int32 count = gradient.size(); - if (count == 0) return; - int32 half2_count = Eigen::divup(count, 2); constexpr int32 kThreadInBlock = 512; - GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( - half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); - TF_CHECK_OK(GpuLaunchKernel( - ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, - d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + if (count == 0) return; + if (aligned) { + int32 half8_count = Eigen::divup(count, VectorSizeElements); + int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernelVector, kBlock, kThreadInBlock, + 0, d.stream(), gradient.data(), feature.data(), backprop.data(), + count)); + } else { + int32 half2_count = Eigen::divup(count, 2); + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, + d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + } } }; From eb07cc9d8f370d1c99c317ade64f9dd210b61c61 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Wed, 17 Jun 2020 23:20:11 +0000 Subject: [PATCH 0445/1390] Map dtypes to classes and add whitelist op set --- tensorflow/python/framework/python_op_gen.cc | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index ca0c5d9ef1a..217033f1c31 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -45,6 +45,36 @@ const int kRightMargin = 78; constexpr char kEagerFallbackSuffix[] = "_eager_fallback"; +std::unordered_map dtypes_map { + {"_dtypes.float16", "_dtypes.Float16"}, + {"_dtypes.float32", "_dtypes.Float32"}, + {"_dtypes.float64", "_dtypes.Float64"}, + {"_dtypes.bfloat16", "_dtypes.BFloat16"}, + {"_dtypes.complex64", "_dtypes.Complex64"}, + {"_dtypes.complex128", "_dtypes.Complex128"}, + {"_dtypes.int8", "_dtypes.Int8"}, + {"_dtypes.uint8", "_dtypes.UInt8"}, + {"_dtypes.uint16", "_dtypes.UInt16"}, + {"_dtypes.uint32", "_dtypes.UInt32"}, + {"_dtypes.uint64", "_dtypes.UInt64"}, + {"_dtypes.int16", "_dtypes.Int16"}, + {"_dtypes.int32", "_dtypes.Int32"}, + {"_dtypes.int64", "_dtypes.Int64"}, + {"_dtypes.bool", "_dtypes.Bool"}, + {"_dtypes.string", "_dtypes.String"}, + {"_dtypes.qint8", "_dtypes.QInt8"}, + {"_dtypes.quint8", "_dtypes.QUInt8"}, + {"_dtypes.qint16", "_dtypes.QInt16"}, + {"_dtypes.quint16", 
"_dtypes.QUInt16"}, + {"_dtypes.qint32", "_dtypes.QInt32"}, + {"_dtypes.resource", "_dtypes.Resource"}, + {"_dtypes.variant", "_dtypes.Variant"} +}; + +// Add op name to this set to add type annotations +std::unordered_set type_annotate_ops { +}; + string AttrVarName(const string& attr_name, std::unordered_map* attr_expressions) { const string var = strings::StrCat("_attr_", attr_name); From 06075a3baa5f57a8e7c84ac814d7350e4582e017 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Wed, 17 Jun 2020 23:34:23 +0000 Subject: [PATCH 0446/1390] Add function to generate TypeVars for each op --- tensorflow/python/framework/python_op_gen.cc | 53 ++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 217033f1c31..e8e670de57c 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -178,6 +178,8 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { void AddRawOpExport(const string& parameters); + void GenerateTypeVars(); + void AddAttrForArg(const string& attr, int arg_index) { gtl::InsertIfNotPresent(&inferred_attrs_, attr, op_def_.input_arg(arg_index).name()); @@ -397,6 +399,53 @@ string GenEagerPythonOp::Code() { return prelude_ + result_; } +// Generate TypeVars using attrs +void GenEagerPythonOp::GenerateTypeVars() { + bool added_typevar = false; + for (int i = 0; i allowed_types; + bool has_dtype_half = false; + for (int t : attr.allowed_values().list().type()) { + if (t == 19) { // DT_HALF = 19; + has_dtype_half = true; + break; + } + DataType dtype = static_cast(t); + const string py_dtype = python_op_gen_internal::DataTypeToPython(dtype, "_dtypes."); + if (dtypes_map.find(py_dtype) != dtypes_map.end()) { + allowed_types.emplace_back(dtypes_map[py_dtype]); + } + } + + // Do not create a type variable that includes the dtype half + if (has_dtype_half) continue; + + // If all dtypes are allowed, add them all + if (allowed_types.empty()) { + for (std::pair map_dtype : dtypes_map) { + allowed_types.emplace_back(map_dtype.second); + } + } + + std::sort(allowed_types.begin(), allowed_types.end()); + + string typevar_dtypes; + for (std::vector::iterator it = allowed_types.begin(); it != allowed_types.end(); ++it) { + if (!typevar_dtypes.empty()) strings::StrAppend(&typevar_dtypes, ", "); + strings::StrAppend(&typevar_dtypes, *it); + } + + const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); + strings::StrAppend(&result_, type_var_name, " = TypeVar(\"", type_var_name, "\", ", typevar_dtypes,")\n"); + added_typevar = true; + } + } + + if(added_typevar) strings::StrAppend(&result_, "\n"); +} + void GenEagerPythonOp::HandleGraphMode( const string& function_setup, const std::vector& output_sizes) { strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); @@ -720,6 +769,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, const string& eager_not_allowed_error) { + if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { + GenerateTypeVars(); + } if (api_def_.visibility() == ApiDef::VISIBLE) { strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); } @@ -1047,6 +1099,7 @@ from tensorflow.python.util.deprecation import deprecated_endpoints from tensorflow.python.util import dispatch as _dispatch from tensorflow.python.util.tf_export import tf_export +from 
typing import TypeVar )"); for (const auto& op_def : ops.op()) { From c87ccd156017b2cad24740b375a78f322f3951ef Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Wed, 17 Jun 2020 23:51:42 +0000 Subject: [PATCH 0447/1390] Add type annotations to op parameters --- tensorflow/python/framework/python_op_gen.cc | 69 +++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index e8e670de57c..777603c502d 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -179,6 +179,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { void AddRawOpExport(const string& parameters); void GenerateTypeVars(); + string GetTypeAnnotatedParams(); void AddAttrForArg(const string& attr, int arg_index) { gtl::InsertIfNotPresent(&inferred_attrs_, attr, @@ -343,10 +344,15 @@ string GenEagerPythonOp::Code() { } string parameters; - for (const auto& param : params_no_default_) { - if (!parameters.empty()) strings::StrAppend(¶meters, ", "); - strings::StrAppend(¶meters, param.GetRenameTo()); + if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { + strings::StrAppend(¶meters, GetTypeAnnotatedParams()); + } else { + for (const auto& param : params_no_default_) { + if (!parameters.empty()) strings::StrAppend(¶meters, ", "); + strings::StrAppend(¶meters, param.GetRenameTo()); + } } + string parameters_with_defaults = parameters; for (const auto& param_and_default : params_with_default_) { if (!parameters.empty()) strings::StrAppend(¶meters, ", "); @@ -399,6 +405,63 @@ string GenEagerPythonOp::Code() { return prelude_ + result_; } +string GenEagerPythonOp::GetTypeAnnotatedParams() { + // holds mappings from param name to its type annotation + std::unordered_map param_type_map; + for (int i = 0; i Date: Wed, 17 Jun 2020 23:59:45 +0000 Subject: [PATCH 0448/1390] Generate type annotations for op return values --- tensorflow/python/framework/python_op_gen.cc | 54 ++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 777603c502d..8c4d0f5b753 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -180,7 +180,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { void GenerateTypeVars(); string GetTypeAnnotatedParams(); - + void AddReturnTypeAnnotation(); void AddAttrForArg(const string& attr, int arg_index) { gtl::InsertIfNotPresent(&inferred_attrs_, attr, op_def_.input_arg(arg_index).name()); @@ -413,7 +413,7 @@ string GenEagerPythonOp::GetTypeAnnotatedParams() { if (attr.type() == "type") { bool has_dtype_half = false; for (int t : attr.allowed_values().list().type()) { - if (t == 19) { // DT_HALF = 19; + if (t == 19) { // DT_HALF = 19 has_dtype_half = true; break; } @@ -471,7 +471,7 @@ void GenEagerPythonOp::GenerateTypeVars() { std::vector allowed_types; bool has_dtype_half = false; for (int t : attr.allowed_values().list().type()) { - if (t == 19) { // DT_HALF = 19; + if (t == 19) { // DT_HALF = 19 has_dtype_half = true; break; } @@ -509,6 +509,51 @@ void GenEagerPythonOp::GenerateTypeVars() { if(added_typevar) strings::StrAppend(&result_, "\n"); } +void GenEagerPythonOp::AddReturnTypeAnnotation() { + string return_type = ""; + if (op_def_.output_arg_size() == 1) { + const auto& arg = op_def_.output_arg(0); + // If the 
"type" field is set, the return Tensor has a single DataType + if (arg.type() != 0) { + const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes."); + if (dtypes_map.find(py_dtype) != dtypes_map.end()) { + strings::StrAppend(&return_type, "_ops.Tensor[", dtypes_map[py_dtype], "]"); + } + } + else { + for (int i = 0; i allowed_types; + for (int t : attr.allowed_values().list().type()) { + // Do not add type annotations when return type can be half + if (t == 19) return; // DT_HALF = 19 + DataType dtype = static_cast(t); + const string py_dtype = python_op_gen_internal::DataTypeToPython(dtype, "_dtypes."); + allowed_types.emplace_back(py_dtype); + } + + std::sort(allowed_types.begin(), allowed_types.end()); + + string typevar_dtypes; + for (std::vector::iterator it = allowed_types.begin(); it != allowed_types.end(); ++it) { + if (!typevar_dtypes.empty()) strings::StrAppend(&typevar_dtypes, ", "); + strings::StrAppend(&typevar_dtypes, *it); + } + + const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); + strings::StrAppend(&return_type, "_ops.Tensor[", type_var_name, "]"); + } + } + } + + if (!return_type.empty()) { + result_.erase(result_.length() - 2); + strings::StrAppend(&result_, " -> ", return_type, ":\n"); + } + } +} + void GenEagerPythonOp::HandleGraphMode( const string& function_setup, const std::vector& output_sizes) { strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); @@ -841,6 +886,9 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( AddExport(); AddDefLine(function_name_, parameters); + if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { + AddReturnTypeAnnotation(); + } AddDocStringDescription(); AddDocStringArgs(); AddDocStringInputs(); From bae911298b8b79ffd24464adc8fc123566a79c24 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:16:59 -0700 Subject: [PATCH 0449/1390] Additional check for Intel specialization. 
PiperOrigin-RevId: 316997503 Change-Id: Ibc97cd77c5e15bb021546aac754e2953b050f0f3 --- tensorflow/lite/delegates/gpu/cl/cl_device.cc | 27 +++++++++++++++++++ tensorflow/lite/delegates/gpu/cl/cl_device.h | 1 + .../delegates/gpu/cl/kernels/conv_powervr.cc | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.cc b/tensorflow/lite/delegates/gpu/cl/cl_device.cc index 64e07428515..271fbce61ce 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.cc +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.cc @@ -501,6 +501,33 @@ bool CLDevice::IsCL20OrHigher() const { info_.cl_version != OpenCLVersion::CL_1_2; } +bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const { + if (IsIntel()) { + if (SupportsExtension("cl_intel_required_subgroup_size")) { + size_t sub_groups_count; + cl_int error = + clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0, + nullptr, &sub_groups_count); + if (error != CL_SUCCESS) { + return false; + } + std::vector sub_group_sizes(sub_groups_count); + error = clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, + sizeof(size_t) * sub_groups_count, + sub_group_sizes.data(), nullptr); + if (error != CL_SUCCESS) { + return false; + } + for (int i = 0; i < sub_groups_count; ++i) { + if (sub_group_sizes[i] == sub_group_size) { + return true; + } + } + } + } + return false; +} + bool CLDevice::IsAdreno() const { return info_.vendor == Vendor::QUALCOMM; } bool CLDevice::IsAdreno3xx() const { diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.h b/tensorflow/lite/delegates/gpu/cl/cl_device.h index ae6a1d11af6..68abcf3e202 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.h @@ -179,6 +179,7 @@ class CLDevice { bool SupportsFP32RTN() const; bool SupportsFP16RTN() const; bool IsCL20OrHigher() const; + bool SupportsSubGroupWithSize(int sub_group_size) const; bool IsAdreno() const; bool IsAdreno3xx() const; bool IsAdreno4xx() const; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index bd694e7cc4f..c20cfdbeaa3 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -874,7 +874,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( if (definition.precision != CalculationsPrecision::F32_F16 && device.SupportsExtension("cl_khr_subgroups") && device.SupportsExtension("cl_intel_required_subgroup_size") && - device.IsCL20OrHigher()) { + device.IsCL20OrHigher() && device.SupportsSubGroupWithSize(16)) { conv_params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST; } else { From 1f05cc5973c1182c35cd4b11ee8cf0f01ed707dc Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:18:45 -0700 Subject: [PATCH 0450/1390] Better FullyConnected/ConvTransposed selection for Intel. 
PiperOrigin-RevId: 316997745 Change-Id: I28befdd528917c3846ff6ae79b0f8427389dfc39 --- .../delegates/gpu/cl/kernels/convolution_transposed_3x3.cc | 2 +- .../delegates/gpu/cl/kernels/convolution_transposed_4x4.cc | 2 +- .../gpu/cl/selectors/convolution_transposed_selector.cc | 1 + .../delegates/gpu/cl/selectors/fully_connected_selector.cc | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc index 4a68eda1d95..9b028721d2d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc @@ -269,7 +269,7 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3( work_group_launch_order_(2, 0, 1) { if (device.IsPowerVR()) { weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC; - } else if (device.IsNvidia()) { + } else if (device.IsNvidia() || device.IsIntel()) { weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS; } else if (device.IsAMD()) { weights_upload_type_ = WeightsUploadType::CONSTANT_MEM; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc index 0f7f90989e8..209b675087e 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc @@ -270,7 +270,7 @@ ConvolutionTransposed4x4::ConvolutionTransposed4x4( : GPUOperation(definition) { if (device.IsPowerVR()) { weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC; - } else if (device.IsNvidia()) { + } else if (device.IsNvidia() || device.IsIntel()) { weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS; } else if (device.IsAMD()) { weights_upload_type_ = WeightsUploadType::CONSTANT_MEM; diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.cc index 12e99b57aa7..5fdfdca073e 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.cc @@ -112,6 +112,7 @@ absl::Status SelectConvolutionTransposed( case Vendor::POWERVR: case Vendor::NVIDIA: case Vendor::AMD: + case Vendor::INTEL: return SelectConvolutionTransposedPowerVR(attr, creation_context, op_def, ptr); case Vendor::MALI: diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc index 12a1d726368..eacbea8b586 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc @@ -109,6 +109,9 @@ absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr, return SelectFullyConnectedAdreno(attr, creation_context, op_def, batch_size, ptr); case Vendor::POWERVR: + case Vendor::AMD: + case Vendor::NVIDIA: + case Vendor::INTEL: return SelectFullyConnectedPowerVR(attr, creation_context, op_def, batch_size, ptr); case Vendor::MALI: From 2a6d9a1e811f2c0d459839854597f8ca6b5c8750 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Wed, 17 Jun 2020 17:18:58 -0700 Subject: [PATCH 0451/1390] Add test for op_metrics_db_utils IdleTimeRatio PiperOrigin-RevId: 316997793 Change-Id: 
I0b89e697d661b2ea1fb95e7a6d51800b407617d0 --- tensorflow/core/profiler/utils/BUILD | 11 +++++ .../utils/op_metrics_db_utils_test.cc | 46 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tensorflow/core/profiler/utils/op_metrics_db_utils_test.cc diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index ece58802661..0262c5659b7 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -77,6 +77,17 @@ cc_library( ], ) +tf_cc_test( + name = "op_metrics_db_utils_test", + srcs = ["op_metrics_db_utils_test.cc"], + deps = [ + ":op_metrics_db_utils", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + ], +) + cc_library( name = "op_utils", srcs = ["op_utils.cc"], diff --git a/tensorflow/core/profiler/utils/op_metrics_db_utils_test.cc b/tensorflow/core/profiler/utils/op_metrics_db_utils_test.cc new file mode 100644 index 00000000000..12c68426b2e --- /dev/null +++ b/tensorflow/core/profiler/utils/op_metrics_db_utils_test.cc @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" + +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" + +namespace tensorflow { +namespace profiler { +namespace { + +constexpr double kMaxError = 1E-10; + +TEST(OpMetricsDbTest, IdleTimeRatio) { + OpMetricsDb metrics_db_0; + metrics_db_0.set_total_time_ps(100000000); + metrics_db_0.set_total_op_time_ps(60000000); + EXPECT_NEAR(0.4, IdleTimeRatio(metrics_db_0), kMaxError); + + OpMetricsDb metrics_db_1; + metrics_db_1.set_total_time_ps(200000000); + metrics_db_1.set_total_op_time_ps(150000000); + EXPECT_NEAR(0.25, IdleTimeRatio(metrics_db_1), kMaxError); + + OpMetricsDb metrics_db_2; + metrics_db_1.set_total_time_ps(0); + metrics_db_1.set_total_op_time_ps(0); + EXPECT_NEAR(1.0, IdleTimeRatio(metrics_db_2), kMaxError); +} + +} // namespace +} // namespace profiler +} // namespace tensorflow From 73cf8263c771f1813416af9310469c7041e7637e Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:21:24 -0700 Subject: [PATCH 0452/1390] Pooling converted to new style. Merged 2D and 3D versions into one. 
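Both attribute types now go through a single operation class. A minimal usage sketch, assuming the caller has already filled in the OperationDef and the pooling attributes (BuildPoolingOps is a hypothetical helper, not part of this patch):

    // Minimal sketch: 2D and 3D pooling share one class; the 3D overload
    // simply fills the extra depth stride/padding/kernel-size fields.
    void BuildPoolingOps(const OperationDef& op_def,
                         const Pooling2DAttributes& attr_2d,
                         const Pooling3DAttributes& attr_3d) {
      Pooling pool_2d = CreatePooling(op_def, attr_2d);
      Pooling pool_3d = CreatePooling(op_def, attr_3d);  // same class as above
      (void)pool_2d;
      (void)pool_3d;
    }
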
PiperOrigin-RevId: 316998142 Change-Id: I92c020476f085e6160a02282c1edafabdd72ca30 --- .../lite/delegates/gpu/cl/kernels/pooling.cc | 671 +++++++----------- .../lite/delegates/gpu/cl/kernels/pooling.h | 40 +- .../lite/delegates/gpu/cl/kernels/util.cc | 9 - .../lite/delegates/gpu/cl/kernels/util.h | 5 - .../lite/delegates/gpu/cl/tensor_type.cc | 36 +- .../lite/delegates/gpu/cl/tensor_type.h | 10 + 6 files changed, 313 insertions(+), 458 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc index e292f2dad7d..922d484c57d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc @@ -25,366 +25,307 @@ namespace gpu { namespace cl { namespace { -std::string GetAveragePoolingKernelCode( - const OperationDef& op_def, bool stride_correction, const CLDevice& device, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GetAveragePoolingKernelCode(const OperationDef& op_def, + bool stride_correction, + const CLDevice& device, + Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetTextureAddressMode(GetFastestZeroMode(device)); + if (op_def.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); + if (op_def.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) { + args->AddInt("kernel_size_x"); + args->AddInt("padding_x"); + args->AddInt("stride_x"); + } + if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) { + args->AddInt("kernel_size_y"); + args->AddInt("padding_y"); + args->AddInt("stride_y"); + } + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + args->AddInt("kernel_size_z"); + args->AddInt("padding_z"); + args->AddInt("stride_z"); + } - const auto address_mode = GetFastestZeroMode(device); + std::map axis_to_src_coord = { + {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"}, + {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, + }; - std::string c = GetCommonDefines(op_def.precision); + std::map axis_to_dst_coord = { + {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"}, + {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, + }; + + std::vector src_coords; + std::vector dst_coords; + for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS}) { + if (op_def.dst_tensors[0].HasAxis(axis)) { + dst_coords.push_back(axis_to_dst_coord[axis]); + } + if (op_def.src_tensors[0].HasAxis(axis)) { + src_coords.push_back(axis_to_src_coord[axis]); + } + } + std::string src_coord = src_coords[0]; + for (int i = 1; i < src_coords.size(); ++i) { + src_coord += ", " + src_coords[i]; + } + std::string dst_coord = dst_coords[0]; + for (int i = 1; i < dst_coords.size(); ++i) { + dst_coord += ", " + dst_coords[i]; + } const bool manual_clamp = op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER || op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER; + std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - 
c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int2 kernel_size, \n"; - c += " int2 padding, \n"; - c += " int2 stride \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int linear_id_1 = get_global_id(1);\n"; + c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; + c += " int D = linear_id_1 % args.dst_tensor.Depth();\n"; + } else { + c += " int Y = get_global_id(1);\n"; + } c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; c += " float4 r = (float4)(0.0f);\n"; c += " float window_size = 0.0;\n"; if (stride_correction) { c += " int xs = " + - GetXStrideCorrected("X", "src_size.w", "stride.x", "padding.x") + + GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int xs = X * stride.x + padding.x;\n"; + c += " int xs = X * args.stride_x + args.padding_x;\n"; } - c += " int ys = Y * stride.y + padding.y;\n"; - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; + c += " int ys = Y * args.stride_y + args.padding_y;\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int ds = D * args.stride_z + args.padding_z;\n"; + c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " int d_c = ds + kz;\n"; + c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n"; + } + c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; c += " int y_c = ys + ky;\n"; - c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; + c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n"; + c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; if (op_def.IsBatchSupported()) { - c += " int x_c = xs + kx * src_size.w;\n"; + c += " int x_c = xs + kx * args.src_tensor.Batch();\n"; } else { c += " int x_c = xs + kx;\n"; } - c += " bool outside = outside_y || x_c < 0 || x_c >= src_size.x;\n"; + c += " bool outside = outside_y || x_c < 0 || x_c >= " + "args.src_tensor.Width();\n"; if (manual_clamp) { - c += " r += !outside ? " + - src_tensor.ReadAsFloatWHS("x_c", "y_c", "Z") + " : (float4)(0.0f);\n"; + c += " r += !outside ? args.src_tensor.Read(" + src_coord + + ") : " + "(float4)(0.0f);\n"; } else { - c += " r += " + - src_tensor.ReadAsFloatWHS("x_c", "y_c", "Z", address_mode) + ";\n"; + c += " r += args.src_tensor.Read(" + src_coord + ");\n"; } c += " window_size += !outside ? 1.0 : 0.0;\n"; c += " }\n"; c += " }\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " } // Depth\n"; + } // If window_size==0, window covered nothing. This situation is a sign of // incorrectly constructed operation. NaNs are expected as output. 
c += " FLT4 result = TO_FLT4(r / window_size);\n"; - const LinkingContext context{"result", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X", "Y", "Z"); + c += " args.dst_tensor.Write(result, " + dst_coord + ");\n"; c += "}\n"; return c; } -std::string GetAveragePooling3DKernelCode( - const OperationDef& op_def, bool stride_correction, const CLDevice& device, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); - - const auto address_mode = GetFastestZeroMode(device); - - std::string c = GetCommonDefines(op_def.precision); - - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; +std::string GetMaxPoolingKernelCode(const OperationDef& op_def, + bool stride_correction, bool output_indices, + Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); if (op_def.IsBatchSupported()) { - c += " int batch_size, \n"; + src_desc->SetStateVar("BatchedWidth", "true"); } - c += " int4 kernel_size, \n"; - c += " int4 padding, \n"; - c += " int4 stride \n"; - c += ") {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int linear_id_z = get_global_id(2);\n"; - c += " int S = linear_id_z % dst_size.w;\n"; - c += " int Z = linear_id_z / dst_size.w;\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; - c += " float4 r = (float4)(0.0f);\n"; - c += " float window_size = 0.0;\n"; - if (stride_correction) { - c += " int xs = " + - GetXStrideCorrected("X", "batch_size", "stride.x", "padding.x") + - ";\n"; - } else { - c += " int xs = X * stride.x + padding.x;\n"; - } - c += " int ys = Y * stride.y + padding.y;\n"; - c += " int zs = Z * stride.z + padding.z;\n"; - c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n"; - c += " int z_c = zs + kz;\n"; - c += " if (z_c < 0 || z_c >= src_size.z) continue;\n"; - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = ys + ky;\n"; - c += " if (y_c < 0 || y_c >= src_size.y) continue;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); if (op_def.IsBatchSupported()) { - c += " int x_c = xs + kx * batch_size;\n"; - } else { - c += " int x_c = xs + kx;\n"; + dst_desc->SetStateVar("BatchedWidth", "true"); } - c += " if(x_c < 0 || x_c >= src_size.x) continue;\n"; - c += " r += " + - src_tensor.ReadAsFloatWHDS("x_c", "y_c", "z_c", "S", address_mode) + - ";\n"; - c += " window_size += 1.0;\n"; - c += " }\n"; - c += " }\n"; - c += " }\n"; - // If window_size==0, window covered nothing. This situation is a sign of - // incorrectly constructed operation. NaNs are expected as output. 
- c += " FLT4 result = TO_FLT4(r / window_size);\n"; - const LinkingContext context{"result", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHDS("result", "X", "Y", "Z", "S"); - c += "}\n"; - - return c; -} - -std::string GetMaxPoolingKernelCode( - const OperationDef& op_def, bool stride_correction, - const std::vector& linked_operations, - bool output_indices) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); - const auto dst_ind_def = - output_indices ? op_def.dst_tensors[1] : op_def.dst_tensors[0]; - TensorCodeGenerator indices_tensor( - "dst_indices", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - dst_ind_def); - - std::string c = GetCommonDefines(op_def.precision); - - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); if (output_indices) { - c += indices_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; + auto dst_ind_desc = + absl::make_unique(op_def.dst_tensors[1]); + if (op_def.IsBatchSupported()) { + dst_ind_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_indices", AccessType::WRITE, + std::move(dst_ind_desc)); } - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int2 kernel_size, \n"; - c += " int2 padding, \n"; - c += " int2 stride \n"; - c += ") {\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) { + args->AddInt("kernel_size_x"); + args->AddInt("padding_x"); + args->AddInt("stride_x"); + } + if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) { + args->AddInt("kernel_size_y"); + args->AddInt("padding_y"); + args->AddInt("stride_y"); + } + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + args->AddInt("kernel_size_z"); + args->AddInt("padding_z"); + args->AddInt("stride_z"); + } + + std::map axis_to_src_coord = { + {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"}, + {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, + }; + + std::map axis_to_dst_coord = { + {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"}, + {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, + }; + + std::vector src_coords; + std::vector dst_coords; + for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS}) { + if (op_def.dst_tensors[0].HasAxis(axis)) { + dst_coords.push_back(axis_to_dst_coord[axis]); + } + if (op_def.src_tensors[0].HasAxis(axis)) { + src_coords.push_back(axis_to_src_coord[axis]); + } + } + std::string src_coord = src_coords[0]; + for (int i = 1; i < src_coords.size(); ++i) { + src_coord += ", " + src_coords[i]; + } + std::string dst_coord = dst_coords[0]; + for (int i = 1; i < dst_coords.size(); ++i) { + dst_coord += ", " + dst_coords[i]; + } + + std::string c = GetCommonDefines(op_def.precision); + c += "__kernel void main_function(\n"; + c += "$0) {\n"; c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int linear_id_1 = get_global_id(1);\n"; + c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; + c += " int D = linear_id_1 % args.dst_tensor.Depth();\n"; + } else { + c += " int Y = get_global_id(1);\n"; + } c += " int Z = get_global_id(2);\n"; - c += - " 
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; \n"; - c += " FLT4 maximum = (FLT4)(-10000.0f);\n"; - if (output_indices) { - c += " FLT4 indexes = (FLT4)(0.0f);\n"; - c += " FLT index_counter = (FLT)(0.1f);\n"; - } - if (stride_correction) { - c += " int xs = " + - GetXStrideCorrected("X", "src_size.w", "stride.x", "padding.x") + - ";\n"; - } else { - c += " int xs = X * stride.x + padding.x;\n"; - } - c += " int ys = Y * stride.y + padding.y;\n"; - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = ys + ky;\n"; - c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; - if (op_def.IsBatchSupported()) { - c += " int x_c = xs + kx * src_size.w;\n"; - } else { - c += " int x_c = xs + kx;\n"; - } - c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n"; - c += " if (!outside_x && !outside_y) {\n"; - c += " FLT4 src = " + src_tensor.ReadWHS("x_c", "y_c", "Z") + ";\n"; - if (output_indices) { - c += " if (src.x > maximum.x) {\n"; - c += " indexes.x = index_counter;\n"; - c += " maximum.x = src.x;\n"; - c += " }\n"; - c += " if (src.y > maximum.y) {\n"; - c += " indexes.y = index_counter;\n"; - c += " maximum.y = src.y;\n"; - c += " }\n"; - c += " if (src.z > maximum.z) {\n"; - c += " indexes.z = index_counter;\n"; - c += " maximum.z = src.z;\n"; - c += " }\n"; - c += " if (src.w > maximum.w) {\n"; - c += " indexes.w = index_counter;\n"; - c += " maximum.w = src.w;\n"; - c += " }\n"; - c += " index_counter += (FLT)(1.0f);\n"; - } else { - c += " maximum = max(src, maximum);\n"; - } - c += " }\n"; - c += " }\n"; - c += " }\n"; - const LinkingContext context{"maximum", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("maximum", "X", "Y", "Z"); - if (output_indices) { - c += " " + indices_tensor.WriteWHS("indexes", "X", "Y", "Z"); - } - c += "}\n"; - - return c; -} - -std::string GetMaxPooling3DKernelCode( - const OperationDef& op_def, bool stride_correction, - const std::vector& linked_operations, - bool output_indices) { - TensorCodeGenerator src_tensor( - "src_data", - WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); - const auto dst_ind_def = - output_indices ? 
op_def.dst_tensors[1] : op_def.dst_tensors[0]; - TensorCodeGenerator indices_tensor( - "dst_indices", - WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - dst_ind_def); - - std::string c = GetCommonDefines(op_def.precision); - - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - if (output_indices) { - c += indices_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - } - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - if (op_def.IsBatchSupported()) { - c += " int batch_size, \n"; - } - c += " int4 kernel_size, \n"; - c += " int4 padding, \n"; - c += " int4 stride \n"; - c += ") {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int linear_id_z = get_global_id(2);\n"; - c += " int S = linear_id_z % dst_size.w;\n"; - c += " int Z = linear_id_z / dst_size.w;\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "Z >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; c += " FLT4 maximum = (FLT4)(-10000.0f);\n"; if (output_indices) { c += " FLT4 indexes = (FLT4)(0.0f);\n"; } if (stride_correction) { c += " int xs = " + - GetXStrideCorrected("X", "batch_size", "stride.x", "padding.x") + + GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int xs = X * stride.x + padding.x;\n"; + c += " int xs = X * args.stride_x + args.padding_x;\n"; } - c += " int ys = Y * stride.y + padding.y;\n"; - c += " int zs = Z * stride.z + padding.z;\n"; - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; + c += " int ys = Y * args.stride_y + args.padding_y;\n"; + c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; c += " int y_c = ys + ky;\n"; - c += " if (y_c < 0 || y_c >= src_size.y) continue;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; + c += " if (y_c < 0 || y_c >= args.src_tensor.Height()) continue;\n"; + c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; if (op_def.IsBatchSupported()) { - c += " int x_c = xs + kx * batch_size;\n"; + c += " int x_c = xs + kx * args.src_tensor.Batch();\n"; } else { c += " int x_c = xs + kx;\n"; } - c += " if (x_c < 0 || x_c >= src_size.x) continue;\n"; - c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n"; - c += " int z_c = zs + kz;\n"; - c += " if (z_c < 0 || z_c >= src_size.z) continue;\n"; - c += " FLT4 src = " + src_tensor.ReadWHDS("x_c", "y_c", "z_c", "S") + - ";\n"; - if (output_indices) { - c += " FLT index_counter = (FLT)((ky * kernel_size.x + kx) * " - "kernel_size.z + kz) + (FLT)(0.1f);\n"; - c += " if (src.x > maximum.x) {\n"; - c += " indexes.x = index_counter;\n"; - c += " maximum.x = src.x;\n"; - c += " }\n"; - c += " if (src.y > maximum.y) {\n"; - c += " indexes.y = index_counter;\n"; - c += " maximum.y = src.y;\n"; - c += " }\n"; - c += " if (src.z > maximum.z) {\n"; - c += " indexes.z = index_counter;\n"; - c += " maximum.z = src.z;\n"; - c += " }\n"; - c += " if (src.w > maximum.w) {\n"; - c += " indexes.w = index_counter;\n"; - c += " maximum.w = src.w;\n"; - c += " }\n"; - } else { - c += " maximum = max(src, maximum);\n"; + c += " if (x_c < 0 || x_c >= args.src_tensor.Width()) continue;\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int ds = D * args.stride_z + args.padding_z;\n"; + c += " 
for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " int d_c = ds + kz;\n"; + c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n"; + } + c += " FLT4 src = args.src_tensor.Read(" + src_coord + ");\n"; + if (output_indices) { + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " FLT index_counter = (FLT)((ky * args.kernel_size_x + kx) * " + "args.kernel_size_z + kz) + (FLT)(0.1f);\n"; + } else { + c += " FLT index_counter = (FLT)(ky * args.kernel_size_x + kx) + " + "(FLT)(0.1f);\n"; + } + c += " if (src.x > maximum.x) {\n"; + c += " indexes.x = index_counter;\n"; + c += " maximum.x = src.x;\n"; + c += " }\n"; + c += " if (src.y > maximum.y) {\n"; + c += " indexes.y = index_counter;\n"; + c += " maximum.y = src.y;\n"; + c += " }\n"; + c += " if (src.z > maximum.z) {\n"; + c += " indexes.z = index_counter;\n"; + c += " maximum.z = src.z;\n"; + c += " }\n"; + c += " if (src.w > maximum.w) {\n"; + c += " indexes.w = index_counter;\n"; + c += " maximum.w = src.w;\n"; + c += " }\n"; + } else { + c += " maximum = max(src, maximum);\n"; + } + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " } // Depth\n"; } - c += " };\n"; c += " }\n"; c += " }\n"; - const LinkingContext context{"maximum", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHDS("maximum", "X", "Y", "Z", "S"); + c += " args.dst_tensor.Write(maximum, " + dst_coord + ");\n"; if (output_indices) { - c += " " + indices_tensor.WriteWHDS("indexes", "X", "Y", "Z", "S"); + c += " args.dst_indices.Write(indexes, " + dst_coord + ");\n"; } c += "}\n"; + return c; } - } // namespace Pooling::Pooling(const OperationDef& definition, const Pooling2DAttributes& attr) : GPUOperation(definition), - stride_(attr.strides.w, attr.strides.h), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), - kernel_size_(attr.kernel.w, attr.kernel.h), + stride_(attr.strides.w, attr.strides.h, 0, 0), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), + kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0), + type_(attr.type), + output_indices_(attr.output_indices) {} + +Pooling::Pooling(const OperationDef& definition, + const Pooling3DAttributes& attr) + : GPUOperation(definition), + stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, + -attr.padding.prepended.d, 0), + kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0), type_(attr.type), output_indices_(attr.output_indices) {} @@ -419,44 +360,56 @@ absl::Status Pooling::Compile(const CreationContext& creation_context) { switch (type_) { case PoolingType::AVERAGE: code = GetAveragePoolingKernelCode(definition_, stride_correction, - *creation_context.device, - linked_operations_); + *creation_context.device, &args_); break; case PoolingType::MAX: code = GetMaxPoolingKernelCode(definition_, stride_correction, - linked_operations_, output_indices_); + output_indices_, &args_); break; default: return absl::InvalidArgumentError( "You should create another kernel with this params"); break; } + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Pooling::BindArguments() { - 
kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - if (output_indices_) { - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[1]->GetMemoryPtrForWriting())); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + if (definition_.dst_tensors[0].HasAxis(Axis::WIDTH)) { + RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); + RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); } - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y))); - RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_)); - - return absl::OkStatus(); + if (definition_.dst_tensors[0].HasAxis(Axis::HEIGHT)) { + RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); + RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); + RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); + } + if (definition_.dst_tensors[0].HasAxis(Axis::DEPTH)) { + RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z)); + RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z)); + RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z)); + } + if (output_indices_) { + RETURN_IF_ERROR(args_.SetObjectRef("dst_indices", dst_[1])); + } + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 Pooling::GetGridSize() const { const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); + const int grid_y = dst_[0]->Height() * dst_[0]->Depth(); const int grid_z = dst_[0]->Slices(); return int3(grid_x, grid_y, grid_z); } @@ -476,107 +429,9 @@ Pooling CreatePooling(const OperationDef& definition, return Pooling(definition, attr); } -Pooling3D::Pooling3D(const OperationDef& definition, - const Pooling3DAttributes& attr) - : GPUOperation(definition), - stride_(attr.strides.w, attr.strides.h, attr.strides.d), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, - -attr.padding.prepended.d), - kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d), - type_(attr.type), - output_indices_(attr.output_indices) {} - -Pooling3D::Pooling3D(Pooling3D&& kernel) - : GPUOperation(std::move(kernel)), - stride_(kernel.stride_), - padding_(kernel.padding_), - kernel_size_(kernel.kernel_size_), - type_(kernel.type_), - output_indices_(kernel.output_indices_), - kernel_(std::move(kernel.kernel_)), - work_group_size_(kernel.work_group_size_) {} - -Pooling3D& Pooling3D::operator=(Pooling3D&& kernel) { - if (this != &kernel) { - std::swap(stride_, kernel.stride_); - std::swap(padding_, kernel.padding_); - std::swap(kernel_size_, kernel.kernel_size_); - std::swap(type_, kernel.type_); - std::swap(output_indices_, kernel.output_indices_); - kernel_ = std::move(kernel.kernel_); - std::swap(work_group_size_, kernel.work_group_size_); - GPUOperation::operator=(std::move(kernel)); - } - return *this; -} - -absl::Status Pooling3D::Compile(const CreationContext& creation_context) { - std::string code; - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - switch (type_) { - case PoolingType::AVERAGE: - 
code = GetAveragePooling3DKernelCode(definition_, stride_correction, - *creation_context.device, - linked_operations_); - break; - case PoolingType::MAX: - code = GetMaxPooling3DKernelCode(definition_, stride_correction, - linked_operations_, output_indices_); - break; - default: - return absl::InvalidArgumentError( - "You should create another kernel with this params"); - break; - } - return creation_context.cache->GetOrCreateCLKernel( - code, "main_function", *creation_context.context, - *creation_context.device, &kernel_); -} - -absl::Status Pooling3D::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - if (output_indices_) { - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[1]->GetMemoryPtrForWriting())); - } - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS())); - if (definition_.IsBatchSupported()) { - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch())); - } - RETURN_IF_ERROR(kernel_.SetBytesAuto( - int4(kernel_size_.x, kernel_size_.y, kernel_size_.z, 1))); - RETURN_IF_ERROR(kernel_.SetBytesAuto( - int4(padding_.x * src_[0]->Batch(), padding_.y, padding_.z, 1))); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1))); - - return absl::OkStatus(); -} - -int3 Pooling3D::GetGridSize() const { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Slices() * dst_[0]->Depth(); - return int3(grid_x, grid_y, grid_z); -} - -absl::Status Pooling3D::Tune(const TuningParameters& params) { - RETURN_IF_ERROR(BindArguments()); - return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_); -} - -absl::Status Pooling3D::AddToQueue(CLCommandQueue* queue) { - RETURN_IF_ERROR(BindArguments()); - return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_); -} - -Pooling3D CreatePooling3D(const OperationDef& definition, - const Pooling3DAttributes& attr) { - return Pooling3D(definition, attr); +Pooling CreatePooling(const OperationDef& definition, + const Pooling3DAttributes& attr) { + return Pooling(definition, attr); } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h index 09d2d5260f7..20719c90ae3 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h @@ -30,6 +30,7 @@ namespace cl { class Pooling : public GPUOperation { public: Pooling(const OperationDef& definition, const Pooling2DAttributes& attr); + Pooling(const OperationDef& definition, const Pooling3DAttributes& attr); absl::Status AddToQueue(CLCommandQueue* queue) override; absl::Status Tune(const TuningParameters& params) override; @@ -45,9 +46,9 @@ class Pooling : public GPUOperation { absl::Status BindArguments(); int3 GetGridSize() const; - int2 stride_; - int2 padding_; - int2 kernel_size_; + int4 stride_; + int4 padding_; + int4 kernel_size_; PoolingType type_; bool output_indices_; @@ -59,37 +60,8 @@ class Pooling : public GPUOperation { Pooling CreatePooling(const OperationDef& definition, const Pooling2DAttributes& attr); -class Pooling3D : public GPUOperation { - public: - Pooling3D(const OperationDef& definition, const Pooling3DAttributes& attr); - absl::Status 
AddToQueue(CLCommandQueue* queue) override; - absl::Status Tune(const TuningParameters& params) override; - - absl::Status Compile(const CreationContext& creation_context) override; - - // Move only - Pooling3D(Pooling3D&& kernel); - Pooling3D& operator=(Pooling3D&& kernel); - Pooling3D(const Pooling3D&) = delete; - Pooling3D& operator=(const Pooling3D&) = delete; - - private: - absl::Status BindArguments(); - int3 GetGridSize() const; - - int3 stride_; - int3 padding_; - int3 kernel_size_; - - PoolingType type_; - bool output_indices_; - - CLKernel kernel_; - int3 work_group_size_ = int3(8, 4, 1); -}; - -Pooling3D CreatePooling3D(const OperationDef& definition, - const Pooling3DAttributes& attr); +Pooling CreatePooling(const OperationDef& definition, + const Pooling3DAttributes& attr); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc index 3161a73a18f..8cfff18b4ee 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc @@ -60,15 +60,6 @@ std::string GetImageModifier(AccessType access) { } } -std::string TextureAddressModeToString(TextureAddressMode address_mode) { - switch (address_mode) { - case TextureAddressMode::DONT_CARE: - return "smp_none"; - case TextureAddressMode::ZERO: - return "smp_zero"; - } -} - } // namespace std::string GetCommonDefines(CalculationsPrecision precision) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h index 836a95f7407..3a51d064b40 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h @@ -36,11 +36,6 @@ namespace cl { std::string GetCommonDefines(CalculationsPrecision precision); -enum class TextureAddressMode { - DONT_CARE, // translated to CLK_ADDRESS_NONE - ZERO, // translated to CLK_ADDRESS_CLAMP -}; - struct WHSPoint { std::string w_name; std::string h_name; diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index 3b6c686a99a..8e048675697 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -45,6 +45,15 @@ std::string GetWriteImageFromDataType(DataType data_type) { } // namespace +std::string TextureAddressModeToString(TextureAddressMode address_mode) { + switch (address_mode) { + case TextureAddressMode::DONT_CARE: + return "smp_none"; + case TextureAddressMode::ZERO: + return "smp_zero"; + } +} + std::string ToString(TensorStorageType type) { switch (type) { case TensorStorageType::UNKNOWN: @@ -271,8 +280,10 @@ std::string TensorDescriptor::Read(DataType read_as_type, case TensorStorageType::TEXTURE_3D: case TensorStorageType::SINGLE_TEXTURE_2D: case TensorStorageType::TEXTURE_ARRAY: - return absl::StrCat(read_as, "(", image_type, ", smp_none, ", - global_address, ")"); + return absl::StrCat( + read_as, "(", image_type, + ", " + TextureAddressModeToString(ModeFromState()) + ", ", + global_address, ")"); case TensorStorageType::IMAGE_BUFFER: return absl::StrCat(read_as, "(image_buffer, ", global_address, ")"); case TensorStorageType::UNKNOWN: @@ -500,6 +511,14 @@ bool TensorDescriptor::HasAxis(Axis axis) const { return false; } +void TensorDescriptor::SetTextureAddressMode(TextureAddressMode mode) { + if (mode == TextureAddressMode::ZERO) { + state_vars_["TextureMode"] = "ZERO"; + } else { + state_vars_["TextureMode"] = "DONT_CARE"; + } +} + bool 
TensorDescriptor::ParseCoordsFromArgs(const std::vector& args, int offset, std::string* xc, std::string* yc, std::string* zc, @@ -549,6 +568,19 @@ bool TensorDescriptor::IsBatchedWidth() const { return it != state_vars_.end() && it->second == "true"; } +TextureAddressMode TensorDescriptor::ModeFromState() const { + auto it = state_vars_.find("TextureMode"); + if (it != state_vars_.end()) { + if (it->second == "ZERO") { + return TextureAddressMode::ZERO; + } else { + return TextureAddressMode::DONT_CARE; + } + } else { + return TextureAddressMode::DONT_CARE; + } +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h index 58ebfc51ec4..2d4ae0c7335 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h @@ -27,6 +27,13 @@ namespace tflite { namespace gpu { namespace cl { +enum class TextureAddressMode { + DONT_CARE, // translated to CLK_ADDRESS_NONE + ZERO, // translated to CLK_ADDRESS_CLAMP +}; + +std::string TextureAddressModeToString(TextureAddressMode address_mode); + enum class TensorStorageType { UNKNOWN, BUFFER, @@ -71,6 +78,7 @@ struct TensorDescriptor : public GPUObjectDescriptor { GPUResources GetGPUResources(AccessType access_type) const override; bool HasAxis(Axis axis) const; + void SetTextureAddressMode(TextureAddressMode mode); absl::Status GetLinkingContextFromWriteSelector( const std::vector& args, std::string* value_name, @@ -106,6 +114,8 @@ struct TensorDescriptor : public GPUObjectDescriptor { bool IsBatchedWidth() const; + TextureAddressMode ModeFromState() const; + absl::Status GetDataTypeFromTemplateArgs(const std::string& template_arg, DataType* result) const; From b1933d67e5507bc1433c6de1700bcb0b8f3a6aec Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:22:03 -0700 Subject: [PATCH 0453/1390] Added support of different layouts for src/dst. For example src - HWC, dst - BHWC, or vice versa. PiperOrigin-RevId: 316998239 Change-Id: I89b07923020f185c356bb0b63926bbe81be55cb5 --- .../lite/delegates/gpu/cl/kernels/strided_slice.cc | 10 +++++----- tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc index 2cf65f24447..d0c4e432f3a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc @@ -42,11 +42,12 @@ std::string GetStridedSliceCode(const OperationDef& op_def, bool alignedx4, args->AddInt("stride_z"); args->AddInt("stride_b"); - const std::string dst_batch = op_def.IsBatchSupported() ? "B" : ""; + const std::string batch_id = + op_def.dst_tensors[0].HasAxis(Axis::BATCH) ? 
"B" : "0"; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; c += "$0) {\n"; - if (op_def.IsBatchSupported()) { + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { c += " int linear_id = get_global_id(0);\n"; c += " int X = linear_id / args.dst_tensor.Batch();\n"; c += " int B = linear_id % args.dst_tensor.Batch();\n"; @@ -62,11 +63,10 @@ std::string GetStridedSliceCode(const OperationDef& op_def, bool alignedx4, c += " } \n"; c += " int s_x = X * args.stride_x + args.offset_x;\n"; c += " int s_y = Y * args.stride_y + args.offset_y;\n"; - if (op_def.IsBatchSupported()) { - c += " int s_b = B * args.stride_b + args.offset_b;\n"; + if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { + c += " int s_b = " + batch_id + " * args.stride_b + args.offset_b;\n"; c += " args.src_tensor.SetBatchRef(s_b);\n"; } - const std::string src_batch = op_def.IsBatchSupported() ? "s_b" : ""; if (alignedx4) { c += " int s_z = Z + args.offset_z;\n"; c += " FLT4 result = args.src_tensor.Read(s_x, s_y, s_z);\n"; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index e12c44566b7..cacfd52542d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -36,11 +36,12 @@ std::string GetTransposeCode( "dst_tensor", AccessType::WRITE, absl::make_unique(op_def.dst_tensors[0])); - const std::string batch_id = op_def.IsBatchSupported() ? "B" : ""; + const std::string batch_id = + op_def.dst_tensors[0].HasAxis(Axis::BATCH) ? "B" : "0"; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; c += "$0) {\n"; - if (op_def.IsBatchSupported()) { + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { c += " int linear_id = get_global_id(0);\n"; c += " int X = linear_id / args.dst_tensor.Batch();\n"; c += " int B = linear_id % args.dst_tensor.Batch();\n"; @@ -65,7 +66,7 @@ std::string GetTransposeCode( remap[attr.perm.w] = 2; remap[attr.perm.c] = 3; if (attr.perm.c == 3) { // optimized reading when no channels permutation - const std::string bhw[] = {"B", "Y", "X"}; + const std::string bhw[] = {batch_id, "Y", "X"}; if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { c += " args.src_tensor.SetBatchRef(" + bhw[remap[0]] + ");\n"; } @@ -80,7 +81,7 @@ std::string GetTransposeCode( c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_channel = Z * 4 + i;\n"; c += " if (dst_channel < args.dst_tensor.Channels()) {\n"; - const std::string bhwc[] = {"B", "Y", "X", "dst_channel"}; + const std::string bhwc[] = {batch_id, "Y", "X", "dst_channel"}; if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { c += " args.src_tensor.SetBatchRef(" + bhwc[remap[0]] + ");\n"; } From 274a0f944e50922759fe8c263f74beeb24884137 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:29:06 -0700 Subject: [PATCH 0454/1390] ConcatXY converted to new style. Added support of concatenation in Batch and Depth axis. 
PiperOrigin-RevId: 316999295 Change-Id: I94f2168f2861790b3a30c79b2b3476aa44c55748 --- .../delegates/gpu/cl/kernels/concat_xy.cc | 160 +++++++++++------- .../gpu/cl/selectors/simple_selectors.cc | 6 +- 2 files changed, 99 insertions(+), 67 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc index ef7915afba5..5476cc22965 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h" +#include #include +#include #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" @@ -27,51 +29,93 @@ namespace gpu { namespace cl { namespace { -std::string GetConcatKernelCode( - const OperationDef& op_def, int tensors_count, - const std::vector& linked_operations) { - std::vector srcs(tensors_count); - for (int i = 0; i < tensors_count; ++i) { - const std::string tensor_name = "src_data_" + std::to_string(i); - const std::string width = "src_size_" + std::to_string(i) + ".x"; - const std::string height = "src_size_" + std::to_string(i) + ".y"; - srcs[i] = - TensorCodeGenerator(tensor_name, WHSPoint{width, height, "dst_size.z"}, - op_def.src_tensors[i]); +std::string GetConcatKernelCode(const OperationDef& op_def, + const ConcatAttributes& attr, Arguments* args) { + std::vector tensor_names(op_def.src_tensors.size()); + for (int i = 0; i < op_def.src_tensors.size(); ++i) { + tensor_names[i] = "src_tensor_" + std::to_string(i); + args->AddObjectRef( + tensor_names[i], AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); + } + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + + std::map axis_to_selector = { + {Axis::WIDTH, "Width"}, {Axis::HEIGHT, "Height"}, + {Axis::DEPTH, "Depth"}, {Axis::CHANNELS, "Channels"}, + {Axis::BATCH, "Batch"}, + }; + std::map axis_to_coord = { + {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"}, + {Axis::CHANNELS, "S"}, {Axis::BATCH, "B"}, + }; + + std::vector src_coords; + std::vector dst_coords; + for (auto axis : + {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS, Axis::BATCH}) { + if (op_def.src_tensors[0].HasAxis(axis) && axis != Axis::BATCH) { + if (axis == attr.axis) { + src_coords.push_back("coord"); + } else { + src_coords.push_back(axis_to_coord[axis]); + } + } + if (op_def.dst_tensors[0].HasAxis(axis)) { + dst_coords.push_back(axis_to_coord[axis]); + } + } + std::string src_coord = src_coords[0]; + for (int i = 1; i < src_coords.size(); ++i) { + src_coord += ", " + src_coords[i]; + } + std::string dst_coord = dst_coords[0]; + for (int i = 1; i < dst_coords.size(); ++i) { + dst_coord += ", " + dst_coords[i]; } - TensorCodeGenerator dst("dst_data", - WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - for (const auto& src : srcs) { - c += src.GetDeclaration(AccessType::READ) + ",\n"; + c += "$0) {\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { + c += " int linear_id_0 = get_global_id(0);\n"; + c += " int X = linear_id_0 / args.dst_tensor.Batch();\n"; + c += " int B = linear_id_0 % args.dst_tensor.Batch();\n"; + } else { + c += " int X = get_global_id(0);\n"; } - c += dst.GetDeclaration(AccessType::WRITE); - c += 
GetArgsDeclaration(linked_operations); - for (int i = 0; i < tensors_count; ++i) { - const std::string uniform_name = "src_size_" + std::to_string(i); - c += " int4 " + uniform_name + ",\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int linear_id_1 = get_global_id(1);\n"; + c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; + c += " int D = linear_id_1 % args.dst_tensor.Depth();\n"; + } else { + c += " int Y = get_global_id(1);\n"; } - c += " int4 dst_size \n"; - c += ") {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (Z >= dst_size.z) return;\n"; - for (int i = 0; i < tensors_count; ++i) { - const std::string size_name = "src_size_" + std::to_string(i); - c += " if (X < " + size_name + ".x && Y < " + size_name + ".y) { \n"; - c += " FLT4 result = " + srcs[i].ReadWHS("X", "Y", "Z") + ";\n"; - c += " int dst_x = X + " + size_name + ".z;\n"; - c += " int dst_y = Y + " + size_name + ".w;\n"; - const LinkingContext context{"result", "dst_x", "dst_y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst.WriteWHS("result", "dst_x", "dst_y", "Z"); + c += " int S = get_global_id(2);\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "S >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; + c += " FLT4 result = (FLT4)(0.0f);\n"; + c += " int coord = " + axis_to_coord[attr.axis] + ";\n"; + for (int i = 0; i < op_def.src_tensors.size(); ++i) { + const std::string field = + "args." + tensor_names[i] + "." + axis_to_selector[attr.axis] + "()"; + c += " if (coord >= 0 && coord < " + field + ") { \n"; + if (op_def.src_tensors[i].HasAxis(Axis::BATCH)) { + if (attr.axis == Axis::BATCH) { + c += " args." + tensor_names[i] + ".SetBatchRef(coord);\n"; + } else { + c += " args." + tensor_names[i] + ".SetBatchRef(B);\n"; + } + } + c += " result = args." + tensor_names[i] + ".Read(" + src_coord + ");\n"; c += " } \n"; + c += " coord -= " + field + ";\n"; } + c += " args.dst_tensor.Write(result, " + dst_coord + ");\n"; c += "}\n"; return c; } @@ -97,46 +141,32 @@ ConcatXY& ConcatXY::operator=(ConcatXY&& operation) { } absl::Status ConcatXY::Compile(const CreationContext& creation_context) { - const auto code = - GetConcatKernelCode(definition_, tensors_count_, linked_operations_); + std::string code = GetConcatKernelCode(definition_, attr_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status ConcatXY::BindArguments() { - kernel_.ResetBindingCounter(); - for (int i = 0; i < tensors_count_; ++i) { - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr())); - } - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - int x_offset = 0; - int y_offset = 0; - for (int i = 0; i < tensors_count_; ++i) { - const int width = src_[i]->Width() * src_[i]->Batch(); - const int height = src_[i]->Height(); + for (int i = 0; i < definition_.src_tensors.size(); ++i) { RETURN_IF_ERROR( - kernel_.SetBytesAuto(int4(width, height, x_offset, y_offset))); - x_offset += attr_.axis == Axis::WIDTH ? 
width : 0; - y_offset += attr_.axis == Axis::HEIGHT ? height : 0; + args_.SetObjectRef("src_tensor_" + std::to_string(i), src_[i])); } - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 ConcatXY::GetGridSize() const { - int max_src_width = 0; - int max_src_height = 0; - for (int i = 0; i < tensors_count_; ++i) { - max_src_width = std::max(max_src_width, src_[i]->Width()); - max_src_height = std::max(max_src_height, src_[i]->Height()); - } - - const int grid_x = max_src_width * dst_[0]->Batch(); - const int grid_y = max_src_height; + const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); + const int grid_y = dst_[0]->Height() * dst_[0]->Depth(); const int grid_z = dst_[0]->Slices(); - return int3(grid_x, grid_y, grid_z); } diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc index 5fc04d12822..d9bd70cc6b3 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc @@ -105,8 +105,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr, *ptr = absl::make_unique(std::move(operation)); return absl::OkStatus(); } - case Axis::WIDTH: - case Axis::HEIGHT: { + case Axis::BATCH: + case Axis::DEPTH: + case Axis::HEIGHT: + case Axis::WIDTH: { ConcatXY operation = CreateConcatXY(op_def, attr, channels.size()); *ptr = absl::make_unique(std::move(operation)); return absl::OkStatus(); From 70e2387ecc190d415cde234c28ce5f5ae6b90bea Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 17 Jun 2020 17:29:11 -0700 Subject: [PATCH 0455/1390] Merged Reshape code generation for HWC/BHWC layouts. Added support of different layouts for src/dst. For example src - HWC, dst - BHWC, or vice versa. 
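A rough illustration of the unified index arithmetic described above (a minimal Python sketch with hypothetical names, not part of the patch): the generated kernel linearizes each destination element in its own layout and then peels the source coordinates off that offset, including the batch term only when the tensor layout actually has a BATCH axis, which is how a single code path now serves both HWC and BHWC.

```python
# Hypothetical helper, illustration only. Shapes are (B, H, W, C) tuples;
# pass batch = 1 and has_batch = False for plain HWC tensors.
def reshape_src_coords(b, y, x, c, src_shape, dst_shape,
                       src_has_batch, dst_has_batch):
    _, src_h, src_w, src_ch = src_shape
    _, dst_h, dst_w, dst_ch = dst_shape
    # Linear offset of the destination element in its own layout.
    base = b if dst_has_batch else 0
    p = ((base * dst_h + y) * dst_w + x) * dst_ch + c
    # Peel off the source coordinates, channels innermost.
    src_c = p % src_ch
    p //= src_ch
    src_x = p % src_w
    p //= src_w
    src_y = p % src_h
    src_b = p // src_h if src_has_batch else 0
    return src_b, src_y, src_x, src_c
```

When both `*_has_batch` flags are False the batch terms drop out and the arithmetic reduces to the old non-batched path, which is why the separate batched code generator could be removed.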
PiperOrigin-RevId: 316999310 Change-Id: I20bd9a12afba8bdcb832565f09350440349041bd --- .../lite/delegates/gpu/cl/kernels/reshape.cc | 79 ++++++------------- .../delegates/gpu/cl/kernels/reshapex4.cc | 69 ++++++---------- 2 files changed, 47 insertions(+), 101 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc index a99fff0a1da..5abfad60c1b 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc @@ -25,56 +25,6 @@ namespace gpu { namespace cl { namespace { -std::string GetReshapeBatchedCode(const OperationDef& op_def, Arguments* args) { - args->AddObjectRef( - "src_tensor", AccessType::READ, - absl::make_unique(op_def.src_tensors[0])); - args->AddObjectRef( - "dst_tensor", AccessType::WRITE, - absl::make_unique(op_def.dst_tensors[0])); - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / args.dst_tensor.Batch();\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - c += " FLT temps[4];\n"; - c += " temps[0] = (FLT)(0.0f);\n"; - c += " temps[1] = (FLT)(0.0f);\n"; - c += " temps[2] = (FLT)(0.0f);\n"; - c += " temps[3] = (FLT)(0.0f);\n"; - c += " int base = ((B * args.dst_tensor.Height() + Y) * " - "args.dst_tensor.Width() + X) * args.dst_tensor.Channels() + Z * 4;\n"; - c += " for (int i = 0; i < 4; ++i) {\n"; - c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < args.dst_tensor.Channels()) {;\n"; - c += " int p = base + i;\n"; - c += " int src_c = p % args.src_tensor.Channels();\n"; - c += " p = p / args.src_tensor.Channels();\n"; - c += " int src_x = p % args.src_tensor.Width();\n"; - c += " p = p / args.src_tensor.Width();\n"; - c += " int src_y = p % args.src_tensor.Height();\n"; - c += " int src_b = p / args.src_tensor.Height();\n"; - c += " int src_z = src_c / 4;\n"; - c += " int src_sub_ch = src_c % 4;\n"; - c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z, src_b);\n"; - c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; - c += " temps[i] = t_ar[src_sub_ch];\n"; - c += " }\n"; - c += " }\n"; - c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; - c += " args.dst_tensor.Write(result, X, Y, Z, B);\n"; - c += "}\n"; - return c; -} - std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { args->AddObjectRef( "src_tensor", AccessType::READ, @@ -86,7 +36,14 @@ std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { + c += " int linear_id = get_global_id(0);\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " args.dst_tensor.SetBatchRef(B);\n"; + } else { + c += " int X = get_global_id(0);\n"; + } c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " @@ -98,8 +55,13 @@ std::string GetReshapeCode(const OperationDef& op_def, 
Arguments* args) { c += " temps[1] = (FLT)(0.0f);\n"; c += " temps[2] = (FLT)(0.0f);\n"; c += " temps[3] = (FLT)(0.0f);\n"; - c += " int base = (Y * args.dst_tensor.Width() + X) * " - "args.dst_tensor.Channels() + Z * 4;\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { + c += " int base = B;\n"; + } else { + c += " int base = 0;\n"; + } + c += " base = ((base * args.dst_tensor.Height() + Y) * " + "args.dst_tensor.Width() + X) * args.dst_tensor.Channels() + Z * 4;\n"; c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_channel = Z * 4 + i;\n"; c += " if (dst_channel < args.dst_tensor.Channels()) {;\n"; @@ -107,7 +69,12 @@ std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { c += " int src_c = p % args.src_tensor.Channels();\n"; c += " p = p / args.src_tensor.Channels();\n"; c += " int src_x = p % args.src_tensor.Width();\n"; - c += " int src_y = p / args.src_tensor.Width();\n"; + c += " p = p / args.src_tensor.Width();\n"; + c += " int src_y = p % args.src_tensor.Height();\n"; + if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { + c += " int src_b = p / args.src_tensor.Height();\n"; + c += " args.src_tensor.SetBatchRef(src_b);\n"; + } c += " int src_z = src_c / 4;\n"; c += " int src_sub_ch = src_c % 4;\n"; c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z);\n"; @@ -137,9 +104,7 @@ Reshape& Reshape::operator=(Reshape&& operation) { } absl::Status Reshape::Compile(const CreationContext& creation_context) { - std::string code = definition_.IsBatchSupported() - ? GetReshapeBatchedCode(definition_, &args_) - : GetReshapeCode(definition_, &args_); + std::string code = GetReshapeCode(definition_, &args_); std::string element_wise_code; RETURN_IF_ERROR( MergeOperations(linked_operations_, &args_, &element_wise_code)); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc index 0847fce5836..3edbe637aa2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc @@ -25,40 +25,6 @@ namespace gpu { namespace cl { namespace { -std::string GetReshapeBatchedCode(const OperationDef& op_def, Arguments* args) { - args->AddObjectRef( - "src_tensor", AccessType::READ, - absl::make_unique(op_def.src_tensors[0])); - args->AddObjectRef( - "dst_tensor", AccessType::WRITE, - absl::make_unique(op_def.dst_tensors[0])); - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / args.dst_tensor.Batch();\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - c += " int dst_bhwc4 = ((B * args.dst_tensor.Height() + Y) * " - "args.dst_tensor.Width() + X) * args.dst_tensor.Slices() + Z;\n"; - c += " int src_z = dst_bhwc4 % args.src_tensor.Slices();\n"; - c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Slices();\n"; - c += " int src_x = dst_bhwc4 % args.src_tensor.Width();\n"; - c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Width();\n"; - c += " int src_y = dst_bhwc4 % args.src_tensor.Height();\n"; - c += " int src_b = dst_bhwc4 / args.src_tensor.Height();\n"; - c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z, src_b);\n"; - c += " args.dst_tensor.Write(result, X, Y, Z, 
B);\n"; - c += "}\n"; - return c; -} - std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { args->AddObjectRef( "src_tensor", AccessType::READ, @@ -70,19 +36,36 @@ std::string GetReshapeCode(const OperationDef& op_def, Arguments* args) { std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { + c += " int linear_id = get_global_id(0);\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " args.dst_tensor.SetBatchRef(B);\n"; + } else { + c += " int X = get_global_id(0);\n"; + } c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " "Z >= args.dst_tensor.Slices()) { \n"; c += " return; \n"; c += " } \n"; - c += " int dst_hwc4 = (Y * args.dst_tensor.Width() + X) * " - "args.dst_tensor.Slices() + Z;\n"; - c += " int src_z = dst_hwc4 % args.src_tensor.Slices();\n"; - c += " dst_hwc4 = dst_hwc4 / args.src_tensor.Slices();\n"; - c += " int src_x = dst_hwc4 % args.src_tensor.Width();\n"; - c += " int src_y = dst_hwc4 / args.src_tensor.Width();\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) { + c += " int dst_bhwc4 = B;\n"; + } else { + c += " int dst_bhwc4 = 0;\n"; + } + c += " dst_bhwc4 = ((dst_bhwc4 * args.dst_tensor.Height() + Y) * " + "args.dst_tensor.Width() + X) * args.dst_tensor.Slices() + Z;\n"; + c += " int src_z = dst_bhwc4 % args.src_tensor.Slices();\n"; + c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Slices();\n"; + c += " int src_x = dst_bhwc4 % args.src_tensor.Width();\n"; + c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Width();\n"; + c += " int src_y = dst_bhwc4 % args.src_tensor.Height();\n"; + if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { + c += " int src_b = dst_bhwc4 / args.src_tensor.Height();\n"; + c += " args.src_tensor.SetBatchRef(src_b);\n"; + } c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z);\n"; c += " args.dst_tensor.Write(result, X, Y, Z);\n"; c += "}\n"; @@ -105,9 +88,7 @@ Reshapex4& Reshapex4::operator=(Reshapex4&& operation) { } absl::Status Reshapex4::Compile(const CreationContext& creation_context) { - std::string code = definition_.IsBatchSupported() - ? GetReshapeBatchedCode(definition_, &args_) - : GetReshapeCode(definition_, &args_); + std::string code = GetReshapeCode(definition_, &args_); std::string element_wise_code; RETURN_IF_ERROR( MergeOperations(linked_operations_, &args_, &element_wise_code)); From adf0573f9d874b568c9680a6d3cf310e555dcd62 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 17 Jun 2020 17:37:35 -0700 Subject: [PATCH 0456/1390] Qualify uses of std::string PiperOrigin-RevId: 317000789 Change-Id: I6f847b235496d1dc8f8b3380e21ce890566a9a88 --- .../tests/fuse_binary_into_following_affine_test.cc | 5 +++-- .../tests/fuse_binary_into_preceding_affine_test.cc | 5 +++-- .../toco/graph_transformations/tests/lstm_utils_test.cc | 4 ++-- .../tests/resolve_constant_concatenation_test.cc | 8 ++++---- .../graph_transformations/tests/unpack_quantize_test.cc | 9 +++++---- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc index 2cba6824cfb..d6cf31f4211 100644 --- a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc +++ b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc @@ -43,14 +43,15 @@ class FuseBinaryIntoFollowingAffineTest : public ::testing::Test { void SetUp() override { model_.reset(new Model); } - void CreateArray(const string& name, const std::vector& shape) { + void CreateArray(const std::string& name, const std::vector& shape) { Array& array = model_->GetOrCreateArray(name); array.data_type = ArrayDataType::kFloat; Shape* array_shape = array.mutable_shape(); *(array_shape->mutable_dims()) = shape; } - void CreateConstantArray(const string& name, const std::vector& shape, + void CreateConstantArray(const std::string& name, + const std::vector& shape, const std::vector& data) { CreateArray(name, shape); Array& array = model_->GetOrCreateArray(name); diff --git a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_preceding_affine_test.cc b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_preceding_affine_test.cc index b5c321c1a26..6c3dc7dc761 100644 --- a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_preceding_affine_test.cc +++ b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_preceding_affine_test.cc @@ -43,14 +43,15 @@ class FuseBinaryIntoPrecedingAffineTest : public ::testing::Test { void SetUp() override { model_.reset(new Model); } - void CreateArray(const string& name, const std::vector& shape) { + void CreateArray(const std::string& name, const std::vector& shape) { Array& array = model_->GetOrCreateArray(name); array.data_type = ArrayDataType::kFloat; Shape* array_shape = array.mutable_shape(); *(array_shape->mutable_dims()) = shape; } - void CreateConstantArray(const string& name, const std::vector& shape, + void CreateConstantArray(const std::string& name, + const std::vector& shape, const std::vector& data) { CreateArray(name, shape); Array& array = model_->GetOrCreateArray(name); diff --git a/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc b/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc index bdb27e8af2e..204e197e186 100644 --- a/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc +++ b/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc @@ -46,12 +46,12 @@ class CopyArrayDataTest : public ::testing::Test { int src_dim_1, int src_dim_2, std::initializer_list dst_data, int dst_dim_1, int dst_dim_2) { - string src_array = "src_array"; + std::string src_array = "src_array"; src_buffer_ = CreateFloatArrayBuffer( model, &src_array, src_dim_2 == 1 ? 
Shape({src_dim_1}) : Shape({src_dim_1, src_dim_2})); PopulateBuffer(src_buffer_, src_data); - string dst_array = "dst_array"; + std::string dst_array = "dst_array"; dst_buffer_ = CreateFloatArrayBuffer( model, &dst_array, dst_dim_2 == 1 ? Shape({dst_dim_1}) : Shape({dst_dim_1, dst_dim_2})); diff --git a/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc index bfed38ce7aa..5b0566fe074 100644 --- a/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc +++ b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc @@ -107,10 +107,10 @@ class ResolveConstantConcatenationTest : public ::testing::Test { // together with 4 arrays as its inputs. // It receives the dimension of concatenation as input. void PrepareModel(Model* model, int axis) { - const string output_name("concat_op_output"); + const std::string output_name("concat_op_output"); model->flags.add_output_arrays(output_name); - std::vector concat_input_names = {"array0", "array1", "array2", - "array3"}; + std::vector concat_input_names = {"array0", "array1", "array2", + "array3"}; const int kDim = 3; const int kElementPerDim = 2; @@ -122,7 +122,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test { {20., 21., 22., 23., 24., 25., 26., 27.}, {30., 31., 32., 33., 34., 35., 36., 37.}}; int cnt = 0; - for (const string& concat_input_name : concat_input_names) { + for (const std::string& concat_input_name : concat_input_names) { Array& in_array = model->GetOrCreateArray(concat_input_name); in_array.data_type = ArrayDataType::kFloat; diff --git a/tensorflow/lite/toco/graph_transformations/tests/unpack_quantize_test.cc b/tensorflow/lite/toco/graph_transformations/tests/unpack_quantize_test.cc index 2dc3fb35b0f..3cc4e725463 100755 --- a/tensorflow/lite/toco/graph_transformations/tests/unpack_quantize_test.cc +++ b/tensorflow/lite/toco/graph_transformations/tests/unpack_quantize_test.cc @@ -40,10 +40,11 @@ class UnpackQuantizeTest : public ::testing::Test { // 1. calculate min and max of the input. // 2. insert dequantization nodes after quantized outputs of Unpack operation. void PrepareModel(Model* model, int axis) { - std::vector unpack_output_names = {"unpack_out0", "unpack_out1"}; + std::vector unpack_output_names = {"unpack_out0", + "unpack_out1"}; model->flags.add_output_arrays(unpack_output_names[0]); model->flags.add_output_arrays(unpack_output_names[1]); - const string unpack_input_name("unpack_op_input"); + const std::string unpack_input_name("unpack_op_input"); const int kDim = 2; const int kElementPerDim = 2; @@ -75,7 +76,7 @@ class UnpackQuantizeTest : public ::testing::Test { // Configuring the necessary outputs. The outputs also happen to be in // kFloat. This is because during quantization transformation data types for // these arrays are going to be forced to be kUint8. 
- for (const string& unpack_output_name : unpack_output_names) { + for (const std::string& unpack_output_name : unpack_output_names) { Array& out_array = model->GetOrCreateArray(unpack_output_name); out_array.GetOrCreateMinMax(); out_array.data_type = ArrayDataType::kFloat; @@ -109,7 +110,7 @@ TEST_F(UnpackQuantizeTest, CheckUnpackPreservesQuantizationParameters) { ->Run(&model, /*op_index=*/0, &modified) .ok()); - const string output_name = model.flags.output_arrays(0); + const std::string output_name = model.flags.output_arrays(0); // Quantization transformation inserts NODE_NAME_DEQUANTIZE operations, // effectively making them the new outputs of the array. Old outputs of the From 56db128697fd0e9d3f689d472d53e33bf4ea1bb8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 17:56:36 -0700 Subject: [PATCH 0457/1390] Qualify uses of std::string PiperOrigin-RevId: 317003622 Change-Id: Iae6a9a287ffd3b97dee8b9993c443db322936592 --- .../lite/toco/logging/conversion_log_util.cc | 67 ++++++++++--------- .../lite/toco/logging/conversion_log_util.h | 22 +++--- .../toco/logging/conversion_log_util_test.cc | 60 ++++++++--------- 3 files changed, 77 insertions(+), 72 deletions(-) diff --git a/tensorflow/lite/toco/logging/conversion_log_util.cc b/tensorflow/lite/toco/logging/conversion_log_util.cc index c23c305c750..55afa1370b3 100644 --- a/tensorflow/lite/toco/logging/conversion_log_util.cc +++ b/tensorflow/lite/toco/logging/conversion_log_util.cc @@ -34,8 +34,8 @@ namespace toco { namespace { -string TryGetOperatorName(const Operator& op) { - string op_name; +std::string TryGetOperatorName(const Operator& op) { + std::string op_name; if (!op.tensorflow_node_def.empty()) { // Parse op name from serialized NodeDef. tensorflow::NodeDef node_def; @@ -63,8 +63,8 @@ string TryGetOperatorName(const Operator& op) { return op_name; } -string GetOSVersion() { - string os_info; +std::string GetOSVersion() { + std::string os_info; #ifdef __linux__ utsname info; if (uname(&info)) { @@ -72,12 +72,13 @@ string GetOSVersion() { LOG(ERROR) << "Cannot get OS info."; return ""; } - os_info = string(info.sysname) + ";OSVer=" + string(info.release) + ";"; + os_info = + std::string(info.sysname) + ";OSVer=" + std::string(info.release) + ";"; #endif return os_info; } -string ShapeToStringNoSpace(const Shape& shape) { +std::string ShapeToStringNoSpace(const Shape& shape) { if (shape.dimensions_count() == 0) { return "[]"; } @@ -85,13 +86,13 @@ string ShapeToStringNoSpace(const Shape& shape) { return absl::StrCat("[", absl::StrJoin(shape.dims(), ","), "]"); } -string GetOperatorSignature( +std::string GetOperatorSignature( const Model& model, const Operator& op, const std::map>& op_types_map) { // The signature of an op has the following schema: // INPUT:SHAPE::TYPE::OUTPUT:SHAPE::TYPE::NAME:VERSION: - string op_signature; + std::string op_signature; constexpr char delimiter[] = "::"; // Get input shapes and types. 
@@ -137,8 +138,8 @@ string GetOperatorSignature( } // namespace -std::vector GetOperatorNames(const Model& model) { - std::vector op_names; +std::vector GetOperatorNames(const Model& model) { + std::vector op_names; for (const auto& op : model.operators) { op_names.push_back(TryGetOperatorName(*op)); } @@ -146,9 +147,9 @@ std::vector GetOperatorNames(const Model& model) { } void CountOperatorsByType(const Model& model, - std::map* built_in_ops, - std::map* custom_ops, - std::map* select_ops) { + std::map* built_in_ops, + std::map* custom_ops, + std::map* select_ops) { for (const auto& op : model.operators) { OperatorSignature op_signature = {op.get(), &model}; const auto ops_by_type = @@ -156,7 +157,7 @@ void CountOperatorsByType(const Model& model, tflite::details::OperatorKey op_key(op_signature, ops_by_type, true /*enable_select_tf_ops*/); - const string op_name = TryGetOperatorName(*op); + const std::string op_name = TryGetOperatorName(*op); if (op_key.is_custom_op()) { (*custom_ops)[op_name]++; } else if (op_key.is_flex_op()) { @@ -168,8 +169,9 @@ void CountOperatorsByType(const Model& model, } void GetInputAndOutputTypes( - const Model& model, TFLITE_PROTO_NS::RepeatedPtrField* input_types, - TFLITE_PROTO_NS::RepeatedPtrField* output_types) { + const Model& model, + TFLITE_PROTO_NS::RepeatedPtrField* input_types, + TFLITE_PROTO_NS::RepeatedPtrField* output_types) { for (const auto& input_array : model.flags.input_arrays()) { const Array& array = model.GetArray(input_array.name()); input_types->Add(ArrayDataTypeName(array.data_type)); @@ -180,15 +182,16 @@ void GetInputAndOutputTypes( } } -string GetTfLiteVersion() { return TFLITE_VERSION_STRING; } +std::string GetTfLiteVersion() { return TFLITE_VERSION_STRING; } -string GetCachedOSVersion() { - static string* version = new string(GetOSVersion()); +std::string GetCachedOSVersion() { + static std::string* version = new std::string(GetOSVersion()); return *version; } -void GetOpSignatures(const Model& model, - TFLITE_PROTO_NS::RepeatedPtrField* op_signatures) { +void GetOpSignatures( + const Model& model, + TFLITE_PROTO_NS::RepeatedPtrField* op_signatures) { const auto& op_types_map = tflite::BuildOperatorByTypeMap(true /*enable_select_tf_ops*/); for (const auto& op : model.operators) { @@ -196,7 +199,7 @@ void GetOpSignatures(const Model& model, } } -string GetModelHash(const Model& model) { +std::string GetModelHash(const Model& model) { // TODO(b/123519920): Implement the hash function for Model. // Need to consider different implementations for public/private models. return ""; @@ -204,18 +207,18 @@ string GetModelHash(const Model& model) { // This function scans through the error message string, extracts the part about // missing ops and prunes away all other information in the error info. -string SanitizeErrorMessage(const string& error_message) { - const string s1 = "Ops that can be supported by the flex runtime"; - const string s2 = "Ops that need custom implementation"; - string pruned_message; +std::string SanitizeErrorMessage(const std::string& error_message) { + const std::string s1 = "Ops that can be supported by the flex runtime"; + const std::string s2 = "Ops that need custom implementation"; + std::string pruned_message; size_t pos = error_message.find(s1); - if (pos != string::npos) { + if (pos != std::string::npos) { // Find the terminate point for flex op list. 
auto end = error_message.find(".", pos); pruned_message.append(error_message.substr(pos, end - pos + 1)); } pos = error_message.find(s2); - if (pos != string::npos) { + if (pos != std::string::npos) { // Find the terminate point for custom op list. auto end = error_message.find(".", pos); pruned_message.append(error_message.substr(pos, end - pos + 1)); @@ -225,18 +228,18 @@ string SanitizeErrorMessage(const string& error_message) { void PopulateConversionLog(const Model& model, TocoConversionLog* log) { // Get the list of ops after conversion. - const std::vector op_names = GetOperatorNames(model); + const std::vector op_names = GetOperatorNames(model); for (const auto& op_name : op_names) { log->add_op_list(op_name); } // Get op signatures. - TFLITE_PROTO_NS::RepeatedPtrField op_signatures; + TFLITE_PROTO_NS::RepeatedPtrField op_signatures; GetOpSignatures(model, &op_signatures); log->mutable_op_signatures()->CopyFrom(op_signatures); // Get op counts by category: custom, built-in or select. - std::map custom_ops, select_ops, built_in_ops; + std::map custom_ops, select_ops, built_in_ops; CountOperatorsByType(model, &built_in_ops, &custom_ops, &select_ops); log->mutable_custom_ops()->insert(custom_ops.cbegin(), custom_ops.cend()); log->mutable_built_in_ops()->insert(built_in_ops.cbegin(), @@ -244,7 +247,7 @@ void PopulateConversionLog(const Model& model, TocoConversionLog* log) { log->mutable_select_ops()->insert(select_ops.cbegin(), select_ops.cend()); // Get the model's input and output types. - TFLITE_PROTO_NS::RepeatedPtrField input_types, output_types; + TFLITE_PROTO_NS::RepeatedPtrField input_types, output_types; GetInputAndOutputTypes(model, &input_types, &output_types); log->mutable_input_tensor_types()->CopyFrom(input_types); log->mutable_output_tensor_types()->CopyFrom(output_types); diff --git a/tensorflow/lite/toco/logging/conversion_log_util.h b/tensorflow/lite/toco/logging/conversion_log_util.h index 2237615adbb..c21ec0792cc 100644 --- a/tensorflow/lite/toco/logging/conversion_log_util.h +++ b/tensorflow/lite/toco/logging/conversion_log_util.h @@ -25,37 +25,39 @@ namespace toco { // This function scans through the error message string, extracts the part about // missing ops and prunes away all other information in the error info. -string SanitizeErrorMessage(const string& error_message); +std::string SanitizeErrorMessage(const std::string& error_message); // Populates the TocoConversionLog proto after analyzing the model. void PopulateConversionLog(const Model& model, TocoConversionLog* log); // Returns the names of the operators in the model. -std::vector GetOperatorNames(const Model& model); +std::vector GetOperatorNames(const Model& model); // Counts the number of different types of operators in the model: // Built-in ops, custom ops and select ops. // Each map is mapping from the name of the operator (such as 'Conv') to its // total number of occurrences in the model. void CountOperatorsByType(const Model& model, - std::map* built_in_ops, - std::map* custom_ops, - std::map* select_ops); + std::map* built_in_ops, + std::map* custom_ops, + std::map* select_ops); // Gets the input and output types of the model. The input and output is // specified by model.flags.input_arrays and model.flags.output_arrays. 
void GetInputAndOutputTypes( - const Model& model, TFLITE_PROTO_NS::RepeatedPtrField* input_types, - TFLITE_PROTO_NS::RepeatedPtrField* output_types); + const Model& model, + TFLITE_PROTO_NS::RepeatedPtrField* input_types, + TFLITE_PROTO_NS::RepeatedPtrField* output_types); // Calculates signatures for all the ops in the model. An op signature is // defined by its input/output shapes and types, op name and its version. -void GetOpSignatures(const Model& model, - TFLITE_PROTO_NS::RepeatedPtrField* op_signatures); +void GetOpSignatures( + const Model& model, + TFLITE_PROTO_NS::RepeatedPtrField* op_signatures); // TODO(b/123519920): Implement this. // Calculates a unique hash for the model. -string GetModelHash(const Model& model); +std::string GetModelHash(const Model& model); } // namespace toco diff --git a/tensorflow/lite/toco/logging/conversion_log_util_test.cc b/tensorflow/lite/toco/logging/conversion_log_util_test.cc index c4960715f25..17111eca6d0 100644 --- a/tensorflow/lite/toco/logging/conversion_log_util_test.cc +++ b/tensorflow/lite/toco/logging/conversion_log_util_test.cc @@ -58,9 +58,9 @@ TEST(ConversionLogUtilTest, TestCountOperatorsByType) { Model model; // 1st Conv operator. std::unique_ptr conv1(new ConvOperator()); - const string conv1_input_name = "conv_input1"; - const string conv1_filter_name = "conv_filter1"; - const string conv1_output_name = "conv_output1"; + const std::string conv1_input_name = "conv_input1"; + const std::string conv1_filter_name = "conv_filter1"; + const std::string conv1_output_name = "conv_output1"; conv1->inputs.push_back(conv1_input_name); conv1->inputs.push_back(conv1_filter_name); conv1->outputs.push_back(conv1_output_name); @@ -71,9 +71,9 @@ TEST(ConversionLogUtilTest, TestCountOperatorsByType) { // 2nd Conv operator. std::unique_ptr conv2(new ConvOperator()); - const string conv2_input_name = "conv_input2"; - const string conv2_filter_name = "conv_filter2"; - const string conv2_output_name = "conv_output2"; + const std::string conv2_input_name = "conv_input2"; + const std::string conv2_filter_name = "conv_filter2"; + const std::string conv2_output_name = "conv_output2"; conv2->inputs.push_back(conv2_input_name); conv2->inputs.push_back(conv2_filter_name); conv2->outputs.push_back(conv2_output_name); @@ -83,7 +83,7 @@ TEST(ConversionLogUtilTest, TestCountOperatorsByType) { // Mean operator. 
std::unique_ptr mean(new MeanOperator()); - const string mean_input_name = "mean_input"; + const std::string mean_input_name = "mean_input"; mean->inputs.push_back(mean_input_name); array_map[mean_input_name] = std::unique_ptr(new Array); @@ -111,26 +111,26 @@ TEST(ConversionLogUtilTest, TestCountOperatorsByType) { model.operators.push_back(std::move(elu_grad)); model.operators.push_back(std::move(my_custom_op)); - std::map built_in_ops, select_ops, custom_ops; + std::map built_in_ops, select_ops, custom_ops; CountOperatorsByType(model, &built_in_ops, &custom_ops, &select_ops); EXPECT_THAT(built_in_ops, - UnorderedElementsAre(std::pair("Conv", 2), - std::pair("Mean", 1))); + UnorderedElementsAre(std::pair("Conv", 2), + std::pair("Mean", 1))); EXPECT_THAT(select_ops, - UnorderedElementsAre(std::pair("AvgPool3D", 1), - std::pair("EluGrad", 1))); - EXPECT_THAT(custom_ops, UnorderedElementsAre( - std::pair("MyAwesomeCustomOp", 1))); + UnorderedElementsAre(std::pair("AvgPool3D", 1), + std::pair("EluGrad", 1))); + EXPECT_THAT(custom_ops, UnorderedElementsAre(std::pair( + "MyAwesomeCustomOp", 1))); } TEST(ConversionLogUtilTest, TestGetInputAndOutputTypes) { Model model; auto& array_map = model.GetMutableArrayMap(); - const string input1 = "conv_input"; - const string input2 = "conv_filter"; - const string input3 = "feature"; - const string output = "softmax"; + const std::string input1 = "conv_input"; + const std::string input2 = "conv_filter"; + const std::string input3 = "feature"; + const std::string output = "softmax"; array_map[input1] = std::unique_ptr(new Array); array_map[input1]->data_type = ArrayDataType::kFloat; array_map[input2] = std::unique_ptr(new Array); @@ -149,7 +149,7 @@ TEST(ConversionLogUtilTest, TestGetInputAndOutputTypes) { *model.flags.add_input_arrays() = input_arrays[2]; model.flags.add_output_arrays(output); - TFLITE_PROTO_NS::RepeatedPtrField input_types, output_types; + TFLITE_PROTO_NS::RepeatedPtrField input_types, output_types; GetInputAndOutputTypes(model, &input_types, &output_types); EXPECT_THAT(input_types, ElementsAre("float", "float", "int16")); @@ -161,9 +161,9 @@ TEST(ConversionLogUtilTest, TestGetOpSignatures) { auto& array_map = model.GetMutableArrayMap(); std::unique_ptr conv(new ConvOperator()); - const string conv_input_name = "conv_input"; - const string conv_filter_name = "conv_filter"; - const string conv_output_name = "conv_output"; + const std::string conv_input_name = "conv_input"; + const std::string conv_filter_name = "conv_filter"; + const std::string conv_output_name = "conv_output"; conv->inputs.push_back(conv_input_name); conv->inputs.push_back(conv_filter_name); conv->outputs.push_back(conv_output_name); @@ -177,15 +177,15 @@ TEST(ConversionLogUtilTest, TestGetOpSignatures) { array_map[conv_output_name]->data_type = ArrayDataType::kFloat; array_map[conv_output_name]->copy_shape({4, 4, 2}); - const string mean_input_name = "mean_input"; - const string mean_output_name = "mean_output"; + const std::string mean_input_name = "mean_input"; + const std::string mean_output_name = "mean_output"; std::unique_ptr mean(new MeanOperator()); mean->inputs.push_back(mean_input_name); mean->outputs.push_back(mean_output_name); array_map[mean_input_name] = std::unique_ptr(new Array); array_map[mean_output_name] = std::unique_ptr(new Array); - const string avg_pool_3d_output_name = "avg_pool_output"; + const std::string avg_pool_3d_output_name = "avg_pool_output"; auto avg_pool_3d = absl::make_unique(); avg_pool_3d->tensorflow_op = "AvgPool3D"; 
tensorflow::NodeDef node_def; @@ -197,7 +197,7 @@ TEST(ConversionLogUtilTest, TestGetOpSignatures) { array_map[avg_pool_3d_output_name]->data_type = ArrayDataType::kInt32; array_map[avg_pool_3d_output_name]->copy_shape({2, 2}); - const string custom_op_output_name = "custom_op_output"; + const std::string custom_op_output_name = "custom_op_output"; auto my_custom_op = absl::make_unique(); my_custom_op->tensorflow_op = "MyAwesomeCustomOp"; my_custom_op->inputs.push_back(avg_pool_3d_output_name); @@ -211,7 +211,7 @@ TEST(ConversionLogUtilTest, TestGetOpSignatures) { model.operators.push_back(std::move(avg_pool_3d)); model.operators.push_back(std::move(my_custom_op)); - TFLITE_PROTO_NS::RepeatedPtrField op_signatures; + TFLITE_PROTO_NS::RepeatedPtrField op_signatures; GetOpSignatures(model, &op_signatures); EXPECT_THAT(op_signatures, UnorderedElementsAre( @@ -225,14 +225,14 @@ TEST(ConversionLogUtilTest, TestGetOpSignatures) { } TEST(ConversionLogUtilTest, TestSanitizeErrorMessage) { - const string error = + const std::string error = "error: failed while converting: 'main': Ops that can be supported by " "the flex runtime (enabled via setting the -emit-select-tf-ops flag): " "ResizeNearestNeighbor,ResizeNearestNeighbor. Ops that need custom " "implementation (enabled via setting the -emit-custom-ops flag): " "CombinedNonMaxSuppression.\nTraceback (most recent call last): File " "/usr/local/bin/toco_from_protos, line 8, in "; - const string pruned_error = + const std::string pruned_error = "Ops that can be supported by " "the flex runtime (enabled via setting the -emit-select-tf-ops flag): " "ResizeNearestNeighbor,ResizeNearestNeighbor.Ops that need custom " @@ -242,7 +242,7 @@ TEST(ConversionLogUtilTest, TestSanitizeErrorMessage) { } TEST(ConversionLogUtilTest, TestSanitizeErrorMessageNoMatching) { - const string error = + const std::string error = "error: failed while converting: 'main': Traceback (most recent call " "last): File " "/usr/local/bin/toco_from_protos, line 8, in "; From c31f2ca4a29d4469f29c57735d74cbd2748c0e03 Mon Sep 17 00:00:00 2001 From: Sam Holt Date: Tue, 16 Jun 2020 16:50:16 +0100 Subject: [PATCH 0458/1390] fix: convolutional padding argument valid and same explaination --- .../python/keras/layers/convolutional.py | 55 +++++++++++++++---- .../keras/layers/convolutional_recurrent.py | 6 ++ tensorflow/python/keras/layers/local.py | 2 + tensorflow/python/keras/layers/pooling.py | 22 ++++++-- .../keras/legacy_tf_layers/convolutional.py | 42 ++++++++++++++ tensorflow/python/keras/utils/conv_utils.py | 12 ++++ tensorflow/python/ops/nn_ops.py | 3 + 7 files changed, 128 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 51f4e3b320a..471d94570a5 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -72,6 +72,10 @@ class Conv(Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal + (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. 
`channels_last` corresponds to inputs with shape @@ -418,6 +422,9 @@ class Conv1D(Conv): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. Useful when modeling temporal data where the model should not violate the temporal order. @@ -571,6 +578,9 @@ class Conv2D(Conv): specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while @@ -712,6 +722,9 @@ class Conv3D(Conv): specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `batch_shape + (spatial_dim1, spatial_dim2, @@ -833,6 +846,9 @@ class Conv1DTranspose(Conv1D): time dimension. Specifying a stride value != 1 is incompatible with specifying a `dilation_rate` value != 1. Defaults to 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer specifying the amount of padding along the time dimension of the output tensor. The amount of output padding must be lower than the stride. @@ -1083,6 +1099,9 @@ class Conv2DTranspose(Conv2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer or tuple/list of 2 integers, specifying the amount of padding along the height and width of the output tensor. @@ -1371,19 +1390,22 @@ class Conv3DTranspose(Conv3D): Arguments: filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). + (i.e. the number of output filters in the convolution). kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. + depth, height and width of the 3D convolution window. 
+ Can be a single integer to specify the same value for + all spatial dimensions. strides: An integer or tuple/list of 3 integers, - specifying the strides of the convolution along the depth, height - and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. + specifying the strides of the convolution along the depth, height + and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer or tuple/list of 3 integers, specifying the amount of padding along the depth, height, and width. @@ -1681,6 +1703,9 @@ class SeparableConv(Conv): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1885,6 +1910,10 @@ class SeparableConv1D(SeparableConv): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal + (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -2070,6 +2099,9 @@ class SeparableConv2D(SeparableConv): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -2230,6 +2262,9 @@ class DepthwiseConv2D(Conv2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `'valid'` or `'same'` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. depth_multiplier: The number of depthwise convolution output channels for each input channel. 
The total number of depthwise convolution output diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py index 19831429b73..54196f8725c 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent.py @@ -434,6 +434,9 @@ class ConvLSTM2DCell(DropoutRNNCellMixin, Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. It defaults to the `image_data_format` value found in your @@ -710,6 +713,9 @@ class ConvLSTM2D(ConvRNN2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py index 3e9c0f9c0a3..c33c88f3a3d 100644 --- a/tensorflow/python/keras/layers/local.py +++ b/tensorflow/python/keras/layers/local.py @@ -67,6 +67,7 @@ class LocallyConnected1D(Layer): any `dilation_rate` value != 1. padding: Currently only supports `"valid"` (case-insensitive). `"same"` may be supported in the future. + `"valid"` means no padding. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -375,6 +376,7 @@ class LocallyConnected2D(Layer): all spatial dimensions. padding: Currently only support `"valid"` (case-insensitive). `"same"` will be supported in future. + `"valid"` means no padding. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py index ff7d157acad..51dc5131a8a 100644 --- a/tensorflow/python/keras/layers/pooling.py +++ b/tensorflow/python/keras/layers/pooling.py @@ -164,8 +164,9 @@ class MaxPooling1D(Pooling1D): for each pooling step. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - "valid" adds no padding. "same" adds padding such that if the stride - is 1, the output shape is the same as the input shape. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -209,6 +210,9 @@ class AveragePooling1D(Pooling1D): E.g. 2 will halve the input. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -419,8 +423,9 @@ class MaxPooling2D(Pooling2D): Strides values. Specifies how far the pooling window moves for each pooling step. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - "valid" adds no zero padding. "same" adds padding such that if the stride - is 1, the output shape is the same as input shape. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -475,6 +480,9 @@ class AveragePooling2D(Pooling2D): Strides values. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -617,6 +625,9 @@ class MaxPooling3D(Pooling3D): `(2, 2, 2)` will halve the size of the 3D input in each dimension. strides: tuple of 3 integers, or None. Strides values. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -667,6 +678,9 @@ class AveragePooling3D(Pooling3D): `(2, 2, 2)` will halve the size of the 3D input in each dimension. strides: tuple of 3 integers, or None. Strides values. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/legacy_tf_layers/convolutional.py b/tensorflow/python/keras/legacy_tf_layers/convolutional.py index 4c91251a0e7..4fd53531fd1 100644 --- a/tensorflow/python/keras/legacy_tf_layers/convolutional.py +++ b/tensorflow/python/keras/legacy_tf_layers/convolutional.py @@ -46,6 +46,9 @@ class Conv1D(keras_layers.Conv1D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -157,6 +160,9 @@ def conv1d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. 
`"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -242,6 +248,9 @@ class Conv2D(keras_layers.Conv2D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -360,6 +369,9 @@ def conv2d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -449,6 +461,9 @@ class Conv3D(keras_layers.Conv3D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -568,6 +583,9 @@ def conv3d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -652,6 +670,9 @@ class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -761,6 +782,9 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. 
`"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -897,6 +921,9 @@ def separable_conv1d(inputs, Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1019,6 +1046,9 @@ def separable_conv2d(inputs, Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1117,6 +1147,9 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer): of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1223,6 +1256,9 @@ def conv2d_transpose(inputs, of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1295,6 +1331,9 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer): Can be a single integer to specify the same value for all spatial dimensions. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1396,6 +1435,9 @@ def conv3d_transpose(inputs, of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py index f38fdc18252..e8ee866d958 100644 --- a/tensorflow/python/keras/utils/conv_utils.py +++ b/tensorflow/python/keras/utils/conv_utils.py @@ -264,6 +264,9 @@ def conv_kernel_mask(input_shape, kernel_shape, strides, padding): receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. Returns: A boolean 2N-D `np.ndarray` of shape @@ -338,6 +341,9 @@ def conv_kernel_idxs(input_shape, kernel_shape, strides, padding, filters_in, receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. filters_in: `int`, number if filters in the input to the layer. filters_out: `int', number if filters in the output of the layer. data_format: string, "channels_first" or "channels_last". @@ -430,6 +436,9 @@ def conv_connected_inputs(input_shape, kernel_shape, output_position, strides, in the output of the convolution. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. Returns: N ranges `[[p_in_left1, ..., p_in_right1], ..., @@ -468,6 +477,9 @@ def conv_output_shape(input_shape, kernel_shape, strides, padding): receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. Returns: tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output. diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 1318f575737..5a9a63637f6 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -940,6 +940,9 @@ def convolution( filter: An (N+2)-D `Tensor` with the same type as `input` and shape `spatial_filter_shape + [in_channels, out_channels]`. padding: A string, either `"VALID"` or `"SAME"`. The padding algorithm. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. strides: Optional. Sequence of N ints >= 1. Specifies the output stride. Defaults to [1]*N. If any value of strides is > 1, then all values of dilation_rate must be 1. 
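The `"valid"`/`"same"` wording added throughout the hunks above is easiest to see with concrete shapes. Below is a minimal sketch assuming a standard TensorFlow 2.x install; the 8x8 input and 3x3 filter are arbitrary choices for illustration, and the upper-case padding strings follow the `tf.nn` convention shown in the `nn_ops.py` hunk (the Keras layers accept the same strings case-insensitively).

```python
import tensorflow as tf

# A single 8x8 image with one channel, and a single 3x3 filter.
image = tf.ones([1, 8, 8, 1])
kernel = tf.ones([3, 3, 1, 1])

# "VALID": no padding, so each spatial dimension shrinks by (kernel_size - 1).
valid = tf.nn.conv2d(image, kernel, strides=1, padding="VALID")
print(valid.shape)  # (1, 6, 6, 1)

# "SAME": zeros are added evenly to the left/right and top/bottom so that,
# with stride 1, the output keeps the input's height/width.
same = tf.nn.conv2d(image, kernel, strides=1, padding="SAME")
print(same.shape)  # (1, 8, 8, 1)
```

With a stride greater than 1, `"same"` instead yields `ceil(input_size / stride)` along each padded dimension, which is why the docstring text ties the "same height/width" claim to the unit-stride case.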
From c0ba8a09a7edf2b617d10024f7e0ec9abe21d73e Mon Sep 17 00:00:00 2001 From: Xinyi Wang Date: Wed, 17 Jun 2020 18:24:26 -0700 Subject: [PATCH 0459/1390] Export and Document DistributedDataset and DistributedIterator APIs PiperOrigin-RevId: 317007583 Change-Id: I7d7c4615a12a19fb4fd151a0457f176ffe2cd765 --- .../python/distribute/distribute_lib.py | 120 ++++--- tensorflow/python/distribute/input_lib.py | 324 +++++++++++++++++- tensorflow/python/distribute/values.py | 17 +- ...flow.distribute.-distributed-dataset.pbtxt | 16 + ...low.distribute.-distributed-iterator.pbtxt | 16 + .../api/golden/v2/tensorflow.distribute.pbtxt | 8 + 6 files changed, 444 insertions(+), 57 deletions(-) create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-dataset.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index a6dc35507e9..b6a89463426 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -684,7 +684,8 @@ class StrategyBase(object): instead. * Use `tf.distribute.Strategy.run` to run a function once per replica, taking values that may be "per-replica" (e.g. - from a distributed dataset) and returning "per-replica" values. + from a `tf.distribute.DistributedDataset` object) and returning + "per-replica" values. This function is executed in "replica context", which means each operation is performed separately on each replica. * Finally use a method (such as `tf.distribute.Strategy.reduce`) to @@ -720,7 +721,8 @@ class StrategyBase(object): distributed-specific behavior. You can use the `reduce` API to aggregate results across replicas and use - this as a return value from one iteration over the distributed dataset. Or + this as a return value from one iteration over a + `tf.distribute.DistributedDataset`. Or you can use `tf.keras.metrics` (such as loss, accuracy, etc.) to accumulate metrics across steps in a given epoch. @@ -859,12 +861,12 @@ class StrategyBase(object): return self.run(fn, args=args) def experimental_distribute_dataset(self, dataset, options=None): - """Distributes a tf.data.Dataset instance provided via `dataset`. + """Creates `tf.distribute.DistributedDataset` from `tf.data.Dataset`. - The returned distributed dataset can be iterated over similar to how - regular datasets can. - NOTE: Currently, the user cannot add any more transformations to a - distributed dataset. + The returned `tf.distribute.DistributedDataset` can be iterated over + similar to how regular datasets can. + NOTE: The user cannot add any more transformations to a + `tf.distribute.DistributedDataset`. The following is an example: @@ -878,48 +880,53 @@ class StrategyBase(object): # Distribute that dataset dist_dataset = strategy.experimental_distribute_dataset(dataset) - # Iterate over the distributed dataset + # Iterate over the `tf.distribute.DistributedDataset` for x in dist_dataset: # process dataset elements strategy.run(replica_fn, args=(x,)) ``` - In the code snippet above, the dataset `dist_dataset` is batched by - GLOBAL_BATCH_SIZE, and we iterate through it using `for x in dist_dataset`, - where x is one batch of data of GLOBAL_BATCH_SIZE containing N batches of - data of per-replica batch size, corresponding to N replicas. 
- `tf.distribute.Strategy.run` will take care of feeding - the right per-replica batch to the right `replica_fn` execution on each + In the code snippet above, the `tf.distribute.DistributedDataset` + `dist_dataset` is batched by `GLOBAL_BATCH_SIZE`, and we iterate through it + using `for x in dist_dataset`. `x` a `tf.distribute.DistributedValues` + containing data for all replicas, which aggregates to a batch of + `GLOBAL_BATCH_SIZE`. `tf.distribute.Strategy.run` will take care of feeding + the right per-replica data in `x` to the right `replica_fn` executed on each replica. - In a multi-worker setting, we will first attempt to distribute the dataset - by attempting to detect whether the dataset is being created out of - ReaderDatasets (e.g. TFRecordDataset, TextLineDataset, etc.) and if so, - attempting to shard the input files. Note that there has to be at least one - input file per worker. If you have less than one input file per worker, we - suggest that you should disable distributing your dataset using the method - below. + What's under the hood of this method, when we say the `tf.data.Dataset` + instance - `dataset` - gets distributed? It depends on how you set the + `tf.data.experimental.AutoShardPolicy` through + `tf.data.experimental.DistributeOptions`. By default, it is set to + `tf.data.experimental.AutoShardPolicy.AUTO`. In a multi-worker setting, we + will first attempt to distribute `dataset` by detecting whether `dataset` is + being created out of reader datasets (e.g. `tf.data.TFRecordDataset`, + `tf.data.TextLineDataset`, etc.) and if so, try to shard the input files. + Note that there has to be at least one input file per worker. If you have + less than one input file per worker, we suggest that you disable dataset + sharding across workers, by setting the + `tf.data.experimental.DistributeOptions.auto_shard_policy` to be + `tf.data.experimental.AutoShardPolicy.OFF`. - If that attempt is unsuccessful (e.g. the dataset is created from a - Dataset.range), we will shard the dataset evenly at the end by appending a - `.shard` operation to the end of the processing pipeline. This will cause - the entire preprocessing pipeline for all the data to be run on every - worker, and each worker will do redundant work. We will print a warning - if this method of sharding is selected. + If the attempt to shard by file is unsuccessful (i.e. the dataset is not + read from files), we will shard the dataset evenly at the end by + appending a `.shard` operation to the end of the processing pipeline. This + will cause the entire preprocessing pipeline for all the data to be run on + every worker, and each worker will do redundant work. We will print a + warning if this route is selected. - You can disable dataset sharding across workers using the - `auto_shard_policy` option in `tf.data.experimental.DistributeOptions`. - - Within each worker, we will also split the data among all the worker - devices (if more than one a present), and this will happen even if - multi-worker sharding is disabled using the method above. + As mentioned before, within each worker, we will also split the data among + all the worker devices (if more than one a present). This will happen + even if multi-worker sharding is disabled. If the above batch splitting and dataset sharding logic is undesirable, - please use `experimental_distribute_datasets_from_function` instead, which - does not do any automatic splitting or sharding. 
+ please use + `tf.distribute.Strategy.experimental_distribute_datasets_from_function` + instead, which does not do any automatic splitting or sharding. - You can also use the `element_spec` property of the distributed dataset - returned by this API to query the `tf.TypeSpec` of the elements returned + You can also use the `element_spec` property of the + `tf.distribute.DistributedDataset` instance returned by this API to query + the `tf.TypeSpec` of the elements returned by the iterator. This can be used to set the `input_signature` property of a `tf.function`. @@ -938,12 +945,21 @@ class StrategyBase(object): # train model with inputs return - # Iterate over the distributed dataset + # Iterate over the `tf.distribute.DistributedDataset` for x in dist_dataset: # process dataset elements strategy.run(train_step, args=(x,)) ``` + Note: The order in which the data is processed by the workers when using + `tf.distribute.Strategy.experimental_distribute_dataset` or + `tf.distribute.Strategy.experimental_distribute_datasets_from_function` is + not guaranteed. This is typically required if you are using + `tf.distribute` to scale prediction. You can however insert an index for + each element in the batch and order outputs accordingly. Refer to [this + snippet](https://www.tensorflow.org/tutorials/distribute/input#caveats) + for an example of how to order outputs. + Args: dataset: `tf.data.Dataset` that will be sharded across all replicas using the rules stated above. @@ -951,8 +967,7 @@ class StrategyBase(object): dataset is distributed. Returns: - A "distributed `Dataset`", which acts like a `tf.data.Dataset` except - it produces "per-replica" values. + A `tf.distribute.DistributedDataset`. """ return self._extended._experimental_distribute_dataset(dataset, options) # pylint: disable=protected-access @@ -978,10 +993,10 @@ class StrategyBase(object): The `dataset_fn` should take an `tf.distribute.InputContext` instance where information about batching and input replication can be accessed. - You can also use the `element_spec` property of the distributed dataset - returned by this API to query the `tf.TypeSpec` of the elements returned - by the iterator. This can be used to set the `input_signature` property - of a `tf.function`. + You can also use the `element_spec` property of the + `tf.distribute.DistributedDataset` returned by this API to query the + `tf.TypeSpec` of the elements returned by the iterator. This can be used to + set the `input_signature` property of a `tf.function`. >>> global_batch_size = 8 >>> def dataset_fn(input_context): @@ -1010,6 +1025,16 @@ class StrategyBase(object): the global batch size. This may be computed using `input_context.get_per_replica_batch_size`. + + Note: The order in which the data is processed by the workers when using + `tf.distribute.Strategy.experimental_distribute_dataset` or + `tf.distribute.Strategy.experimental_distribute_datasets_from_function` is + not guaranteed. This is typically required if you are using + `tf.distribute` to scale prediction. You can however insert an index for + each element in the batch and order outputs accordingly. Refer to [this + snippet](https://www.tensorflow.org/tutorials/distribute/input#caveats) + for an example of how to order outputs. + Args: dataset_fn: A function taking a `tf.distribute.InputContext` instance and returning a `tf.data.Dataset`. @@ -1017,8 +1042,7 @@ class StrategyBase(object): dataset is distributed. 
Returns: - A "distributed `Dataset`", which acts like a `tf.data.Dataset` except - it produces "per-replica" values. + A `tf.distribute.DistributedDataset`. """ return self._extended._experimental_distribute_datasets_from_function( # pylint: disable=protected-access dataset_fn, options) @@ -1028,7 +1052,9 @@ class StrategyBase(object): Executes ops specified by `fn` on each replica. If `args` or `kwargs` have `tf.distribute.DistributedValues`, such as those produced by a - "distributed `Dataset`" or `experimental_distribute_values_from_function` + `tf.distribute.DistributedDataset` from + `tf.distribute.Strategy.experimental_distribute_dataset` or + `tf.distribute.Strategy.experimental_distribute_datasets_from_function`, when `fn` is executed on a particular replica, it will be executed with the component of `tf.distribute.DistributedValues` that correspond to that replica. diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 85e2dac1c1d..ff468af7f87 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import functools import sys @@ -52,6 +53,8 @@ from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.types import distribute as distribute_types from tensorflow.python.util import nest from tensorflow.python.util.deprecation import deprecated +from tensorflow.python.util.tf_export import tf_export +from tensorflow.tools.docs import doc_controls def get_distributed_dataset(dataset, @@ -138,6 +141,321 @@ def get_distributed_datasets_from_function(dataset_fn, strategy) +@tf_export("distribute.DistributedIterator", v1=[]) +class DistributedIteratorInterface(collections.Iterator, + distribute_types.Iterator): + """An iterator over `tf.distribute.DistributedDataset`. + + `tf.distribute.DistributedIterator` is the primary mechanism for enumerating + elements of a `tf.distribute.DistributedDataset`. It supports the Python + Iterator protocol, which means it can be iterated over using a for-loop or by + fetching individual elements explicitly via `get_next()`. + + You can create a `tf.distribute.DistributedIterator` by calling `iter` on + a `tf.distribute.DistributedDataset` or creating a python loop over a + `tf.distribute.DistributedDataset`. + + Visit the [tutorial](https://www.tensorflow.org/tutorials/distribute/input) + on distributed input for more examples and caveats. + """ + + def get_next(self): + """Returns the next input from the iterator for all replicas. + + Example use: + + >>> strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.range(100).batch(2) + >>> dist_dataset = strategy.experimental_distribute_dataset(dataset) + >>> dist_dataset_iterator = iter(dist_dataset) + >>> @tf.function + ... def one_step(input): + ... return input + >>> step_num = 5 + >>> for _ in range(step_num): + ... strategy.run(one_step, args=(dist_dataset_iterator.get_next(),)) + >>> strategy.experimental_local_results(dist_dataset_iterator.get_next()) + (,) + + The above example corresponds to the case where you have only one device. 
If + you have two devices, for example, + ```python + strategy = tf.distribute.MirroredStrategy(['/gpu:0', '/gpu:1']) + ``` + Then the final line will print out: + ```python + (, + ) + ``` + + Returns: + A single `tf.Tensor` or a `tf.distribute.DistributedValues` which contains + the next input for all replicas. + + Raises: + `tf.errors.OutOfRangeError`: If the end of the iterator has been reached. + """ + raise NotImplementedError( + "DistributedIterator.get_next() must be implemented in descendants.") + + @property + def element_spec(self): + # pylint: disable=line-too-long + """The type specification of an element of `tf.distribute.DistributedIterator`. + + Example usage: + + >>> global_batch_size = 16 + >>> strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.from_tensors(([1.],[2])).repeat(100).batch(global_batch_size) + >>> distributed_iterator = iter(strategy.experimental_distribute_dataset(dataset)) + >>> distributed_iterator.element_spec + (TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.int32, name=None)) + + The above example corresponds to the case where you have only one device. If + you have two devices, for example, + ```python + strategy = tf.distribute.MirroredStrategy(['/gpu:0', '/gpu:1']) + ``` + Then the final line will print out: + ```python + (PerReplicaSpec(TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)), + PerReplicaSpec(TensorSpec(shape=(None, 1), dtype=tf.int32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.int32, name=None))) + ``` + + Returns: + A nested structure of `tf.TypeSpec` objects matching the structure of an + element of this `tf.distribute.DistributedIterator`. This returned value + is typically a `tf.distribute.DistributedValues` object and specifies the + `tf.TensorSpec` of individual components. + """ + raise NotImplementedError( + "DistributedIterator.element_spec() must be implemented in descendants") + + +@tf_export("distribute.DistributedDataset", v1=[]) +class DistributedDatasetInterface(collections.Iterable, + distribute_types.Iterable): + # pylint: disable=line-too-long + """Represents a dataset distributed among devices and machines. + + A `tf.distribute.DistributedDataset` could be thought of as a "distributed" + dataset. When you use `tf.distribute` API to scale training to multiple + devices or machines, you also need to distribute the input data, which leads + to a `tf.distribute.DistributedDataset` instance, instead of a + `tf.data.Dataset` instance in the non-distributed case. In TF 2.x, + `tf.distribute.DistributedDataset` objects are Python iterables. + + Note: `tf.distribute.DistributedDataset` instances are *not* of type + `tf.data.Dataset`. It only supports two usages we will mention below: + iteration and `element_spec`. We don't support any other APIs to transform or + inspect the dataset. + + There are two APIs to create a `tf.distribute.DistributedDataset` object: + `tf.distribute.Strategy.experimental_distribute_dataset(dataset)`and + `tf.distribute.Strategy.experimental_distribute_datasets_from_function(dataset_fn)`. + *When to use which?* When you have a `tf.data.Dataset` instance, and the + regular batch splitting (i.e. re-batch the input `tf.data.Dataset` instance + with a new batch size that is equal to the global batch size divided by the + number of replicas in sync) and autosharding (i.e. 
the + `tf.data.experimental.AutoShardPolicy` options) work for you, use the former + API. Otherwise, if you are *not* using a canonical `tf.data.Dataset` instance, + or you would like to customize the batch splitting or sharding, you can wrap + these logic in a `dataset_fn` and use the latter API. Both API handles + prefetch to device for the user. For more details and examples, follow the + links to the APIs. + + + There are two main usages of a `DistributedDataset` object: + + 1. Iterate over it to generate the input for a single device or multiple + devices, which is a `tf.distribute.DistributedValues` instance. To do this, + you can: + + * use a pythonic for-loop construct: + + >>> global_batch_size = 2 + >>> strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.from_tensors(([1.],[1.])).repeat(4).batch(global_batch_size) + >>> dist_dataset = strategy.experimental_distribute_dataset(dataset) + >>> @tf.function + ... def train_step(input): + ... features, labels = input + ... return labels - 0.3 * features + >>> for x in dist_dataset: + ... # train_step trains the model using the dataset elements + ... loss = strategy.run(train_step, args=(x,)) + ... print("Loss is", loss) + Loss is tf.Tensor( + [[0.7] + [0.7]], shape=(2, 1), dtype=float32) + Loss is tf.Tensor( + [[0.7] + [0.7]], shape=(2, 1), dtype=float32) + + Placing the loop inside a `tf.function` will give a performance boost. + However `break` and `return` are currently not supported if the loop is + placed inside a `tf.function`. We also don't support placing the loop + inside a `tf.function` when using + `tf.distribute.experimental.MultiWorkerMirroredStrategy` or + `tf.distribute.experimental.TPUStrategy` with multiple workers. + + * use `__iter__` to create an explicit iterator, which is of type + `tf.distribute.DistributedIterator` + + >>> global_batch_size = 4 + >>> strategy = tf.distribute.MirroredStrategy() + >>> train_dataset = tf.data.Dataset.from_tensors(([1.],[1.])).repeat(50).batch(global_batch_size) + >>> train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset) + >>> @tf.function + ... def distributed_train_step(dataset_inputs): + ... def train_step(input): + ... loss = tf.constant(0.1) + ... return loss + ... per_replica_losses = strategy.run(train_step, args=(dataset_inputs,)) + ... return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,axis=None) + >>> EPOCHS = 2 + >>> STEPS = 3 + >>> for epoch in range(EPOCHS): + ... total_loss = 0.0 + ... num_batches = 0 + ... dist_dataset_iterator = iter(train_dist_dataset) + ... for _ in range(STEPS): + ... total_loss += distributed_train_step(next(dist_dataset_iterator)) + ... num_batches += 1 + ... average_train_loss = total_loss / num_batches + ... template = ("Epoch {}, Loss: {}") + ... print (template.format(epoch+1, average_train_loss)) + Epoch 1, Loss: 0.10000000894069672 + Epoch 2, Loss: 0.10000000894069672 + + + To achieve a performance improvement, you can also wrap the `strategy.run` + call with a `tf.range` inside a `tf.function`. This runs multiple steps in a + `tf.function`. Autograph will convert it to a `tf.while_loop` on the worker. + However, it is less flexible comparing with running a single step inside + `tf.function`. For example, you cannot run things eagerly or arbitrary + python code within the steps. + + + 2. Inspect the `tf.TypeSpec` of the data generated by `DistributedDataset`. + + `tf.distribute.DistributedDataset` generates + `tf.distribute.DistributedValues` as input to the devices. 
If you pass the + input to a `tf.function` and would like to specify the shape and type of + each Tensor argument to the function, you can pass a `tf.TypeSpec` object to + the `input_signature` argument of the `tf.function`. To get the + `tf.TypeSpec` of the input, you can use the `element_spec` property of the + `tf.distribute.DistributedDataset` or `tf.distribute.DistributedIterator` + object. + + For example: + + >>> global_batch_size = 2 + >>> epochs = 1 + >>> steps_per_epoch = 1 + >>> mirrored_strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.from_tensors(([2.])).repeat(100).batch(global_batch_size) + >>> dist_dataset = mirrored_strategy.experimental_distribute_dataset(dataset) + >>> @tf.function(input_signature=[dist_dataset.element_spec]) + ... def train_step(per_replica_inputs): + ... def step_fn(inputs): + ... return tf.square(inputs) + ... return mirrored_strategy.run(step_fn, args=(per_replica_inputs,)) + >>> for _ in range(epochs): + ... iterator = iter(dist_dataset) + ... for _ in range(steps_per_epoch): + ... output = train_step(next(iterator)) + ... print(output) + tf.Tensor( + [[4.] + [4.]], shape=(2, 1), dtype=float32) + + + Visit the [tutorial](https://www.tensorflow.org/tutorials/distribute/input) + on distributed input for more examples and caveats. + """ + + def __iter__(self): + """Creates an iterator for the `tf.distribute.DistributedDataset`. + + The returned iterator implements the Python Iterator protocol. + + Example usage: + + >>> global_batch_size = 4 + >>> strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4]).repeat().batch(global_batch_size) + >>> distributed_iterator = iter(strategy.experimental_distribute_dataset(dataset)) + >>> print(next(distributed_iterator)) + tf.Tensor([1 2 3 4], shape=(4,), dtype=int32) + + + The above example corresponds to the case where you have only one device. If + you have two devices, for example, + ```python + strategy = tf.distribute.MirroredStrategy(['/gpu:0', '/gpu:1']) + ``` + Then the final line will print out: + ```python + PerReplica:{ + 0: tf.Tensor([1 2], shape=(2,), dtype=int32), + 1: tf.Tensor([3 4], shape=(2,), dtype=int32) + } + ``` + + Returns: + An `tf.distribute.DistributedIterator` instance for the given + `tf.distribute.DistributedDataset` object to enumerate over the + distributed data. + """ + raise NotImplementedError("Must be implemented in descendants") + + @property + def element_spec(self): + """The type specification of an element of this `tf.distribute.DistributedDataset`. + + Example usage: + + >>> global_batch_size = 16 + >>> strategy = tf.distribute.MirroredStrategy() + >>> dataset = tf.data.Dataset.from_tensors(([1.],[2])).repeat(100).batch(global_batch_size) + >>> dist_dataset = strategy.experimental_distribute_dataset(dataset) + >>> dist_dataset.element_spec + (TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.int32, name=None)) + + The above example corresponds to the case where you have only one device. 
If + you have two devices, for example, + ```python + strategy = tf.distribute.MirroredStrategy(['/gpu:0', '/gpu:1']) + ``` + Then the final line will print out: + ```python + (PerReplicaSpec(TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)), + PerReplicaSpec(TensorSpec(shape=(None, 1), dtype=tf.int32, name=None), + TensorSpec(shape=(None, 1), dtype=tf.int32, name=None))) + ``` + + Returns: + A nested structure of `tf.TypeSpec` objects matching the structure of an + element of this `tf.distribute.DistributedDataset`. This returned value is + typically a `tf.distribute.DistributedValues` object and specifies the + `tf.TensorSpec` of individual components. + """ + raise NotImplementedError( + "DistributedDataset.element_spec must be implemented in descendants.") + + @doc_controls.do_not_generate_docs + def reduce(self, initial_state, reduce_func): + raise NotImplementedError( + "DistributedDataset.reduce must be implemented in descendants.") + + class InputWorkers(object): """A 1-to-many mapping from input worker devices to compute devices.""" @@ -259,9 +577,10 @@ def _get_static_shape(iterators): return static_shape -class DistributedIteratorBase(distribute_types.Iterator): +class DistributedIteratorBase(DistributedIteratorInterface): """Common implementation for all input iterators.""" + # pylint: disable=super-init-not-called def __init__(self, input_workers, iterators, strategy): static_shape = _get_static_shape(iterators) @@ -548,9 +867,10 @@ class DistributedIterator(DistributedIteratorBase, self._strategy) -class _IterableInput(distribute_types.Iterable): +class _IterableInput(DistributedDatasetInterface): """Base class for iterable inputs for distribution strategies.""" + # pylint: disable=super-init-not-called def __init__(self, input_workers): assert isinstance(input_workers, InputWorkers) self._input_workers = input_workers diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index c6e0eb34a7b..d0ed27c69de 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -75,20 +75,21 @@ def _on_write_update_replica(var, update_fn, value, **kwargs): class DistributedValues(object): """Base class for representing distributed values. - A subclass instance of DistributedValues is created when creating variables - within a distribution strategy, iterating a `tf.Dataset` or through - `strategy.run`. This base class should never be instantiated - directly. DistributedValues contains a value per replica. Depending on + A subclass instance of `tf.distribute.DistributedValues` is created when + creating variables within a distribution strategy, iterating a + `tf.distribute.DistributedDataset` or through `tf.distribute.Strategy.run`. + This base class should never be instantiated directly. + `tf.distribute.DistributedValues` contains a value per replica. Depending on the subclass, the values could either be synced on update, synced on demand, or never synced. - DistributedValues can be reduced to obtain single value across replicas, - as input into `run` or the per replica values inspected - using `experimental_local_results`. + `tf.distribute.DistributedValues` can be reduced to obtain single value across + replicas, as input into `tf.distribute.Strategy.run` or the per-replica values + inspected using `tf.distribute.Strategy.experimental_local_results`. Example usage: - 1. Created from Dataset: + 1. 
Created from a `tf.distribute.DistributedDataset`: >>> strategy = tf.distribute.MirroredStrategy() >>> dataset = tf.data.Dataset.from_tensor_slices([5., 6., 7., 8.]).batch(2) diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-dataset.pbtxt new file mode 100644 index 00000000000..a7b229c6c7c --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-dataset.pbtxt @@ -0,0 +1,16 @@ +path: "tensorflow.distribute.DistributedDataset" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "element_spec" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "reduce" + argspec: "args=[\'self\', \'initial_state\', \'reduce_func\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt new file mode 100644 index 00000000000..f712d9058b9 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt @@ -0,0 +1,16 @@ +path: "tensorflow.distribute.DistributedIterator" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "element_spec" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "get_next" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt index 19d83909120..009cb7fe400 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt @@ -4,6 +4,14 @@ tf_module { name: "CrossDeviceOps" mtype: "" } + member { + name: "DistributedDataset" + mtype: "" + } + member { + name: "DistributedIterator" + mtype: "" + } member { name: "DistributedValues" mtype: "" From a50001edf9fea03069771934e94d3b3d32ff6a19 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Jun 2020 18:30:26 -0700 Subject: [PATCH 0460/1390] Internal change PiperOrigin-RevId: 317008433 Change-Id: I5146e28d2f77c7daab4bd023f2826ec8323cff02 --- tensorflow/tensorflow.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index f97363a919e..5da15b0a4d6 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1817,13 +1817,13 @@ def tf_custom_op_library_additional_deps_impl(): # tf_collected_deps will be the union of the deps of the current target # and the tf_collected_deps of the dependencies of this target. def _collect_deps_aspect_impl(target, ctx): - alldeps = depset() + direct, transitive = [], [] if hasattr(ctx.rule.attr, "deps"): for dep in ctx.rule.attr.deps: - alldeps = depset([dep.label], transitive = [alldeps]) + direct.append(dep.label) if hasattr(dep, "tf_collected_deps"): - alldeps = depset(transitive = [alldeps, dep.tf_collected_deps]) - return struct(tf_collected_deps = alldeps) + transitive.append(dep.tf_collected_deps) + return struct(tf_collected_deps = depset(direct = direct, transitive = transitive)) collect_deps_aspect = aspect( attr_aspects = ["deps"], From ae20f08da9ce9e7336ab97cc9f77ce7a1c13ad12 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Wed, 17 Jun 2020 18:43:41 -0700 Subject: [PATCH 0461/1390] Properly support nest phi reduction in reverse order. 
If we replaced node B with node C, then replace node A with node B, we should redirect node A to node C instead. PiperOrigin-RevId: 317010443 Change-Id: I165496a3d1f6571815bfd61d096e26cbba39125a --- .../xla/service/hlo_dataflow_analysis.cc | 2 ++ .../compiler/xla/service/hlo_phi_graph.cc | 25 ++++++++++++++++++- .../compiler/xla/service/hlo_phi_graph.h | 2 +- .../xla/service/hlo_phi_graph_test.cc | 25 +++++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index f19882c9347..a46d20d5808 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -1007,6 +1007,8 @@ void HloDataflowAnalysis::OptimizePhiValues() { HloValue::Id phi_id = values[0]->id(); HloValue::Id new_id = phi_graph_.FindOptimizedValue(phi_id); if (new_id != phi_id) { + VLOG(1) << "Replacing " << values[0]->ToString() << " with " + << GetValue(new_id).ToString(); value_set->Clear(); const HloValue& new_value = GetValue(new_id); value_set->AddValue(&new_value); diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph.cc b/tensorflow/compiler/xla/service/hlo_phi_graph.cc index 9b69771dab2..a2cba3d1bff 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph.cc +++ b/tensorflow/compiler/xla/service/hlo_phi_graph.cc @@ -20,10 +20,11 @@ limitations under the License. namespace xla { HloValue::Id PhiGraph::GetOptimizedId(const HloValue& value) { Node* node = value_id_to_node_[value.id()]; + CHECK(!node->mark_as_dead); return node->value_id; } -// Returns true if the input to a hlo value is the same as `inputs`. +// Returns true if the inputs to a hlo value are the same as `inputs`. bool PhiGraph::InputsEqualTo(const HloValue& value, absl::Span inputs) { auto iter = value_id_to_node_.find(value.id()); @@ -42,6 +43,7 @@ bool PhiGraph::InputsEqualTo(const HloValue& value, HloValue::Id PhiGraph::FindOptimizedValue(const HloValue::Id id) { auto iter = value_id_to_node_.find(id); CHECK(iter != value_id_to_node_.end()); + CHECK(!iter->second->mark_as_dead); return iter->second->value_id; } @@ -66,6 +68,17 @@ PhiGraph::Node* PhiGraph::CreateOrReuseNode(const HloValue& value) { void PhiGraph::ReplaceNodeWith(PhiGraph::Node* node, PhiGraph::Node* replace) { // Update users. CHECK(node->is_phi); + if (node->mark_as_dead) { + // The node has already been replaced with another. + return; + } + if (replace->mark_as_dead) { + // The node we are placing with has already been replaced with another node. + auto iter = value_id_to_node_.find(replace->value_id); + CHECK(iter != value_id_to_node_.end()); + return ReplaceNodeWith(node, iter->second); + } + CHECK(!replace->mark_as_dead); for (Node* user : node->users) { absl::c_replace(user->operands, node, replace); } @@ -74,6 +87,7 @@ void PhiGraph::ReplaceNodeWith(PhiGraph::Node* node, PhiGraph::Node* replace) { for (Node* operand : node->operands) { absl::c_replace(operand->users, node, replace); } + for (HloValue::Id value_id : node_to_value_id_[node]) { CHECK(value_id_to_node_.contains(value_id)); value_id_to_node_[value_id] = replace; @@ -115,6 +129,8 @@ std::string PhiGraph::ToString() { } void PhiGraph::Optimize() { + VLOG(2) << "Optimizing phi graph:"; + XLA_VLOG_LINES(2, ToString()); // Set up users for each node. 
for (auto& node : node_storage_) { for (Node* input : node->operands) { @@ -141,6 +157,8 @@ void PhiGraph::Optimize() { Node* node_ptr = node.get(); + VLOG(2) << "Optimizing: " << node_ptr->value_id; + CHECK_GE(node_ptr->operands.size(), 1); // Remove self-referencing ids from users and operands. @@ -167,6 +185,9 @@ void PhiGraph::Optimize() { [&](Node* elem) { return elem == node_ptr->operands[0]; }); if (all_inputs_are_same) { + VLOG(1) << "All inputs to node " << node_ptr->value_id + << " are the same, replacing it with " + << node_ptr->operands[0]->value_id; ReplaceNodeWith(node_ptr, node_ptr->operands[0]); changed = true; continue; @@ -223,6 +244,8 @@ void PhiGraph::Optimize() { CHECK_EQ(node, non_phi); continue; } + VLOG(1) << "Replace node " << node->value_id + << " in the closure with node " << non_phi->value_id; ReplaceNodeWith(node, non_phi); changed = true; } diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph.h b/tensorflow/compiler/xla/service/hlo_phi_graph.h index a0eb994438e..ca0d5c5009c 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph.h +++ b/tensorflow/compiler/xla/service/hlo_phi_graph.h @@ -90,7 +90,7 @@ class PhiGraph { // to that phi. absl::flat_hash_map> node_to_value_id_; - // A mapping between a HloValue and node in the phi graph. + // A mapping from a HloValue to node in the phi graph. absl::flat_hash_map value_id_to_node_; std::vector> node_storage_; }; diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc b/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc index 41f0454fe55..ee7300b160b 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc +++ b/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc @@ -82,5 +82,30 @@ TEST_F(PhiGraphTest, CircularPhi) { EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(C.id())); } +TEST_F(PhiGraphTest, NestedPhiReduction) { + // def A = phi(B, C) + // def B = phi(C, E) + // def C = phi(A, B) + // def D = non-phi + // def E = Phi(D, D) + // 1. Replace E with D + // 2. Replace A B and C with E/D + PhiGraph phi_graph; + HloValue A = NewHloValue(true); + HloValue B = NewHloValue(true); + HloValue C = NewHloValue(true); + HloValue D = NewHloValue(false); + HloValue E = NewHloValue(true); + phi_graph.RegisterPhi(A, {&B, &C}); + phi_graph.RegisterPhi(B, {&E, &C}); + phi_graph.RegisterPhi(C, {&A, &B}); + phi_graph.RegisterPhi(E, {&D, &D}); + phi_graph.Optimize(); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(A.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(B.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(C.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(E.id())); +} + } // namespace } // namespace xla From 7ae27e344cf9a501c89d785cfe4eb109a9848b47 Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Wed, 17 Jun 2020 20:04:47 -0700 Subject: [PATCH 0462/1390] TFL: selective registration: Disable linkopts not available on Windows. 
PiperOrigin-RevId: 317019750 Change-Id: Ibce23c534a0b8fe15ada085a57aec6f87db71a9b --- tensorflow/lite/build_def.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 5e487395355..e6c92691b15 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -768,10 +768,10 @@ def tflite_custom_cc_library(name, models = [], srcs = [], deps = [], visibility name = name, srcs = real_srcs, copts = tflite_copts(), - linkopts = [ - "-lm", - "-ldl", - ], + linkopts = select({ + "//tensorflow:windows": [], + "//conditions:default": ["-lm", "-ldl"], + }), deps = depset([ "//tensorflow/lite:framework", "//tensorflow/lite/kernels:builtin_ops", From 697a0a4fe6d792b57a280e2f22fd23fe219e9d81 Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Wed, 17 Jun 2020 20:43:17 -0700 Subject: [PATCH 0463/1390] Internal change PiperOrigin-RevId: 317023916 Change-Id: I81311283fae6c28cdee2d7c87c8d062d32725bef --- third_party/mlir/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 476b8566265..db75b27e78b 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2868,9 +2868,9 @@ cc_library( ":SideEffects", ":StandardOps", ":Support", - "//third_party/vulkan_loader", "@llvm-project//llvm:Support", "@vulkan_headers", + "@vulkan_sdk//:sdk", ], ) From 4f341bb742718721563ce6dccb965c85a1fbdcf5 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Wed, 17 Jun 2020 21:01:24 -0700 Subject: [PATCH 0464/1390] Add Bessel functions to the public api: - tf.math.special.bessel_i0 - tf.math.special.bessel_i0e - tf.math.special.bessel_i1 - tf.math.special.bessel_i1e - tf.math.special.bessel_k0 - tf.math.special.bessel_k0e - tf.math.special.bessel_k1 - tf.math.special.bessel_k1e - tf.math.special.bessel_j0 - tf.math.special.bessel_j1 - tf.math.special.bessel_y0 - tf.math.special.bessel_y1 PiperOrigin-RevId: 317025879 Change-Id: I5c4407eda6bef0d1659b7a566979c7dbbad4ad83 --- tensorflow/compiler/tf2xla/python/xla.py | 5 +- .../api_def/base_api/api_def_BesselI0.pbtxt | 4 + .../api_def/base_api/api_def_BesselI0e.pbtxt | 8 +- .../api_def/base_api/api_def_BesselI1.pbtxt | 4 + .../api_def/base_api/api_def_BesselI1e.pbtxt | 8 +- .../api_def/base_api/api_def_BesselJ0.pbtxt | 4 + .../api_def/base_api/api_def_BesselJ1.pbtxt | 4 + .../api_def/base_api/api_def_BesselK0.pbtxt | 4 + .../api_def/base_api/api_def_BesselK0e.pbtxt | 4 + .../api_def/base_api/api_def_BesselK1.pbtxt | 4 + .../api_def/base_api/api_def_BesselK1e.pbtxt | 4 + .../api_def/base_api/api_def_BesselY0.pbtxt | 4 + .../api_def/base_api/api_def_BesselY1.pbtxt | 4 + .../python_api/api_def_BesselI0e.pbtxt | 6 - .../python_api/api_def_BesselI1e.pbtxt | 6 - tensorflow/core/kernels/cwise_op_bessel.cc | 29 -- tensorflow/core/kernels/cwise_ops.h | 6 - .../special_math/special_math_op_bessel.cc | 78 +++++ .../special_math_op_gpu_bessel.cu.cc} | 16 +- .../special_math/special_math_op_misc_impl.h | 38 +++ tensorflow/core/ops/math_ops.cc | 4 - tensorflow/core/ops/special_math_ops.cc | 49 ++- .../optimization/map_vectorization_test.py | 6 +- .../eager/pywrap_gradient_exclusions.cc | 6 +- .../kernel_tests/cwise_ops_unary_test.py | 20 +- tensorflow/python/ops/math_grad.py | 131 ++++++++- .../python/ops/parallel_for/math_test.py | 4 +- tensorflow/python/ops/parallel_for/pfor.py | 14 +- tensorflow/python/ops/signal/window_ops.py | 5 +- tensorflow/python/ops/sparse_ops.py | 6 +- 
tensorflow/python/ops/special_math_ops.py | 278 +++++++++++++++++- .../python/ops/special_math_ops_test.py | 234 ++++++++++++++- .../golden/v1/tensorflow.math.special.pbtxt | 48 +++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 40 +++ .../golden/v2/tensorflow.math.special.pbtxt | 48 +++ .../api/golden/v2/tensorflow.raw_ops.pbtxt | 40 +++ 36 files changed, 1028 insertions(+), 145 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI1.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselJ0.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselJ1.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK0.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK0e.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK1.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK1e.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselY0.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselY1.pbtxt delete mode 100644 tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt delete mode 100644 tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt delete mode 100644 tensorflow/core/kernels/cwise_op_bessel.cc create mode 100644 tensorflow/core/kernels/special_math/special_math_op_bessel.cc rename tensorflow/core/kernels/{cwise_op_bessel.cu.cc => special_math/special_math_op_gpu_bessel.cu.cc} (58%) diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index c59c47e92fb..0ebca2d546f 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -37,6 +37,7 @@ from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import special_math_ops # TODO(phawkins): provide wrappers for all XLA operators. Currently the missing # ops include: @@ -103,8 +104,8 @@ sign = _unary_op(math_ops.sign) tanh = _unary_op(math_ops.tanh) # Bessel -bessel_i0e = _unary_op(math_ops.bessel_i0e) -bessel_i1e = _unary_op(math_ops.bessel_i1e) +bessel_i0e = _unary_op(special_math_ops.bessel_i0e) +bessel_i1e = _unary_op(special_math_ops.bessel_i1e) # Binary operators diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt new file mode 100644 index 00000000000..2c47960429c --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "BesselI0" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt index 08313cebb99..7965af4916e 100644 --- a/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt @@ -1,10 +1,4 @@ op { graph_op_name: "BesselI0e" - summary: "Computes the Bessel i0e function of `x` element-wise." - description: <
{ - StringRef getName() final { return "Table"; } -}; - } // namespace ResourceEffects } // namespace TF } // namespace mlir From 64e1b489bb63c4b7ceb85bfa846ca699036561e8 Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Wed, 17 Jun 2020 23:54:57 -0700 Subject: [PATCH 0477/1390] Enable flex delegate on tensorflow.lite.Interpreter Python package Usually, flex delegate is enabled by symbol override of AcquireFlexDelegate() function. But this approach doesn't work well with shared library. Since pywrap_tensorflow_internal.so is available for tensorflow PIP, I've made the following changes to enable flex delegate. - Included flex delegate module to the pywrap_tensorflow_internal.so. This file already contains most TF internal logic and having TFLite flex delegate impacts about 72K to the output. - Added new function of TF_AcquireFlexDelegate() in the delegate module. - Updated logic in AcquireFlexDelegate() of interpreter_builder.cc to check the availability of pywrap_tensorflow_internal.so and lookup the TF_AcquireFlexDelegate() symbol to enable flex delegate. Also updated python/lite_flex_test.py since flex delegate is supported with Python API PiperOrigin-RevId: 317044994 Change-Id: Ic5e953f4a675b3f5360a4c7d607568193103711a --- tensorflow/lite/delegates/flex/delegate.cc | 7 +++ tensorflow/lite/interpreter_builder.cc | 17 ++++++ tensorflow/lite/python/BUILD | 5 +- tensorflow/lite/python/lite_flex_test.py | 61 +++++++++++++--------- tensorflow/python/BUILD | 7 ++- 5 files changed, 68 insertions(+), 29 deletions(-) diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc index 4741bddc2f5..b8b0d4e6d01 100644 --- a/tensorflow/lite/delegates/flex/delegate.cc +++ b/tensorflow/lite/delegates/flex/delegate.cc @@ -136,3 +136,10 @@ TfLiteStatus FlexDelegate::CopyFromBufferHandle( } } // namespace tflite + +// Exported C interface function which is used by AcquireFlexDelegate() at +// interpreter_build.cc. To export the function name globally, the function name +// must be matched with patterns in tf_version_script.lds +extern "C" tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() { + return tflite::AcquireFlexDelegate(); +} diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index 43d81ef0770..d73b298e595 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/interpreter_builder.h" +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) +#include +#endif #include #include #include @@ -114,6 +117,20 @@ const char* kEmptyTensorName = ""; // For flex delegate, see also the strong override in // lite/delegates/flex/delegate.cc. TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() { +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(_WIN32) + // If _pywrap_tensorflow_internal.so is available, use + // TF_AcquireFlexDelegate() to initialize flex delegate. 
+ void* lib_tf_internal = + dlopen("_pywrap_tensorflow_internal.so", RTLD_NOW | RTLD_LOCAL); + if (lib_tf_internal) { + auto TF_AcquireFlexDelegate = + reinterpret_cast( + dlsym(lib_tf_internal, "TF_AcquireFlexDelegate")); + if (TF_AcquireFlexDelegate) { + return TF_AcquireFlexDelegate(); + } + } +#endif return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); } diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index d25e7d5ef8d..1b64b7d1042 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -193,9 +193,8 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - # TODO(b/111881877): Enable in oss after resolving op registry issues. - "no_oss", - "no_windows", + "no_mac", # TODO(b/159077703): Enable Python API Flex support on MacOS. + "no_windows", # TODO(b/159077703): Enable Python API Flex support on Windows. ], deps = [ ":lite", diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py index 26bee206d27..ffc157c2128 100644 --- a/tensorflow/lite/python/lite_flex_test.py +++ b/tensorflow/lite/python/lite_flex_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from absl.testing import parameterized +import numpy as np from tensorflow.lite.python import lite from tensorflow.lite.python.interpreter import Interpreter @@ -41,8 +42,7 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): ('DisableMlirConverter', False)) # disable mlir def testFlexMode(self, enable_mlir): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) + in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -54,19 +54,22 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) def testDeprecatedFlags(self): with ops.Graph().as_default(): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) + in_tensor = array_ops.placeholder(shape=[1, 4], dtype=dtypes.float32) out_tensor = in_tensor + in_tensor sess = session.Session() @@ -83,14 +86,18 @@ class FromSessionTest(test_util.TensorFlowTestCase, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. 
interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([[2.0, 4.0, 6.0, 8.0]], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) class FromConcreteFunctionTest(test_util.TensorFlowTestCase, @@ -114,14 +121,18 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase, converter.experimental_new_converter = enable_mlir tflite_model = converter.convert() - # Ensures the model contains TensorFlow ops. - # TODO(nupurgarg): Check values once there is a Python delegate interface. + # Check the model works with TensorFlow ops. interpreter = Interpreter(model_content=tflite_model) - with self.assertRaises(RuntimeError) as error: - interpreter.allocate_tensors() - self.assertIn( - 'Regular TensorFlow ops are not supported by this interpreter.', - str(error.exception)) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + test_input = np.array([4.0], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], test_input) + interpreter.invoke() + + output_details = interpreter.get_output_details() + expected_output = np.array([24.0], dtype=np.float32) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue((expected_output == output_data).all()) if __name__ == '__main__': diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d141b719aef..f53859b2915 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -6058,7 +6058,12 @@ pywrap_tensorflow_macro( "@ngraph_tf//:ngraph_tf", ]) + if_xla_available([ "//tensorflow/compiler/aot:tfcompile_lib", - ]), + ]) + select({ + "//tensorflow:windows": [], # TODO(b/159077703): Enable Flex on Windows + "//conditions:default": [ + "//tensorflow/lite/delegates/flex:delegate", + ], + }), ) # ** Targets for Windows build (start) ** From fabcd8f89cd5975331994049705e15cb75f32e0c Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 18 Jun 2020 00:06:13 -0700 Subject: [PATCH 0478/1390] Update XNNPACK dependency Bring in fix for x86 builds on Android NDK r20 PiperOrigin-RevId: 317046250 Change-Id: I493cf294ea4e51c91a68b9bc8b062f6cecf4da7f --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index fab6faf62d1..78f7e0ce03e 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "dfcc7b2894c5c6bc570d65ff22b371a28e0fcc672e75705cc3f1ccc1264c3f8b", - strip_prefix = "XNNPACK-cac25227b5d8046170f875ad80545696be908ee7", + sha256 = "4af883fea0a6ada106867f29670a6c0b7af74bee85d74a2e04356a670814a3d4", + strip_prefix = "XNNPACK-69a6a7667d96a84c596b0f4e00632b2037c17723", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/cac25227b5d8046170f875ad80545696be908ee7.zip", - 
"https://github.com/google/XNNPACK/archive/cac25227b5d8046170f875ad80545696be908ee7.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", + "https://github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", ], ) From b38d5d1889b398ca45aa7f2e4f9f0184c77c6c55 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Thu, 18 Jun 2020 00:52:32 -0700 Subject: [PATCH 0479/1390] Remove dynamic dimension of strided slice grad if input to strided slice is static. If we slice a dynamic shaped tensor from a static tensor, the output of the gradient should still be static. Unfortunately this cannot be deduced alone by xla, so extra information is needed from the tf2xla bridge. PiperOrigin-RevId: 317051543 Change-Id: I7a8113c47a4aed145dfba7f7d12992ca35a9cf19 --- .../tf2xla/kernels/strided_slice_op.cc | 22 ++++++++++++++++ tensorflow/compiler/xla/client/xla_builder.cc | 23 ++++++++++++++++ tensorflow/compiler/xla/client/xla_builder.h | 6 +++++ .../compiler/xla/client/xla_builder_test.cc | 26 +++++++++++++++++++ 4 files changed, 77 insertions(+) diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 2684c982600..51764018df1 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -350,6 +350,28 @@ class StridedSliceGradOp : public XlaOpKernel { grad = xla::Rev(grad, dimensions_to_reverse); } grad = xla::Pad(grad, zero, padding_config); + + xla::XlaOp dynamic_shape = ctx->Input(0); + xla::Shape grad_shape = ctx->builder()->GetShape(grad).ValueOrDie(); + ctx->set_dynamic_dimension_is_minus_one(true); + std::vector dynamic_size; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &dynamic_size)); + // Input of strided_slice_op has to have the same shape as output. + DCHECK_EQ(grad_shape.rank(), input_shape.dims()); + for (int64 dim = 0; dim < input_shape.dims(); ++dim) { + DCHECK_EQ(grad_shape.dimensions(dim), input_shape.dim_size(dim)); + if (dynamic_size[dim] == -1) { + // Input is a dynamic dimension, set the same dynamic dimension size in + // the output. + auto dim_size = xla::Slice(dynamic_shape, {dim}, {dim + 1}, {1}); + grad = xla::SetDimensionSize(grad, dim_size, dim); + } else if (grad_shape.is_dynamic_dimension(dim)) { + // Input is static but output is dynamic, respect input and remove any + // dynamic dim in the output. + grad = xla::RemoveDynamicDimension(grad, dim); + } + } + ctx->SetOutput(0, grad); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index bfba48862f6..c7b6a7f9491 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2727,6 +2727,25 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64 dimension) { }); } +XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + + Shape shape = *operand_shape; + shape.set_dynamic_dimension(dimension, false); + // Setting an op's dynamic dimension to its static size removes the dynamic + // dimension. 
+ XlaOp static_size = + ConstantR0(this, operand_shape->dimensions(dimension)); + + *instr.mutable_shape() = shape.ToProto(); + instr.add_dimensions(dimension); + return AddInstruction(std::move(instr), HloOpcode::kSetDimensionSize, + {operand, static_size}); + }); +} + XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -3827,4 +3846,8 @@ XlaOp SetDimensionSize(const XlaOp operand, const XlaOp val, int64 dimension) { return operand.builder()->SetDimensionSize(operand, val, dimension); } +XlaOp RemoveDynamicDimension(const XlaOp operand, int64 dimension) { + return operand.builder()->RemoveDynamicDimension(operand, dimension); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index ffa6a7c3439..b8af180b83e 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -704,6 +704,8 @@ class XlaBuilder { XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, absl::Span operands = {}); @@ -1151,6 +1153,7 @@ class XlaBuilder { friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension); friend XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + friend XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); protected: // Returns OK status if the given op was built using this builder. Otherwise, @@ -2149,6 +2152,9 @@ XlaOp GetDimensionSize(XlaOp operand, int64 dimension); XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); +// Returns the same op but with dynamic dimension removed. +XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + // Implementation details below this point. // diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 4fa47077fca..7011c946203 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -556,6 +556,32 @@ TEST_F(XlaBuilderTest, DynamicParameter) { EXPECT_TRUE(param_shape.is_dynamic_dimension(0)); } +TEST_F(XlaBuilderTest, SetDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/set_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(root_shape.is_dynamic_dimension(0)); +} + +TEST_F(XlaBuilderTest, RemoveDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/remove_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + // Dynamic dimension has been removed. 
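+  // p0 starts as a static f32[10], becomes f32[<=10] after SetDimensionSize,
+  // and is back to the static f32[10] here.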
+ EXPECT_FALSE(root_shape.is_dynamic_dimension(0)); +} + TEST_F(XlaBuilderTest, DynamicUnary) { XlaBuilder b(TestName()); Shape tuple_param_shape = ShapeUtil::MakeTupleShape( From 6a8268d74e64e3bf1e445a9b1adb62e04bcdea34 Mon Sep 17 00:00:00 2001 From: Felix Johnny Date: Wed, 3 Jun 2020 08:27:57 +0200 Subject: [PATCH 0480/1390] CMSIS-NN wrapper update for interface changes Background: CMSIS-NN int8 APIs are changed where the pass by value function arguments are replaced by pass by reference struct arguments. The following changes are done in TFL micro 1. Update int8 cmsis-nn's depthwise conv, fully connected and average pooling wrapper files to use the new interface. 2. For the above mentioned operators, updates from the reference implementation (e.g, CalculateOpData() in Prepare() instead of Eval()) are ported as well. --- .../micro/kernels/cmsis-nn/depthwise_conv.cc | 394 ++++++++++-------- .../micro/kernels/cmsis-nn/fully_connected.cc | 221 ++++++---- .../lite/micro/kernels/cmsis-nn/pooling.cc | 195 ++++----- .../lite/micro/tools/make/ext_libs/cmsis.inc | 1 + .../tools/make/third_party_downloads.inc | 4 +- 5 files changed, 442 insertions(+), 373 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc b/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc index 4d398855abc..eb5fad0a08a 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/depthwise_conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,7 +36,6 @@ constexpr int kInputTensor = 0; constexpr int kFilterTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; -constexpr int kMaxChannels = 256; // Depthwise conv is quantized along dimension 3: // https://www.tensorflow.org/lite/performance/quantization_spec @@ -50,14 +49,14 @@ struct OpData { int output_shift; // Per channel output multiplier and shift. - // TODO(b/141139247): Allocate these dynamically when possible. - int32_t per_channel_output_multiplier[kMaxChannels]; - int32_t per_channel_output_shift[kMaxChannels]; - + int32_t* per_channel_output_multiplier; + int32_t* per_channel_output_shift; // The range of the fused activation layer. For example for kNone and // uint8_t these would be 0 and 255. int32_t output_activation_min; int32_t output_activation_max; + // Index to buffer for optimizations if applicable. 
+ int buffer_idx; }; TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, @@ -70,6 +69,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); int unused_output_height, unused_output_width; + // Set buffer index to a reset value + data->buffer_idx = -1; data->padding = ComputePaddingHeightWidth( params->stride_height, params->stride_width, 1, 1, height, width, filter_height, filter_width, params->padding, &unused_output_height, @@ -85,12 +86,12 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* output = GetOutput(context, node, kOutputTensor); int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; - TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( + return tflite::PopulateConvolutionQuantizationParams( context, input, filter, bias, output, params->activation, &data->output_multiplier, &data->output_shift, &data->output_activation_min, &data->output_activation_max, data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift), num_channels)); + reinterpret_cast(data->per_channel_output_shift), num_channels); } return kTfLiteOk; } @@ -98,47 +99,117 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, } // namespace void* Init(TfLiteContext* context, const char* buffer, size_t length) { - void* raw; - context->AllocatePersistentBuffer(context, sizeof(int), &raw); - return raw; + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { + return nullptr; + } + return data; } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + OpData* data = static_cast(node->user_data); auto* params = reinterpret_cast(node->builtin_data); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); - const int filter_width = SizeOfDimension(filter, 2); - const int filter_height = SizeOfDimension(filter, 1); + const TfLiteType data_type = input->type; + int width = SizeOfDimension(input, 2); + int height = SizeOfDimension(input, 1); + int filter_width = SizeOfDimension(filter, 2); + int filter_height = SizeOfDimension(filter, 1); - RuntimeShape input_shape = GetTensorShape(input); - const int input_depth = input_shape.Dims(3); + if (input->type == kTfLiteInt8) { + // Allocate memory for per-channel quantization parameters + const int num_channels = + filter->dims->data[kDepthwiseConvQuantizedDimension]; + // Dynamically allocate per-channel quantization parameters. + TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t), + reinterpret_cast(&data->per_channel_output_multiplier))); + TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t), + reinterpret_cast(&data->per_channel_output_shift))); + TF_LITE_ENSURE_EQ(context, filter->quantization.type, + kTfLiteAffineQuantization); - int* buffer_idx = reinterpret_cast(node->user_data); + // All per-channel quantized tensors need valid zero point and scale arrays. 
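+    // For example, a depthwise filter of shape [1, kH, kW, 8] is expected to
+    // carry either a single scale or 8 per-channel scales, with a zero-point
+    // array of matching length.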
+ const auto* affine_quantization = + reinterpret_cast( + filter->quantization.params); + TF_LITE_ENSURE(context, affine_quantization); + TF_LITE_ENSURE(context, affine_quantization->scale); + TF_LITE_ENSURE(context, affine_quantization->zero_point); + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, + affine_quantization->zero_point->size); + } - *buffer_idx = -1; - node->user_data = buffer_idx; + TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, + filter_width, filter_height, data_type, + data)); - if (params->depth_multiplier == 1) { - const int32_t buf_size = arm_depthwise_conv_s8_opt_get_buffer_size( - input_depth, filter_width, filter_height); + if (input->type == kTfLiteInt8) { + const TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + RuntimeShape input_shape = GetTensorShape(input); + RuntimeShape output_shape = GetTensorShape(output); + RuntimeShape filter_shape = GetTensorShape(filter); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(output_shape, 3, filter_shape, 3); + TFLITE_DCHECK_EQ(batch_size, 1); /* Only batch = 1 is supported */ + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = height; + input_dims.w = width; + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = 1; + filter_dims.h = filter_height; + filter_dims.w = filter_width; + filter_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.padding.h = data->padding.height; + dw_conv_params.padding.w = data->padding.width; + + const int32_t buf_size = arm_depthwise_conv_wrapper_s8_get_buffer_size( + &dw_conv_params, &input_dims, &filter_dims, &output_dims); if (buf_size > 0) { - TF_LITE_ENSURE_STATUS( - context->RequestScratchBufferInArena(context, buf_size, buffer_idx)); + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; } } -#endif return kTfLiteOk; } -TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { +void EvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, const OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -150,8 +221,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, op_params.padding_values.height = data->padding.height; op_params.stride_width = params->stride_width; op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = 1; - op_params.dilation_height_factor = 1; + op_params.dilation_width_factor = params->dilation_width_factor; + 
op_params.dilation_height_factor = params->dilation_height_factor; op_params.depth_multiplier = params->depth_multiplier; op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; @@ -161,106 +232,120 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; } -TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, - OpData* data, const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, - TfLiteTensor* output) { - DepthwiseParams op_params; - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.depth_multiplier = params->depth_multiplier; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = 0; - op_params.output_offset = output->params.zero_point; - // TODO(b/130439627): Use calculated value for clamping. - op_params.quantized_activation_min = std::numeric_limits::min(); - op_params.quantized_activation_max = std::numeric_limits::max(); +void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, + const OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.dilation.h = params->dilation_height_factor; + dw_conv_params.dilation.w = params->dilation_width_factor; + // Call to reference implementation can be removed when dilation is supported + // in the optimized implementations. + if (1 == dw_conv_params.dilation.h && 1 == dw_conv_params.dilation.w) { + dw_conv_params.input_offset = -input->params.zero_point; + dw_conv_params.output_offset = output->params.zero_point; + dw_conv_params.stride.h = params->stride_height; + dw_conv_params.stride.w = params->stride_width; + dw_conv_params.padding.h = data->padding.height; + dw_conv_params.padding.w = data->padding.width; + // TODO(b/130439627): Use calculated value for clamping. 
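+    // For the int8 kernels below this means clamping to the full
+    // [-128, 127] range.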
+ dw_conv_params.activation.min = std::numeric_limits::min(); + dw_conv_params.activation.max = std::numeric_limits::max(); + dw_conv_params.ch_mult = params->depth_multiplier; -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) - RuntimeShape filter_shape = GetTensorShape(filter); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - RuntimeShape input_shape = GetTensorShape(input); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - RuntimeShape output_shape = GetTensorShape(output); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - RuntimeShape bias_shape = GetTensorShape(bias); + cmsis_nn_per_channel_quant_params quant_params; + quant_params.multiplier = data->per_channel_output_multiplier; + quant_params.shift = data->per_channel_output_shift; - if (op_params.depth_multiplier == 1) { - int16_t* buf = nullptr; - auto* buffer_idx = reinterpret_cast(node->user_data); - if (*buffer_idx > -1) { - void* raw = context->GetScratchBuffer(context, *buffer_idx); - buf = reinterpret_cast(raw); + RuntimeShape filter_shape = GetTensorShape(filter); + RuntimeShape input_shape = GetTensorShape(input); + RuntimeShape output_shape = GetTensorShape(output); + RuntimeShape bias_shape = GetTensorShape(bias); + + TFLITE_DCHECK_LE(dw_conv_params.activation.min, + dw_conv_params.activation.max); + + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + + if (GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); } - TF_LITE_ENSURE_EQ( - context, - arm_depthwise_conv_s8_opt( - GetTensorData(input), input_width, input_height, - input_depth, GetTensorData(filter), input_depth, - filter_width, filter_height, op_params.padding_values.width, - op_params.padding_values.height, op_params.stride_width, - op_params.stride_height, GetTensorData(bias), - GetTensorData(output), data->per_channel_output_shift, - data->per_channel_output_multiplier, output_width, output_height, - op_params.output_offset, op_params.input_offset, - op_params.quantized_activation_min, - op_params.quantized_activation_max, op_params.dilation_width_factor, - op_params.dilation_height_factor, buf), - ARM_MATH_SUCCESS); + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(0); + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = nullptr; + /* 'size' is unused */ + ctx.size = 0; + + if (data->buffer_idx > -1) { + ctx.buf = context->GetScratchBuffer(context, data->buffer_idx); + } + + TFLITE_DCHECK_EQ(arm_depthwise_conv_wrapper_s8( + &ctx, &dw_conv_params, &quant_params, &input_dims, + GetTensorData(input), &filter_dims, + GetTensorData(filter), &bias_dims, + GetTensorData(bias), &output_dims, + GetTensorData(output)), + ARM_MATH_SUCCESS); } else { - TF_LITE_ENSURE_EQ( - context, 
- arm_depthwise_conv_s8( - GetTensorData(input), input_width, input_height, - input_depth, GetTensorData(filter), - op_params.depth_multiplier * input_depth, - op_params.depth_multiplier, filter_width, filter_height, - op_params.padding_values.width, op_params.padding_values.height, - op_params.stride_width, op_params.stride_height, - GetTensorData(bias), GetTensorData(output), - data->per_channel_output_shift, data->per_channel_output_multiplier, - output_width, output_height, op_params.output_offset, - op_params.input_offset, op_params.quantized_activation_min, - op_params.quantized_activation_max, op_params.dilation_width_factor, - op_params.dilation_height_factor, nullptr), - ARM_MATH_SUCCESS); + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.depth_multiplier = params->depth_multiplier; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = 0; + op_params.output_offset = output->params.zero_point; + // TODO(b/130439627): Use calculated value for clamping. + op_params.quantized_activation_min = std::numeric_limits::min(); + op_params.quantized_activation_max = std::numeric_limits::max(); + + reference_integer_ops::DepthwiseConvPerChannel( + op_params, data->per_channel_output_multiplier, + data->per_channel_output_shift, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); } -#else -#pragma message( \ - "CMSIS-NN optimization for depthwise_conv not available for this target. 
Using reference kernel.") - - reference_integer_ops::DepthwiseConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); - -#endif - return kTfLiteOk; } -TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, - const TfLiteTensor* filter, const TfLiteTensor* bias, - TfLiteTensor* output) { +void EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, const OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -272,8 +357,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, op_params.padding_values.height = data->padding.height; op_params.stride_width = params->stride_width; op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = 1; - op_params.dilation_height_factor = 1; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; op_params.depth_multiplier = params->depth_multiplier; op_params.quantized_activation_min = data->output_activation_min; op_params.quantized_activation_max = data->output_activation_max; @@ -284,13 +369,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
op_params.output_shift = -data->output_shift; -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) - // optimizations utilize loop unrolling which requires the following power - // of two kernel dimensions - RuntimeShape filter_shape = GetTensorShape(filter); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - if (0 == op_params.depth_multiplier % 2 && 0 == filter_width % 2) { + if (1 == op_params.dilation_width_factor && + 1 == op_params.dilation_height_factor) { + RuntimeShape filter_shape = GetTensorShape(filter); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); RuntimeShape input_shape = GetTensorShape(input); const int input_height = input_shape.Dims(1); const int input_width = input_shape.Dims(2); @@ -310,22 +393,22 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, output_height, op_params.quantized_activation_min, op_params.quantized_activation_max, op_params.output_shift, op_params.output_multiplier); - } else -#endif - - { + } else { tflite::reference_ops::DepthwiseConv( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } - return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + const OpData& data = *(static_cast(node->user_data)); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); const TfLiteTensor* input = GetInput(context, node, kInputTensor); @@ -333,51 +416,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr; - const TfLiteType data_type = input->type; - int width = SizeOfDimension(input, 2); - int height = SizeOfDimension(input, 1); - int filter_width = SizeOfDimension(filter, 2); - int filter_height = SizeOfDimension(filter, 1); - - OpData data; - - // All per-channel quantized tensors need valid zero point and scale arrays. - if (input->type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, filter->quantization.type, - kTfLiteAffineQuantization); - - const auto* affine_quantization = - reinterpret_cast( - filter->quantization.params); - TF_LITE_ENSURE(context, affine_quantization); - TF_LITE_ENSURE(context, affine_quantization->scale); - TF_LITE_ENSURE(context, affine_quantization->zero_point); - TF_LITE_ENSURE( - context, affine_quantization->scale->size == 1 || - affine_quantization->scale->size == - filter->dims->data[kDepthwiseConvQuantizedDimension]); - TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, - affine_quantization->zero_point->size); - } - - TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, - filter_width, filter_height, data_type, - &data)); - // TODO(aselle): Consider whether float conv and quantized conv should be // separate ops to avoid dispatch overhead here. switch (input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: - return EvalFloat(context, node, params, &data, input, filter, bias, - output); + EvalFloat(context, node, params, &data, input, filter, bias, output); break; case kTfLiteInt8: - return EvalQuantizedPerChannel(context, node, params, &data, input, - filter, bias, output); + EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, + output); break; case kTfLiteUInt8: - return EvalQuantized(context, node, params, &data, input, filter, bias, - output); + EvalQuantized(context, node, params, &data, input, filter, bias, output); break; default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc index 6ae3a14bc96..3fac250f504 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -41,6 +41,8 @@ struct OpData { int32_t output_activation_max; // The index of the temporary tensor where the quantized inputs are cached. int input_quantized_index; + // Index to buffer for optimizations if applicable. + int buffer_idx; }; constexpr int kInputTensor = 0; @@ -49,12 +51,14 @@ constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; TfLiteStatus CalculateOpData(TfLiteContext* context, - TfLiteFullyConnectedParams* params, + TfLiteFusedActivation activation, TfLiteType data_type, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output, OpData* data) { TfLiteStatus status = kTfLiteOk; + // Set buffer index to a reset value + data->buffer_idx = -1; if (data_type != kTfLiteFloat32) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( @@ -63,7 +67,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); data->output_shift = -exponent; TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( - context, params->activation, output, &data->output_activation_min, + context, activation, output, &data->output_activation_min, &data->output_activation_max)); } return status; @@ -72,96 +76,148 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, } // namespace void* Init(TfLiteContext* context, const char* buffer, size_t length) { - void* raw; - context->AllocatePersistentBuffer(context, sizeof(int), &raw); - return raw; + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { + return nullptr; + } + return data; } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + OpData* data = static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); + const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); 
TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); + TF_LITE_ENSURE_STATUS(CalculateOpData(context, params->activation, + input->type, input, filter, bias, + output, data)); -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) - RuntimeShape filter_shape = GetTensorShape(filter); - const int filter_dim_count = filter_shape.DimensionsCount(); - const int accum_depth = filter_shape.Dims(filter_dim_count - 1); - const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(accum_depth); + if (input->type == kTfLiteInt8 && nullptr != GetTensorData(bias)) { + RuntimeShape filter_shape = GetTensorShape(filter); + RuntimeShape output_shape = GetTensorShape(output); - int* buffer_idx = reinterpret_cast(node->user_data); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2); + const int filter_dim_count = filter_shape.DimensionsCount(); + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(filter_dim_count - 1); + filter_dims.h = 1; + filter_dims.w = 1; + filter_dims.c = output_shape.Dims(1); - node->user_data = buffer_idx; - if (buf_size > 0) { - TF_LITE_ENSURE_STATUS( - context->RequestScratchBufferInArena(context, buf_size, buffer_idx)); - } else { - *buffer_idx = -1; + const int32_t buf_size = + arm_fully_connected_s8_get_buffer_size(&filter_dims); + + if (buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; + } } -#endif - return kTfLiteOk; } TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, - TfLiteFullyConnectedParams* params, OpData* data, - const TfLiteTensor* input, + const OpData& data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { - RuntimeShape output_shape = GetTensorShape(output); - const int batches = output_shape.Dims(0); - const int output_depth = output_shape.Dims(1); - RuntimeShape filter_shape = GetTensorShape(filter); - const int filter_dim_count = filter_shape.DimensionsCount(); - const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + // The 'if' condition can be removed when null handling of bias is added to + // arm_fully_connected_s8 + if (nullptr != GetTensorData(bias)) { + RuntimeShape output_shape = GetTensorShape(output); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2); + const int batches = output_shape.Dims(0); + const int output_depth = output_shape.Dims(1); + const RuntimeShape filter_shape = GetTensorShape(filter); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + const RuntimeShape input_shape = GetTensorShape(input); -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) - int16_t* buf = nullptr; + cmsis_nn_fc_params fc_params; + fc_params.input_offset = -input->params.zero_point; + fc_params.filter_offset = -filter->params.zero_point; + fc_params.output_offset = output->params.zero_point; + fc_params.activation.min = data.output_activation_min; + fc_params.activation.max = data.output_activation_max; - auto* buffer_idx = reinterpret_cast(node->user_data); - if (*buffer_idx > -1) { - void* raw = context->GetScratchBuffer(context, *buffer_idx); - buf = reinterpret_cast(raw); + cmsis_nn_per_tensor_quant_params quant_params; + quant_params.multiplier = data.output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + quant_params.shift = -data.output_shift; + + 
cmsis_nn_dims input_dims; + input_dims.n = batches; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = accum_depth; + filter_dims.h = 1; + filter_dims.w = 1; + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batches; + output_dims.h = 1; + output_dims.w = 1; + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = nullptr; + ctx.size = 0; + + if (data.buffer_idx > -1) { + ctx.buf = context->GetScratchBuffer(context, data.buffer_idx); + } + + TF_LITE_ENSURE_EQ(context, arm_fully_connected_s8( + &ctx, &fc_params, &quant_params, &input_dims, + GetTensorData(input), &filter_dims, + GetTensorData(filter), &bias_dims, + GetTensorData(bias), &output_dims, + GetTensorData(output)), + ARM_MATH_SUCCESS); + } else { + tflite::FullyConnectedParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = -filter->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier = data.output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + op_params.output_shift = -data.output_shift; + op_params.quantized_activation_min = data.output_activation_min; + op_params.quantized_activation_max = data.output_activation_max; + + reference_integer_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output)); } - - TF_LITE_ENSURE_EQ( - context, - arm_fully_connected_s8( - GetTensorData(input), GetTensorData(filter), - accum_depth, output_depth, batches, -input->params.zero_point, - -filter->params.zero_point, data->output_multiplier, - -data->output_shift, output->params.zero_point, - GetTensorData(bias), GetTensorData(output), - data->output_activation_min, data->output_activation_max, buf), - ARM_MATH_SUCCESS); -#else -#pragma message( \ - "CMSIS-NN optimization for fully_connected not available for this target. 
Using reference kernel.") - - FullyConnectedParams op_params; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = -filter->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = data->output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data->output_shift; - op_params.quantized_activation_min = data->output_activation_min; - op_params.quantized_activation_max = data->output_activation_max; - - reference_integer_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); -#endif return kTfLiteOk; } TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteFullyConnectedParams* params, OpData* data, - const TfLiteTensor* input, + const OpData& data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { const int32_t input_offset = -input->params.zero_point; @@ -172,11 +228,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, op_params.input_offset = input_offset; op_params.weights_offset = filter_offset; op_params.output_offset = output_offset; - op_params.output_multiplier = data->output_multiplier; + op_params.output_multiplier = data.output_multiplier; // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. - op_params.output_shift = -data->output_shift; - op_params.quantized_activation_min = data->output_activation_min; - op_params.quantized_activation_max = data->output_activation_max; + op_params.output_shift = -data.output_shift; + op_params.quantized_activation_min = data.output_activation_min; + op_params.quantized_activation_max = data.output_activation_max; #define TF_LITE_FULLY_CONNECTED(output_data_type) \ reference_ops::FullyConnected( \ @@ -201,11 +257,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, } TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteFullyConnectedParams* params, OpData* data, + TfLiteFusedActivation activation, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { float output_activation_min, output_activation_max; - CalculateActivationRange(params->activation, &output_activation_min, + CalculateActivationRange(activation, &output_activation_min, &output_activation_max); tflite::FullyConnectedParams op_params; op_params.float_activation_min = output_activation_min; @@ -219,32 +275,29 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = - reinterpret_cast(node->builtin_data); + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto* params = + static_cast(node->builtin_data); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteType data_type = input->type; - OpData local_data_object; - OpData* data = &local_data_object; - TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input, - filter, bias, output, data)); + TFLITE_DCHECK(node->user_data != nullptr); + const OpData& data = 
*(static_cast(node->user_data)); // Checks in Prepare ensure input, output and filter types are all the same. switch (input->type) { case kTfLiteFloat32: - return EvalFloat(context, node, params, data, input, filter, bias, + return EvalFloat(context, node, params->activation, input, filter, bias, output); case kTfLiteInt8: - return EvalQuantizedInt8(context, node, params, data, input, filter, bias, + return EvalQuantizedInt8(context, node, data, input, filter, bias, output); case kTfLiteUInt8: - return EvalQuantized(context, node, params, data, input, filter, bias, - output); + return EvalQuantized(context, node, data, input, filter, bias, output); default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc index 42b3c2e52ff..94f8e928868 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -76,13 +76,14 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, GetTensorShape(output), GetTensorData(output)); } -void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, + const TfLitePoolParams* params, const OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); + int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, &activation_min, &activation_max); - PoolParams op_params; op_params.stride_height = params->stride_height; op_params.stride_width = params->stride_width; @@ -92,76 +93,62 @@ void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, op_params.padding_values.width = data->padding.width; op_params.quantized_activation_min = activation_min; op_params.quantized_activation_max = activation_max; - reference_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); -} -TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - TfLiteTensor* input, TfLiteTensor* output) { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); + if (input->type == kTfLiteUInt8) { + reference_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + TFLITE_DCHECK_LE(activation_min, activation_max); - TFLITE_DCHECK_LE(activation_min, activation_max); + RuntimeShape input_shape = GetTensorShape(input); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) - RuntimeShape input_shape = GetTensorShape(input); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + RuntimeShape output_shape = GetTensorShape(output); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - RuntimeShape output_shape = 
GetTensorShape(output); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); - const int depth = MatchingDim(input_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - const int stride_height = params->stride_height; - const int stride_width = params->stride_width; + cmsis_nn_dims input_dims; + input_dims.n = 1; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = depth; - const int filter_height = params->filter_height; - const int filter_width = params->filter_width; - const int padding_height = data->padding.height; - const int padding_width = data->padding.width; + cmsis_nn_dims output_dims; + output_dims.n = 1; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = depth; - int16_t* scratch_buffer = nullptr; + cmsis_nn_pool_params pool_params; + pool_params.stride.h = params->stride_height; + pool_params.stride.w = params->stride_width; + pool_params.padding.h = data->padding.height; + pool_params.padding.w = data->padding.width; + pool_params.activation.min = activation_min; + pool_params.activation.max = activation_max; - auto* buffer_idx = reinterpret_cast(node->user_data); + cmsis_nn_dims filter_dims; + filter_dims.n = 1; + filter_dims.h = params->filter_height; + filter_dims.w = params->filter_width; + filter_dims.c = 1; - if (*buffer_idx > -1) { - void* raw = context->GetScratchBuffer(context, *buffer_idx); - scratch_buffer = reinterpret_cast(raw); + auto* buffer_idx = reinterpret_cast(node->user_data); + cmsis_nn_context ctx; + ctx.buf = nullptr; + ctx.size = 0; + if (*buffer_idx > -1) { + ctx.buf = context->GetScratchBuffer(context, *buffer_idx); + } + + TFLITE_DCHECK_EQ( + arm_avgpool_s8(&ctx, &pool_params, &input_dims, + GetTensorData(input), &filter_dims, &output_dims, + GetTensorData(output)), + ARM_MATH_SUCCESS); } - - TF_LITE_ENSURE_EQ( - context, - arm_avgpool_s8(input_height, input_width, output_height, output_width, - stride_height, stride_width, filter_height, filter_width, - padding_height, padding_width, activation_min, - activation_max, depth, GetTensorData(input), - scratch_buffer, GetTensorData(output)), - ARM_MATH_SUCCESS); -#else -#pragma message( \ - "CMSIS-NN optimization for avg_pool not available for this target. 
Using reference kernel.") - - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_integer_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - -#endif - return kTfLiteOk; } void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -215,7 +202,6 @@ TfLiteStatus MaxEvalInt8(TfLiteContext* context, const TfLiteNode* node, TFLITE_DCHECK_LE(activation_min, activation_max); -#if defined(__ARM_FEATURE_DSP) RuntimeShape input_shape = GetTensorShape(input); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -245,32 +231,14 @@ TfLiteStatus MaxEvalInt8(TfLiteContext* context, const TfLiteNode* node, } TF_LITE_ENSURE_EQ( - context, - arm_max_pool_s8_opt(input_height, input_width, output_height, - output_width, stride_height, stride_width, - filter_height, filter_width, padding_height, - padding_width, activation_min, activation_max, depth, - GetTensorData(input), scratch_buffer, - GetTensorData(output)), + context, arm_max_pool_s8_opt( + input_height, input_width, output_height, output_width, + stride_height, stride_width, filter_height, filter_width, + padding_height, padding_width, activation_min, + activation_max, depth, GetTensorData(input), + scratch_buffer, GetTensorData(output)), ARM_MATH_SUCCESS); -#else -#pragma message( \ - "CMSIS-NN optimization for max_pool not available for this target. 
Using reference kernel.") - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_integer_ops::MaxPool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - -#endif return kTfLiteOk; } @@ -283,32 +251,33 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { -#if defined(__ARM_FEATURE_DSP) || defined(__ARM_FEATURE_MVE) const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - RuntimeShape input_shape = GetTensorShape(input); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + if (input->type == kTfLiteInt8) { + const TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - RuntimeShape output_shape = GetTensorShape(output); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + RuntimeShape input_shape = GetTensorShape(input); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - const int depth = MatchingDim(input_shape, 3, output_shape, 3); - const int output_width = output_shape.Dims(2); + RuntimeShape output_shape = GetTensorShape(output); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int32_t buffer_size = - arm_avgpool_s8_get_buffer_size(output_width, depth); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int output_width = output_shape.Dims(2); - int* buffer_idx = reinterpret_cast(node->user_data); + const int32_t buffer_size = + arm_avgpool_s8_get_buffer_size(output_width, depth); - node->user_data = buffer_idx; - if (buffer_size > 0) { - TF_LITE_ENSURE_STATUS( - context->RequestScratchBufferInArena(context, buffer_size, buffer_idx)); - } else { - *buffer_idx = -1; + int* buffer_idx = reinterpret_cast(node->user_data); + + node->user_data = buffer_idx; + if (buffer_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, buffer_size, buffer_idx)); + } else { + *buffer_idx = -1; + } } -#endif return kTfLiteOk; } @@ -316,9 +285,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); OpData data; - // Todo: make 'input' const once CMSIS-reuse is fixed - TfLiteTensor* input = &context->tensors[flatbuffers::EndianScalar( - node->inputs->data[kInputTensor])]; + const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data)); @@ -329,10 +296,8 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { AverageEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: - AverageEvalUint8(context, node, params, &data, input, output); - break; case kTfLiteInt8: - return AverageEvalInt8(context, node, params, &data, input, output); + AverageEvalQuantized(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", @@ -387,7 +352,7 @@ TfLiteRegistration* 
Register_AVERAGE_POOL_2D() { TfLiteRegistration* Register_MAX_POOL_2D() { static TfLiteRegistration r = {/*init=*/pooling::Init, /*free=*/nullptr, - /*prepare=*/pooling::Prepare, + /*prepare=*/nullptr, /*invoke=*/pooling::MaxEval, /*profiling_string=*/nullptr, /*builtin_code=*/0, diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc index d445a6f7b37..32874f13dbf 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc @@ -55,6 +55,7 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) $(CMSIS_PATH)/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c \ $(CMSIS_PATH)/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c \ $(CMSIS_PATH)/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c \ + $(CMSIS_PATH)/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c \ $(CMSIS_PATH)/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c \ $(CMSIS_PATH)/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8_opt.c \ $(CMSIS_PATH)/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c \ diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 28d3f3ab529..2f2f9396dc0 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -28,8 +28,8 @@ LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765" TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz" TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f" -CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/1150e71e07c79b538efd842aba5b210a31827ae5.zip" -CMSIS_MD5 := "e05f4222ef58825193910b41a0871dcb" +CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/0f1587564506b385d57a58baed8c2c6a1e2b959d.zip" +CMSIS_MD5 := "b7bf586417df9ed586d50cb9b885509f" AM_SDK_URL := "http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.2.0.zip" AM_SDK_MD5 := "7605fa2d4d97e6bb7a1190c92b66b597" From e4af590df8ff7685de1679f2a6eb01a677d84775 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 18 Jun 2020 01:35:40 -0700 Subject: [PATCH 0481/1390] Properly configure block and grid dimensions when launching generated kernels. Also prepare the possibility to specify unrolling. This is not enabled yet because there are some LLVM changes required. PiperOrigin-RevId: 317056534 Change-Id: I3de5dda52d80b528c4bd0026a5e160fda4296c32 --- .../mlir_generated_cwise_op_gpu_tanh.cu.cc | 71 ++++++++++++++----- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc index 40dd7c7e49e..70de777239f 100644 --- a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include +#include #include "absl/strings/string_view.h" #include "absl/types/span.h" @@ -45,9 +46,41 @@ Status CreateKernel(absl::string_view kernel_name, uint64_t num_args, return stream_exec->GetKernel(loader_spec, kernel_base.get()); } -class MlirGenerateTanhOp : public OpKernel { +struct LaunchConfig { + se::BlockDim blockDim; + se::ThreadDim threadDim; +}; + +LaunchConfig GetLaunchConfiguration(std::vector tile_sizes, + std::vector unrolling_factors, + std::vector shape) { + LaunchConfig result; + // Ensure the vectors are length 3 and pad with ones. + tile_sizes.resize(3, 1); + unrolling_factors.resize(3, 1); + shape.resize(3, 1); + // The number of threads is given by the tiling size. + result.threadDim = se::ThreadDim(tile_sizes[0], tile_sizes[1], tile_sizes[2]); + // We know that the kernel was generated by mapping the three outer-most + // dimensions to x,y,z dimensions. So we only need to compute those. + std::vector block_dims(3); + for (int i = 0; i < 3; ++i) { + // Compute the number of grids. We use ceildiv here as we have to allocate + // an extra thread/block if the division is not even. The kernel contains + // code to handle the boundaries. + int number_of_threads = + (shape[i] + unrolling_factors[i] - 1) / unrolling_factors[i]; + int number_of_grids = + (number_of_threads + tile_sizes[i] - 1) / tile_sizes[i]; + block_dims[i] = number_of_grids; + } + result.blockDim = se::BlockDim(block_dims[0], block_dims[1], block_dims[2]); + return result; +} + +class MlirGeneratedTanhOp : public OpKernel { public: - explicit MlirGenerateTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + explicit MlirGeneratedTanhOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} void Compute(OpKernelContext* ctx) override { auto* stream = ctx->op_device_context()->stream(); @@ -88,11 +121,13 @@ class MlirGenerateTanhOp : public OpKernel { args.add_argument(inp.NumElements()); args.add_argument(1); - // TODO(b/158649746): Choose block size and thread dim according to the - // number of input elements. For now, this supports at most 1024 elements. + // This has to be aligned with the configuration that was used when building + // the kernels. See the corresponding build rules in `cubin_headers/BUILD`. 
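+    // For example, a 1000-element input with the 256-thread tile size and no
+    // unrolling is launched with threadDim = (256, 1, 1) and
+    // blockDim = (4, 1, 1), since ceil(1000 / 256) = 4 blocks cover all
+    // elements.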
+ LaunchConfig config = GetLaunchConfiguration( + {256}, {}, {static_cast(inp.NumElements())}); OP_REQUIRES_OK( - ctx, stream->parent()->Launch(stream, se::ThreadDim(inp.NumElements()), - se::BlockDim(1), *kernel, args)); + ctx, stream->parent()->Launch(stream, config.threadDim, config.blockDim, + *kernel, args)); } protected: @@ -103,26 +138,26 @@ class MlirGenerateTanhOp : public OpKernel { std::mutex mu_; }; -class MlirGenerateTanhF16Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF16Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF16Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF16Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF16Kernel; } }; -class MlirGenerateTanhF32Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF32Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF32Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF32Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF32Kernel; } }; -class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { +class MlirGeneratedTanhF64Op : public MlirGeneratedTanhOp { public: - explicit MlirGenerateTanhF64Op(OpKernelConstruction* ctx) - : MlirGenerateTanhOp(ctx) { + explicit MlirGeneratedTanhF64Op(OpKernelConstruction* ctx) + : MlirGeneratedTanhOp(ctx) { cubin_data_ = kTanhF64Kernel; } }; @@ -130,11 +165,11 @@ class MlirGenerateTanhF64Op : public MlirGenerateTanhOp { REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF16Op); + MlirGeneratedTanhF16Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF32Op); + MlirGeneratedTanhF32Op); REGISTER_KERNEL_BUILDER( Name("Tanh").Device(DEVICE_GPU).TypeConstraint("T"), - MlirGenerateTanhF64Op); + MlirGeneratedTanhF64Op); } // namespace tensorflow From ad1434444bda1e8321dcc09965a5dce8da847eed Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 01:51:06 -0700 Subject: [PATCH 0482/1390] Remove dynamic dimension of strided slice grad if input to strided slice is static. If we slice a dynamic shaped tensor from a static tensor, the output of the gradient should still be static. Unfortunately this cannot be deduced alone by xla, so extra information is needed from the tf2xla bridge. 
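As a concrete illustration of the scenario above (hypothetical shapes, not part of this patch): strided-slicing a piece with a run-time length out of a static f32[10] parameter yields a dynamic f32[<=10], but the gradient with respect to that parameter has the parameter's own shape and must therefore stay a static f32[10]. The dynamic/static annotation involved is the builder's dimension-size mechanism, sketched here in the style of the xla_builder test touched later in this patch:

  // Sketch only -- assumes the XlaBuilder client headers; mirrors the
  // SetDimensionSize/RemoveDynamicDimension pair handled by this change.
  #include "tensorflow/compiler/xla/client/xla_builder.h"
  #include "tensorflow/compiler/xla/shape_util.h"

  void DynamicDimensionSketch() {
    xla::XlaBuilder b("sketch");
    auto p0 = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla::F32, {10}), "p0");
    auto p1 = xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla::S32, {}), "p1");
    // Dimension 0 now has a run-time size given by p1: shape becomes f32[<=10].
    auto dynamic = xla::SetDimensionSize(p0, p1, /*dimension=*/0);
    // Dropping the annotation makes the shape fully static f32[10] again.
    auto static_again = xla::RemoveDynamicDimension(dynamic, /*dimension=*/0);
    (void)static_again;
  }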
PiperOrigin-RevId: 317058146 Change-Id: I33e4895e169c238ad3d73a57ada11c4984d11dfb --- .../tf2xla/kernels/strided_slice_op.cc | 22 ---------------- tensorflow/compiler/xla/client/xla_builder.cc | 23 ---------------- tensorflow/compiler/xla/client/xla_builder.h | 6 ----- .../compiler/xla/client/xla_builder_test.cc | 26 ------------------- 4 files changed, 77 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 51764018df1..2684c982600 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -350,28 +350,6 @@ class StridedSliceGradOp : public XlaOpKernel { grad = xla::Rev(grad, dimensions_to_reverse); } grad = xla::Pad(grad, zero, padding_config); - - xla::XlaOp dynamic_shape = ctx->Input(0); - xla::Shape grad_shape = ctx->builder()->GetShape(grad).ValueOrDie(); - ctx->set_dynamic_dimension_is_minus_one(true); - std::vector dynamic_size; - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &dynamic_size)); - // Input of strided_slice_op has to have the same shape as output. - DCHECK_EQ(grad_shape.rank(), input_shape.dims()); - for (int64 dim = 0; dim < input_shape.dims(); ++dim) { - DCHECK_EQ(grad_shape.dimensions(dim), input_shape.dim_size(dim)); - if (dynamic_size[dim] == -1) { - // Input is a dynamic dimension, set the same dynamic dimension size in - // the output. - auto dim_size = xla::Slice(dynamic_shape, {dim}, {dim + 1}, {1}); - grad = xla::SetDimensionSize(grad, dim_size, dim); - } else if (grad_shape.is_dynamic_dimension(dim)) { - // Input is static but output is dynamic, respect input and remove any - // dynamic dim in the output. - grad = xla::RemoveDynamicDimension(grad, dim); - } - } - ctx->SetOutput(0, grad); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index c7b6a7f9491..bfba48862f6 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2727,25 +2727,6 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64 dimension) { }); } -XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64 dimension) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - - Shape shape = *operand_shape; - shape.set_dynamic_dimension(dimension, false); - // Setting an op's dynamic dimension to its static size removes the dynamic - // dimension. 
- XlaOp static_size = - ConstantR0(this, operand_shape->dimensions(dimension)); - - *instr.mutable_shape() = shape.ToProto(); - instr.add_dimensions(dimension); - return AddInstruction(std::move(instr), HloOpcode::kSetDimensionSize, - {operand, static_size}); - }); -} - XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -3846,8 +3827,4 @@ XlaOp SetDimensionSize(const XlaOp operand, const XlaOp val, int64 dimension) { return operand.builder()->SetDimensionSize(operand, val, dimension); } -XlaOp RemoveDynamicDimension(const XlaOp operand, int64 dimension) { - return operand.builder()->RemoveDynamicDimension(operand, dimension); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index b8af180b83e..ffa6a7c3439 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -704,8 +704,6 @@ class XlaBuilder { XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); - XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); - StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, absl::Span operands = {}); @@ -1153,7 +1151,6 @@ class XlaBuilder { friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension); friend XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); - friend XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); protected: // Returns OK status if the given op was built using this builder. Otherwise, @@ -2152,9 +2149,6 @@ XlaOp GetDimensionSize(XlaOp operand, int64 dimension); XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); -// Returns the same op but with dynamic dimension removed. -XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); - // Implementation details below this point. // diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 7011c946203..4fa47077fca 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -556,32 +556,6 @@ TEST_F(XlaBuilderTest, DynamicParameter) { EXPECT_TRUE(param_shape.is_dynamic_dimension(0)); } -TEST_F(XlaBuilderTest, SetDimensionSize) { - XlaBuilder b(TestName()); - auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); - auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); - auto set_dim_size = SetDimensionSize(p0, p1, 0); - TF_ASSERT_OK_AND_ASSIGN(auto module, - BuildHloModule(&b, /*root=*/set_dim_size)); - const Shape& root_shape = - module->entry_computation()->root_instruction()->shape(); - EXPECT_TRUE(root_shape.is_dynamic_dimension(0)); -} - -TEST_F(XlaBuilderTest, RemoveDimensionSize) { - XlaBuilder b(TestName()); - auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); - auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); - auto set_dim_size = SetDimensionSize(p0, p1, 0); - auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0); - TF_ASSERT_OK_AND_ASSIGN(auto module, - BuildHloModule(&b, /*root=*/remove_dim_size)); - const Shape& root_shape = - module->entry_computation()->root_instruction()->shape(); - // Dynamic dimension has been removed. 
- EXPECT_FALSE(root_shape.is_dynamic_dimension(0)); -} - TEST_F(XlaBuilderTest, DynamicUnary) { XlaBuilder b(TestName()); Shape tuple_param_shape = ShapeUtil::MakeTupleShape( From 50f0fcc0005d3afec88b281a07563a603dbe0c7c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 02:01:45 -0700 Subject: [PATCH 0483/1390] compat: Update forward compatibility horizon to 2020-06-18 PiperOrigin-RevId: 317059247 Change-Id: I61e0e6659103a5b413a0013fb0782ba0d470d285 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 8a42b3dfdd3..32545ac8463 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 39c6a3b2cd0193eae45a051344921a3c18a2e691 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 02:01:54 -0700 Subject: [PATCH 0484/1390] Update GraphDef version to 436. PiperOrigin-RevId: 317059270 Change-Id: I2eb1760fec06cae5cf58c41effa3263a0bdde142 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 3e4e3888d87..546d86e58fa 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 435 // Updated: 2020/6/17 +#define TF_GRAPH_DEF_VERSION 436 // Updated: 2020/6/18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From dd49e65c5b68c4b8113dfe5aadb988fdeb2abd57 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 18 Jun 2020 02:38:26 -0700 Subject: [PATCH 0485/1390] [XLA:CPU] Fusion: Only check for reuse on expensive instructions The reuse condition walks over all instructions in the fusion, the fusion pass walks over all instructions, making this essentially quadratic. Moving the is_expensive check up doesn't completely avoid this behavior, but makes it much more unlikely. PiperOrigin-RevId: 317063582 Change-Id: I22459aa922e6d65c6c639ed81208d1d441a132bc --- .../compiler/xla/service/cpu/cpu_instruction_fusion.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 97e0a518499..9460cc55e10 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -94,9 +94,8 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). 
- if (producer->opcode() != HloOpcode::kFusion && - consumer->ReusesOperandElements(operand_index) && - is_expensive(*producer)) { + if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && + consumer->ReusesOperandElements(operand_index)) { VLOG(2) << "Fusion is not profitable."; return false; } From c21328bd992fe359b585452df92fc82976d27557 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 18 Jun 2020 12:06:40 +0200 Subject: [PATCH 0486/1390] Don't skip distributed test case in eager mode --- .../mixed_precision/experimental/autocast_variable_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 940bd07c813..2fa7c103258 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -304,8 +304,8 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllClose(3., self.evaluate(x.assign_sub(3.))) # Assign multiple times - # This currently doesn't work in graph mode - if context.executing_eagerly() or ops.inside_function(): + # This currently doesn't work in graph mode if a strategy is used + if not ds_context.has_strategy() or context.executing_eagerly(): assign = x.assign(1.) self.assertAllClose(1., self.evaluate(assign)) self.assertAllClose(0., self.evaluate(assign.assign(0.))) From de5620b74cb1688dcc81f3e50ad8fc24b18beb9b Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 18 Jun 2020 04:33:29 -0700 Subject: [PATCH 0487/1390] Add option in kernel_lowering to use the tanh approximation. PiperOrigin-RevId: 317077338 Change-Id: I5cc7bfb84d6defba05439186377f90e422247d68 --- tensorflow/compiler/xla/service/mlir_gpu/BUILD | 1 + tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc | 5 +++++ tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h | 1 + 3 files changed, 7 insertions(+) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index ce45d937424..efe69450846 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -167,6 +167,7 @@ cc_library( "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_gpu", "//tensorflow/compiler/mlir/xla:xla_dialect_registration", + "//tensorflow/compiler/mlir/xla:xla_legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/xla:xla_legalize_to_linalg", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index b0cbddcdb92..9d5b52df010 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -505,6 +505,11 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Some basic cleanup. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Approximate of requested. + if (options.use_approximations) { + pm.addNestedPass<::mlir::FuncOp>( + ::mlir::xla::createLegalizeTanhToApproximationPass()); + } // Move scalar operations into the launch to ensure smaller signatures. 
pm.addPass(absl::make_unique()); // Take launches to launches with kernels. diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index 77cf75b9e47..bd633bb06cb 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -28,6 +28,7 @@ struct LowerLHLOToGPUOptions { llvm::ArrayRef unroll_factors = {}; bool collapse_parallel_loops = true; bool rewrite_signature = true; + bool use_approximations = false; }; Status LowerLHLOToGPU(mlir::ModuleOp module, From 16e6f9e792741b98d27b4f1463057313d04acdd8 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 18 Jun 2020 05:00:40 -0700 Subject: [PATCH 0488/1390] Enable unrolling for the generated tanh kernel. PiperOrigin-RevId: 317080498 Change-Id: Idaac4b2efee78cd2cd44c68ede4ac56f1a4cde40 --- tensorflow/core/kernels/cubin_headers/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD index b8ba164fbc3..ec8b44050db 100644 --- a/tensorflow/core/kernels/cubin_headers/BUILD +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -37,4 +37,5 @@ gen_kernel_library( "f32", "f64", ], + unroll_factors = "4", ) From 0a541ad1cc89f1eead6a47dc1676bd86dc810937 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 18 Jun 2020 05:07:41 -0700 Subject: [PATCH 0489/1390] Remove intermediate relocatable code stored in __nv_relfatbin sections, if objcopy is at least version 2.26 (which added support for --update-sections). The intermediate code is a result of separate compilation and linking, and removing it reduces TF's GPU wheel size. PiperOrigin-RevId: 317081343 Change-Id: I603477b4499344aeec653765be78de11f392eac6 --- third_party/nccl/build_defs.bzl.tpl | 103 ++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index 9268af7c890..b520f71d0f1 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -169,35 +169,94 @@ _device_link = rule( ) """Links device code and generates source code for kernel registration.""" +def _prune_relocatable_code_impl(ctx): + """Clears __nv_relfatbin section containing relocatable device code.""" + empty_file = ctx.actions.declare_file(ctx.attr.name + "__nv_relfatbin") + ctx.actions.write(empty_file, "") + + # Parse 'objcopy --version' and update section if it's at least v2.26. + # Otherwise, simply copy the file without changing it. + # TODO(csigg): version parsing is brittle, can we do better? 
+ command = r""" + objcopy=$1 \ + section=$2 \ + input=$3 \ + output=$4 \ + args="" \ + pattern='([0-9])\.([0-9]+)'; \ + if [[ $($objcopy --version) =~ $pattern ]] && { \ + [ ${BASH_REMATCH[1]} -gt 2 ] || \ + [ ${BASH_REMATCH[2]} -ge 26 ]; }; then \ + args="--update-section __nv_relfatbin=$section"; \ + fi; \ + $objcopy $args $input $output + """ + cc_toolchain = find_cpp_toolchain(ctx) + outputs = [] + for src in ctx.files.srcs: + out = ctx.actions.declare_file("pruned_" + src.basename, sibling = src) + ctx.actions.run_shell( + inputs = [empty_file] + ctx.files.srcs, # + ctx.files._crosstool, + outputs = [out], + arguments = [ + cc_toolchain.objcopy_executable, + empty_file.path, + src.path, + out.path, + ], + command = command, + ) + outputs.append(out) + return DefaultInfo(files = depset(outputs)) + +_prune_relocatable_code = rule( + implementation = _prune_relocatable_code_impl, + attrs = { + "srcs": attr.label_list(mandatory = True, allow_files = True), + "_cc_toolchain": attr.label( + default = "@bazel_tools//tools/cpp:current_cc_toolchain", + ), + # "_crosstool": attr.label_list( + # cfg = "host", + # default = ["@bazel_tools//tools/cpp:crosstool"] + # ), + }, +) + def _merge_archive_impl(ctx): # Generate an mri script to the merge archives in srcs and pass it to 'ar'. # See https://stackoverflow.com/a/23621751. files = _pic_only(ctx.files.srcs) mri_script = "create " + ctx.outputs.out.path for f in files: - mri_script += "\\naddlib " + f.path - mri_script += "\\nsave\\nend" + mri_script += r"\naddlib " + f.path + mri_script += r"\nsave\nend" cc_toolchain = find_cpp_toolchain(ctx) ctx.actions.run_shell( inputs = ctx.files.srcs, # + ctx.files._crosstool, outputs = [ctx.outputs.out], - command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable), + command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable), ) _merge_archive = rule( implementation = _merge_archive_impl, attrs = { "srcs": attr.label_list(mandatory = True, allow_files = True), - "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"), - # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]), + "_cc_toolchain": attr.label( + default = "@bazel_tools//tools/cpp:current_cc_toolchain", + ), + # "_crosstool": attr.label_list( + # cfg = "host", + # default = ["@bazel_tools//tools/cpp:crosstool"] + # ), }, outputs = {"out": "lib%{name}.a"}, ) """Merges srcs into a single archive.""" def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs): - """Produces a cuda_library using separate compilation and linking. + r"""Produces a cuda_library using separate compilation and linking. CUDA separate compilation and linking allows device function calls across translation units. This is different from the normal whole program @@ -239,17 +298,24 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg The steps marked with '*' are implemented in the _device_link rule. + The intermediate relocatable device code in xy.a is no longer needed at + this point and the corresponding section is replaced with an empty one using + objcopy. We do not remove the section completely because it is referenced by + relocations, and removing those as well breaks fatbin registration. + The object files in both xy.a and dlink.a reference symbols defined in the other archive. The separate archives are a side effect of using two cc_library targets to implement a single compilation trajectory. 
We could fix this once bazel supports C++ sandwich. For now, we just merge the two archives to avoid unresolved symbols: - xy.a dlink.a - \ / merge archive - xy_dlink.a - | cc_library (or alternatively, cc_import) - final target + xy.a + | objcopy --update-section __nv_relfatbin='' + dlink.a xy_pruned.a + \ / merge archive + xy_merged.a + | cc_library (or alternatively, cc_import) + final target Another complication is that cc_library produces (depending on the configuration) both PIC and non-PIC archives, but the distinction @@ -313,19 +379,26 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg linkstatic = linkstatic, ) + # Remove intermediate relocatable device code. + pruned = name + "_pruned" + _prune_relocatable_code( + name = pruned, + srcs = [lib], + ) + # Repackage the two libs into a single archive. This is required because # both libs reference symbols defined in the other one. For details, see # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking - archive = name + "_a" + merged = name + "_merged" _merge_archive( - name = archive, - srcs = [lib, dlink], + name = merged, + srcs = [pruned, dlink], ) # Create cc target from archive. native.cc_library( name = name, - srcs = [archive], + srcs = [merged], hdrs = hdrs, linkstatic = linkstatic, ) From 7378fabf90f489387e7dbfb111e20ddf03b1910a Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 18 Jun 2020 05:19:27 -0700 Subject: [PATCH 0490/1390] Emit tuple at the end of the Sort emitter. If sort has more than one operand, the result is a tuple. So far, we didn't emit the tuple at the end of the emitter. PiperOrigin-RevId: 317082573 Change-Id: I7bec31302ba2e40556b17654daa081428871a00e --- .../xla/service/gpu/ir_emitter_unnested.cc | 7 ++++ tensorflow/compiler/xla/tests/tuple_test.cc | 32 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 937a0ea5bbc..74aad5f5bd5 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1418,6 +1418,13 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { AddThunkToThunkSequence( absl::make_unique(std::move(thunks), sort)); + if (sort->operand_count() > 1) { + // Emit the tuple as part of the last stage of sorting. + // We are currently in the block sorted.in_bounds.after. 
+ b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); + llvm_ir::EmitTuple(GetIrArray(*sort, *sort), + ConstructIrArrayForOutputs(*sort), &b_); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 9ef589e5511..b6ad44497e6 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -577,5 +577,37 @@ XLA_TEST_F(TupleHloTest, EXPECT_TRUE(LiteralTestUtil::Equal(expected, literal)); } +XLA_TEST_F(TupleHloTest, TupleSelectOfSort) { + const char* testcase = R"( + HloModule sort + + compare { + p.1.lhs = s32[] parameter(2) + p.1.rhs = s32[] parameter(3) + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT + } + + ENTRY Sort { + keys = f32[2]{0} iota(), iota_dimension=0 + values = s32[2]{0} iota(), iota_dimension=0 + preds = pred[] constant(true) + alt = (f32[2], s32[2]) parameter(0) + + sorted = (f32[2]{0}, s32[2]{0}) sort(keys, values), dimensions={0}, + to_apply=compare + ROOT selected = (f32[2], s32[2]) tuple-select(preds, sorted, alt) + } + )"; + auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie(); + auto param = LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({2, 3}), + LiteralUtil::CreateR1({3, 4})); + auto expected = LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1({0, 1}), LiteralUtil::CreateR1({0, 1})); + auto result = ExecuteAndTransfer(std::move(module), {¶m}); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); +} + } // namespace } // namespace xla From 78f6d51b8d9a2f3542918f004a29a6fbba232a40 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 18 Jun 2020 05:26:02 -0700 Subject: [PATCH 0491/1390] Also set the unroll factor in the C++ integration code for tanh. PiperOrigin-RevId: 317083225 Change-Id: I7c2c26d664c15cbc967188da4b3012161edbcf49 --- tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc index 70de777239f..a122c5112e6 100644 --- a/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc +++ b/tensorflow/core/kernels/mlir_generated_cwise_op_gpu_tanh.cu.cc @@ -124,7 +124,7 @@ class MlirGeneratedTanhOp : public OpKernel { // This has to be aligned with the configuration that was used when building // the kernels. See the corresponding build rules in `cubin_headers/BUILD`. LaunchConfig config = GetLaunchConfiguration( - {256}, {}, {static_cast(inp.NumElements())}); + {256}, {4}, {static_cast(inp.NumElements())}); OP_REQUIRES_OK( ctx, stream->parent()->Launch(stream, config.threadDim, config.blockDim, *kernel, args)); From 2ad57928530e3e10ba89e91ff4df3c6fa9feafa0 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 18 Jun 2020 05:29:17 -0700 Subject: [PATCH 0492/1390] Bump open source llvm revision to e3836fe1a5562875396705369353078ab07cf07a PiperOrigin-RevId: 317083604 Change-Id: Ifd810846a890ad137a31b4db5af5e049450765d3 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 78f7e0ce03e..1c165ba5aba 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "4799fb63b5513f655ca8e85416ec8fe35df49bae" - LLVM_SHA256 = "f401a61bd7f5b05bd8a3ffdfb1f32e9379cae2c8e988f3ae6772b588ad97c84a" + LLVM_COMMIT = "e3836fe1a5562875396705369353078ab07cf07a" + LLVM_SHA256 = "6a78815e2c71c560a11c8c1740d31b88a607d82b7ccc61dc142bef0f1f3fbde8" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 4f5c65d4494b4e4831d016176d506227c011f01b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 06:50:27 -0700 Subject: [PATCH 0493/1390] Make linear layout more explicit. PiperOrigin-RevId: 317093123 Change-Id: I7af437c2c8afb31683bb659b1939eac2ce851da5 --- .../compiler/xla/service/layout_assignment.cc | 14 +++++++------ .../xla/service/layout_assignment_test.cc | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 307fd82069e..a35ba140e86 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -951,12 +951,7 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape()) && - // TODO(mingyao): Use explicit linear layout tiling to - // detect and allow special bitcast. - instruction->opcode() != HloOpcode::kBitcast && - instruction->opcode() != HloOpcode::kGetTupleElement && - instruction->opcode() != HloOpcode::kTuple) { + buffer->shape())) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", @@ -1803,6 +1798,13 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kBitcast) { + // bitcasts are inherently layout sensitive and so a bitcast instruction + // present in the IR before layout assignment is a bug. + return InternalError( + "Unexpected bitcast operation seen during layout assignment: %s.", + instruction->ToString()); + } // Some instructions carry mandatory layouts in their shape. 
if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 6e575247e6b..304a80c7a52 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,6 +814,27 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } +TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { + auto builder = HloComputation::Builder(TestName()); + auto constant0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( + {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); + builder.AddInstruction( + HloInstruction::CreateBitcast(constant0->shape(), constant0)); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); + + ComputationLayout computation_layout( + m->entry_computation()->ComputeProgramShape()); + LayoutAssignment layout_assignment(&computation_layout); + Status error_status = layout_assignment.Run(m.get()).status(); + EXPECT_FALSE(error_status.ok()); + EXPECT_THAT( + error_status.error_message(), + ::testing::HasSubstr( + "Unexpected bitcast operation seen during layout assignment")); +} + TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. const char* module_str = R"( From c65fccf0a6671c90599d0d3426dd18597688ea3a Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 17 Jun 2020 11:55:27 -0700 Subject: [PATCH 0494/1390] Format changes as per Google's feedback --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 4 ++-- .../ci_build/linux/mkl/build-dev-container.sh | 21 +++++++++---------- .../linux/mkl/install_openmpi_horovod.sh | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index a78d13c7755..1f80cba35f0 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -64,9 +64,9 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ # Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . -RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ +RUN if [ "${ENABLE_HOROVOD}" == "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ./install_openmpi_horovod.sh OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} && \ + OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} ./install_openmpi_horovod.sh && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 6e789a54e87..a0880b0e51c 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -201,21 +201,20 @@ function test_container() debug "ID of the running docker container: ${CONTAINER_ID}" debug "Performing basic sanity checks on the running container..." 
- TEST_CMD_1=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import _pywrap_util_port; print(_pywrap_util_port.IsMklEnabled())'") - # Make TEST_CMD backward compatible with older code - TEST_CMD_2=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import pywrap_tensorflow; print(pywrap_tensorflow.IsMklEnabled())'") - - if [ "${TEST_CMD_1}" = "True" -o "${TEST_CMD_2}" = "True" ] ; then - echo "PASS: MKL enabled test in ${TEMP_IMAGE_NAME}" - else - die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" - fi + { + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import _pywrap_util_port; print(_pywrap_util_port.IsMklEnabled())'" + echo "PASS: MKL enabled test in ${TEMP_IMAGE_NAME}" + } || { + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import pywrap_tensorflow; print(pywrap_tensorflow.IsMklEnabled())'" + echo "PASS: Old MKL enabled in ${TEMP_IMAGE_NAME}" + } || { + die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" + } # Test to check if horovod is installed successfully if [[ ${ENABLE_HOROVOD} == "yes" ]]; then debug "Test horovod in the container..." - HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") - ${HOROVOD_TEST_CMD} + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'" if [[ $? == "0" ]]; then echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" else diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index aec40543a17..9bc92ca4fef 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -54,8 +54,9 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -apt-get clean && apt-get update && apt-get install -y --no-install-recommends --fix-missing \ - openssh-client openssh-server libnuma-dev && \ +apt-get clean && apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + openssh-client openssh-server libnuma-dev && \ rm -rf /var/lib/apt/lists/* if [[ $? == "0" ]]; then echo "PASS: OpenSSH installation" @@ -70,7 +71,7 @@ else fi mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation -cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new +grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config From 55929ce6ae2bc9552526a038ef4d01d8bef4f4fd Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 17 Jun 2020 12:52:57 -0700 Subject: [PATCH 0495/1390] Reverting == in Dockerfile --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 1f80cba35f0..80091e55a17 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -64,7 +64,7 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ # Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . 
-RUN if [ "${ENABLE_HOROVOD}" == "yes" ]; then \ +RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} ./install_openmpi_horovod.sh && \ rm -rf install_openmpi_horovod.sh; \ From c054f40f66fa625f51085a20c48554c61d05c5fd Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 18 Jun 2020 14:48:42 +0000 Subject: [PATCH 0496/1390] [ROCm] Fix ROCm CSB build failure - 200618 The folllowing commit introduces build error on the ROCm platform https://github.com/tensorflow/tensorflow/commit/b6ff68822a59578f942e4fb8076757da8db278ae build error ``` In file included from tensorflow/core/kernels/split_lib_gpu.cu.cc:27: In file included from ./tensorflow/core/util/gpu_kernel_helper.h:25: ./tensorflow/core/util/gpu_device_functions.h:824:25: error: redefinition of 'GpuAtomicMax' __device__ inline int64 GpuAtomicMax(int64* ptr, int64 value) { ^ ./tensorflow/core/util/gpu_device_functions.h:792:29: note: previous definition is here __device__ inline long long GpuAtomicMax(long long* ptr, long long value) { ^ ./tensorflow/core/util/gpu_device_functions.h:894:25: error: redefinition of 'GpuAtomicMin' __device__ inline int64 GpuAtomicMin(int64* ptr, int64 value) { ^ ./tensorflow/core/util/gpu_device_functions.h:862:29: note: previous definition is here __device__ inline long long GpuAtomicMin(long long* ptr, long long value) { ^ 2 errors generated. ... ... ``` The cause is a combination of two things * The condition `#if __CUDA_ARCH__ < 320` will hold true for ROCm too! * The issue being addressed by (the build breaking commit, for CUDA) was already fixed by this commit (https://github.com/tensorflow/tensorflow/commit/307485737f46a76c97aefb51b0fc3cd264c2bb94) within a `#if TENSORFLOW_USE_ROCM` block The fix being submitted in this PR, is to undo some of the changes introduced by the earlier ROCm commit, and combine that change with the change in the breaking commit. 
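To make the redefinition concrete, here is a minimal standalone sketch (assuming, as in the failing ROCm build above, that tensorflow::int64 is an alias of long long; the sketch is not part of the patch):

  // redefinition_sketch.cc -- a C++ compiler rejects this with
  // "error: redefinition of 'GpuAtomicMaxSketch'": once the alias is
  // resolved, both definitions have exactly the same signature.
  using int64 = long long;

  inline long long GpuAtomicMaxSketch(long long* ptr, long long value) { return value; }
  inline int64 GpuAtomicMaxSketch(int64* ptr, int64 value) { return value; }

Accordingly, the diff below drops the duplicate long long overloads of GpuAtomicMax/GpuAtomicMin and widens the guard from __CUDA_ARCH__ < 320 to TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320).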
--- tensorflow/core/util/gpu_device_functions.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h index d4e09a7fc98..a8158a1ab08 100644 --- a/tensorflow/core/util/gpu_device_functions.h +++ b/tensorflow/core/util/gpu_device_functions.h @@ -789,11 +789,6 @@ __device__ inline double GpuAtomicMax(double* ptr, double value) { ptr, [value](double a) { return fmax(a, value); }); } -__device__ inline long long GpuAtomicMax(long long* ptr, long long value) { - return detail::GpuAtomicCasHelper( - ptr, [value](long long a) { return max(a, value); }); -} - #else __device__ inline float GpuAtomicMax(float* ptr, float value) { @@ -814,7 +809,7 @@ __device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr, ptr, [value](Eigen::half a) { return max(a, value); }); } -#if __CUDA_ARCH__ < 320 +#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320) __device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr, tensorflow::uint64 value) { return detail::GpuAtomicCasHelper( @@ -859,11 +854,6 @@ __device__ inline double GpuAtomicMin(double* ptr, double value) { ptr, [value](double a) { return fmin(a, value); }); } -__device__ inline long long GpuAtomicMin(long long* ptr, long long value) { - return detail::GpuAtomicCasHelper( - ptr, [value](long long a) { return min(a, value); }); -} - #else __device__ inline float GpuAtomicMin(float* ptr, float value) { @@ -884,7 +874,7 @@ __device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr, ptr, [value](Eigen::half a) { return min(a, value); }); } -#if __CUDA_ARCH__ < 320 +#if TENSORFLOW_USE_ROCM || (__CUDA_ARCH__ < 320) __device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr, tensorflow::uint64 value) { return detail::GpuAtomicCasHelper( From 49e3e63dd7d6d7cf700013000efc8c8da80662ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 08:20:28 -0700 Subject: [PATCH 0497/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/92d8ad02e92f PiperOrigin-RevId: 317105768 Change-Id: I76b13f63a85dd4b8bbcaff3fd1ca624b82411079 --- tensorflow/workspace.bzl | 4 ++-- third_party/mlir/BUILD | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 1c165ba5aba..f5b0b7537dc 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "e3836fe1a5562875396705369353078ab07cf07a" - LLVM_SHA256 = "6a78815e2c71c560a11c8c1740d31b88a607d82b7ccc61dc142bef0f1f3fbde8" + LLVM_COMMIT = "92d8ad02e92fed3884169ba5d98056fe4fa5660d" + LLVM_SHA256 = "a4995ace7ddaef0c49293dc65771f58ef1fea96ebe1f39aa0a2d6d75d07f6cc7" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index db75b27e78b..a0d10667d23 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -738,12 +738,32 @@ cc_library( ":Pass", ":SCFDialect", ":Shape", + ":ShapeToStandardPatternsIncGen", ":StandardOps", ":Support", ":Transforms", ], ) +gentbl( + name = "ShapeToStandardPatternsIncGen", + strip_include_prefix = "include/mlir/Conversion/ShapeToStandard", + tbl_outs = [ + ( + "-gen-rewriters", + "include/mlir/Conversion/ShapeToStandard/ShapeToStandardPatterns.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "lib/Conversion/ShapeToStandard/ShapeToStandardPatterns.td", + td_srcs = [ + ":StdOpsTdFiles", + "include/mlir/Dialect/Shape/IR/ShapeBase.td", + "include/mlir/Dialect/Shape/IR/ShapeOps.td", + "include/mlir/Interfaces/InferTypeOpInterface.td", + ], +) + cc_library( name = "ShapeToSCF", srcs = glob([ From 3cea671a74251d62549280fbb6444ffc2cdc4f03 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Thu, 18 Jun 2020 08:28:08 -0700 Subject: [PATCH 0498/1390] set `_batch_input_shape` before build for kpl. PiperOrigin-RevId: 317107061 Change-Id: Idd7f5dcbce9d15b2e3d708dd7ea3b2c3e5c1be7e --- .../python/keras/engine/base_preprocessing_layer.py | 2 ++ .../keras/layers/preprocessing/normalization_test.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/tensorflow/python/keras/engine/base_preprocessing_layer.py b/tensorflow/python/keras/engine/base_preprocessing_layer.py index 08df07e33e3..b2ab0880422 100644 --- a/tensorflow/python/keras/engine/base_preprocessing_layer.py +++ b/tensorflow/python/keras/engine/base_preprocessing_layer.py @@ -190,6 +190,8 @@ class CombinerPreprocessingLayer(PreprocessingLayer): shape = data_element.shape except AttributeError: shape = None + # TODO (b/159261555): move this to base layer build. + self._batch_input_shape = shape self.build(shape) # Once we have built the Layer, we can process the input data. We do so diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index f5f68d9c51a..f97b8db50ec 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -318,6 +318,18 @@ class NormalizationTest(keras_parameterized.TestCase, layer.adapt(data) self.assertAllClose(expect, layer(data)) + def test_model_summary_after_layer_adapt(self): + data = np.array([[[0., 1., 2.], [0., 2., 6.]], + [[2., 3., 4.], [3., 6., 10.]]]) + cls = get_layer_class() + layer = cls(axis=-1) + layer.adapt(data) + model = keras.Sequential( + [layer, + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(1)]) + model.summary() + if __name__ == "__main__": test.main() From 06bc84b12edce4c4ce616d0dbad5e0d5178218b7 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Thu, 18 Jun 2020 08:48:06 -0700 Subject: [PATCH 0499/1390] Create fully integer ResizeBilinear kernel. 
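The kernel added in the diff below carries scales and interpolation coordinates in 10-bit fixed point (a value v stands for v/1024). As a worked example of that arithmetic (hypothetical sizes, not taken from the patch): resizing height 2 to 4 with align_corners=false gives

  height_scale_10 = ((1 << 10) * 2 + 4 / 2) / 4 = 2050 / 4 = 512   (integer division; 512/1024 = 0.5)

and, for output row y = 1 with half_pixel_centers=true,

  input_y = 1 * 512 + 512 / 2 - (1 << 9) = 256,

so that row interpolates between input rows y0 = 0 and y1 = 1 with fractional weight 256/1024 = 0.25, which matches the 0.25 produced by the float formula (y + 0.5) * 0.5 - 0.5. The products of the two 10-bit weights accumulate at 20 fractional bits, and the result is rounded back to int8 via (output_20 + (1 << 19)) / (1 << 20).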
PiperOrigin-RevId: 317110515 Change-Id: I233b89083dda9f9e0150c97a88391a154fb32d50 --- .../internal/reference/reference_ops.h | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index e991a21e3bd..5208b21eb4d 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -1645,6 +1645,109 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, } } +inline void ComputeInterpolationValues(const int32 value, const int32 scale_10, + const bool half_pixel_centers, + int32 input_size, int32* scaled_value, + int32* lower_bound, int32* upper_bound) { + if (half_pixel_centers) { + *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9); + } else { + *scaled_value = value * scale_10; + } + *lower_bound = std::max(*scaled_value / (1 << 10), 0); + *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1); +} + +// Same as above but takes int8 as input and output. +inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, + const RuntimeShape& unextended_input_shape, + const int8_t* input_data, + const RuntimeShape& unextended_output_size_shape, + const int32* output_size_data, + const RuntimeShape& unextended_output_shape, + int8_t* output_data) { + // If half_pixel_centers is True, align_corners must be False. + TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_size_shape = + RuntimeShape::ExtendedShape(4, unextended_output_size_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32 input_height = input_shape.Dims(1); + const int32 input_width = input_shape.Dims(2); + const int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2); + const int32 output_height = + output_size_data[Offset(output_size_shape, 0, 0, 0, 0)]; + const int32 output_width = + output_size_data[Offset(output_size_shape, 0, 0, 0, 1)]; + + int32 height_scale_10 = + ((1 << 10) * input_height + output_height / 2) / output_height; + int32 width_scale_10 = + ((1 << 10) * input_width + output_width / 2) / output_width; + if (op_params.align_corners && output_height > 1) { + height_scale_10 = + ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / + (output_height - 1); + } + if (op_params.align_corners && output_width > 1) { + width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / + (output_width - 1); + } + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + int32 input_y, y0, y1; + ComputeInterpolationValues(y, height_scale_10, + op_params.half_pixel_centers, input_height, + &input_y, &y0, &y1); + for (int x = 0; x < output_width; ++x) { + int32 input_x, x0, x1; + ComputeInterpolationValues(x, width_scale_10, + op_params.half_pixel_centers, 
input_width, + &input_x, &x0, &x1); + for (int c = 0; c < depth; ++c) { + const int64_t output_20_ll = + static_cast( + input_data[Offset(input_shape, b, y0, x0, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * + ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_lu = + static_cast( + input_data[Offset(input_shape, b, y1, x0, c)]) * + (input_y - (1 << 10) * y0) * + ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_rl = + static_cast( + input_data[Offset(input_shape, b, y0, x1, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * + (input_x - (1 << 10) * x0); + const int64_t output_20_ru = + static_cast( + input_data[Offset(input_shape, b, y1, x1, c)]) * + (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0); + const int64_t output_20 = + output_20_ll + output_20_lu + output_20_rl + output_20_ru; + const int8_t interpolation = + static_cast((output_20 + (1 << 19)) / (1 << 20)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; + } + } + } + } +} + template inline void SpaceToBatchND( const SpaceToBatchParams& params, From eda7d05793ec75227069eb0c3f49e0377f33c963 Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Thu, 18 Jun 2020 23:02:21 +0700 Subject: [PATCH 0500/1390] Add NewWritableFile --- .../experimental/filesystem/plugins/gcs/BUILD | 1 + .../filesystem/plugins/gcs/gcs_filesystem.cc | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index c9fee433589..05fd371088c 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ + "//tensorflow/c:env", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 8c54bc85439..4ddc8548486 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -15,8 +15,11 @@ limitations under the License. #include #include +#include + #include "absl/strings/string_view.h" #include "google/cloud/storage/client.h" +#include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/tf_status.h" @@ -75,6 +78,25 @@ static void ParseGCSPath(absl::string_view fname, bool object_empty_ok, strcpy(*object, object_view.data()); } +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. + TempFile(const char* temp_file_name, std::ios::openmode mode) + : std::fstream(temp_file_name, mode), name(temp_file_name) {} + TempFile(TempFile&& rhs) : std::fstream(std::move(rhs)), name(rhs.name) { + rhs.name = nullptr; + } + ~TempFile() { + std::fstream::close(); + std::remove(name); + plugin_memory_free(const_cast(name)); + } + const char* getName() { return name; } + + private: + const char* name; +}; + // SECTION 1. Implementation for `TF_RandomAccessFile` // ---------------------------------------------------------------------------- namespace tf_random_access_file { @@ -86,6 +108,20 @@ namespace tf_random_access_file { // SECTION 2. 
Implementation for `TF_WritableFile` // ---------------------------------------------------------------------------- namespace tf_writable_file { +typedef struct GCSFile { + const char* bucket; + const char* object; + gcs::Client* gcs_client; // not owned + TempFile outfile; + bool sync_need; +} GCSFile; + +static void Cleanup(TF_WritableFile* file) { + auto gcs_file = static_cast(file->plugin_file); + plugin_memory_free(const_cast(gcs_file->bucket)); + plugin_memory_free(const_cast(gcs_file->object)); + delete gcs_file; +} // TODO(vnvo2409): Implement later @@ -119,6 +155,20 @@ static void Init(TF_Filesystem* filesystem, TF_Status* status) { // TODO(vnvo2409): Implement later +static void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + char* bucket; + char* object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_client = static_cast(filesystem->plugin_filesystem); + TempFile outfile(TF_GetTempFileName(""), std::ios::binary | std::ios::out); + file->plugin_file = new tf_writable_file::GCSFile( + {bucket, object, gcs_client, std::move(outfile), true}); + TF_SetStatus(status, TF_OK, ""); +} + } // namespace tf_gcs_filesystem static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, From 2e3310031296f9232f7a58a5cfca2ee03b9a7c91 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Thu, 18 Jun 2020 09:26:13 -0700 Subject: [PATCH 0501/1390] Add quantization test for transpose. PiperOrigin-RevId: 317117545 Change-Id: I81c4f9583f29205bcbdaae175eac59439cd19047 --- tensorflow/lite/tools/optimize/BUILD | 1 + .../tools/optimize/quantize_model_test.cc | 44 ++++++++++++++++++ tensorflow/lite/tools/optimize/test_util.cc | 2 + tensorflow/lite/tools/optimize/test_util.h | 3 ++ .../tools/optimize/testdata/transpose.bin | Bin 0 -> 544 bytes 5 files changed, 50 insertions(+) create mode 100644 tensorflow/lite/tools/optimize/testdata/transpose.bin diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD index 3011c01cdeb..c10d4465e5c 100644 --- a/tensorflow/lite/tools/optimize/BUILD +++ b/tensorflow/lite/tools/optimize/BUILD @@ -296,6 +296,7 @@ tf_cc_test( "//tensorflow/lite/tools/optimize:testdata/split.bin", "//tensorflow/lite/tools/optimize:testdata/svdf_calibrated.bin", "//tensorflow/lite/tools/optimize:testdata/svdf_quantized.bin", + "//tensorflow/lite/tools/optimize:testdata/transpose.bin", "//tensorflow/lite/tools/optimize:testdata/unpack.bin", ], tags = [ diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index f8f1a9d4113..36b35af0065 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -1454,6 +1454,50 @@ TEST_F(QuantizeUnpackTest, VerifyUnpack) { unpack_output_1->quantization->zero_point[0]); } +class QuantizeTransposeTest : public QuantizeModelTest { + protected: + QuantizeTransposeTest() { + input_model_ = ReadModel(internal::kModelWithTranspose); + readonly_model_ = input_model_->GetModel(); + readonly_model_->UnPackTo(&model_); + } +}; + +TEST_F(QuantizeTransposeTest, VerifyTranspose) { + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + + ASSERT_EQ(kTfLiteOk, status); + + const auto subgraph = model_.subgraphs[0].get(); + auto op = subgraph->operators[1].get(); + + auto float_graph = readonly_model_->subgraphs()->Get(0); + + 
ASSERT_EQ(model_.operator_codes[op->opcode_index].get()->builtin_code, + BuiltinOperator_TRANSPOSE); + + // The model should only have one input and one outputs. + EXPECT_EQ(subgraph->inputs.size(), 1); + EXPECT_EQ(subgraph->outputs.size(), 1); + + // Get transpose input and output tensors + auto transpose_input = subgraph->tensors[op->inputs[0]].get(); + auto transpose_output = subgraph->tensors[op->outputs[0]].get(); + + // Verify transpose input is quantized. + ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(), + TensorType_FLOAT32); + EXPECT_EQ(transpose_input->type, TensorType_INT8); + + // Ensure quantization parameters before and after transpose + // are preserved after quantization for all outputs of + // transpose. + EXPECT_FLOAT_EQ(transpose_input->quantization->scale[0], + transpose_output->quantization->scale[0]); + EXPECT_EQ(transpose_input->quantization->zero_point[0], + transpose_output->quantization->zero_point[0]); +} + } // namespace } // namespace optimize } // namespace tflite diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc index 7d5e9d65f06..61e82ed3e34 100644 --- a/tensorflow/lite/tools/optimize/test_util.cc +++ b/tensorflow/lite/tools/optimize/test_util.cc @@ -61,6 +61,8 @@ const char* kModelWithMaximumOp = "maximum.bin"; const char* kLstmCalibrated2 = "lstm_calibrated2.bin"; const char* kLstmQuantized2 = "lstm_quantized2.bin"; +const char* kModelWithTranspose = "transpose.bin"; + const char* kSvdfCalibrated = "svdf_calibrated.bin"; const char* kSvdfQuantized = "svdf_quantized.bin"; diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h index abcdbc21d36..4d2eadf283f 100644 --- a/tensorflow/lite/tools/optimize/test_util.h +++ b/tensorflow/lite/tools/optimize/test_util.h @@ -98,6 +98,9 @@ extern const char* kModelWithMaximumOp; extern const char* kLstmCalibrated2; extern const char* kLstmQuantized2; +// Test model with a transpose op. +extern const char* kModelWithTranspose; + // Test model with SVDF op. 
extern const char* kSvdfCalibrated; extern const char* kSvdfQuantized; diff --git a/tensorflow/lite/tools/optimize/testdata/transpose.bin b/tensorflow/lite/tools/optimize/testdata/transpose.bin new file mode 100644 index 0000000000000000000000000000000000000000..a76886e5b473b8de04253089e2acc931f5b6ec86 GIT binary patch literal 544 zcmb1OU|TQp3Q&zyifC3=9kw z3=9k)y&PaMHn1E60~-Se0}BHiSPY_`fuX^k0SWy1|NlP=0|+uO@Po|tclK9s&d)1L zEh$Xt+K1_lQQ z2+hC%3XA;Gl7i9_{YsE|AiVAW|No${0;z$Su>!-418_4yVFxh-68<24APkBhP>6tR z0mTp~ejFGW7(i~;0f!ev1{4w?cc8ejp~eMbhQdKs1_p-Ayn@masJ}qsFboPWNF1_* K Date: Thu, 18 Jun 2020 23:46:41 +0700 Subject: [PATCH 0502/1390] Move TempFile to gcs_helper --- .../experimental/filesystem/plugins/gcs/BUILD | 12 ++++++- .../filesystem/plugins/gcs/gcs_filesystem.cc | 20 +---------- .../filesystem/plugins/gcs/gcs_helper.cc | 19 +++++++++++ .../filesystem/plugins/gcs/gcs_helper.h | 33 +++++++++++++++++++ 4 files changed, 64 insertions(+), 20 deletions(-) create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index 05fd371088c..d104181b264 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -24,10 +24,20 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ - "//tensorflow/c:env", + ":gcs_helper", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", "@com_google_absl//absl/strings", ], ) + +cc_library( + name = "gcs_helper", + srcs = ["gcs_helper.cc"], + hdrs = ["gcs_helper.h"], + linkstatic = 1, + deps = [ + "//tensorflow/c:env", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 4ddc8548486..2793194e0a8 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -21,6 +21,7 @@ limitations under the License. #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for GCS environments. @@ -78,25 +79,6 @@ static void ParseGCSPath(absl::string_view fname, bool object_empty_ok, strcpy(*object, object_view.data()); } -class TempFile : public std::fstream { - public: - // We should specify openmode each time we call TempFile. - TempFile(const char* temp_file_name, std::ios::openmode mode) - : std::fstream(temp_file_name, mode), name(temp_file_name) {} - TempFile(TempFile&& rhs) : std::fstream(std::move(rhs)), name(rhs.name) { - rhs.name = nullptr; - } - ~TempFile() { - std::fstream::close(); - std::remove(name); - plugin_memory_free(const_cast(name)); - } - const char* getName() { return name; } - - private: - const char* name; -}; - // SECTION 1. 
Implementation for `TF_RandomAccessFile` // ---------------------------------------------------------------------------- namespace tf_random_access_file { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc new file mode 100644 index 00000000000..139579c53ae --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc @@ -0,0 +1,19 @@ +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" + +#include + +#include +#include + +TempFile::TempFile(const char* temp_file_name, std::ios::openmode mode) + : std::fstream(temp_file_name, mode), name(temp_file_name) {} + +TempFile::TempFile(TempFile&& rhs) + : std::fstream(std::move(rhs)), name(std::move(rhs.name)) {} + +TempFile::~TempFile() { + std::fstream::close(); + std::remove(name.c_str()); +} + +const std::string TempFile::getName() const { return name; } \ No newline at end of file diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h new file mode 100644 index 00000000000..437cbe560d6 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ + +#include +#include + +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. + TempFile(const char* temp_file_name, std::ios::openmode mode); + TempFile(TempFile&& rhs); + ~TempFile(); + const std::string getName() const; + + private: + const std::string name; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ From 18f54c42c62191b60edeff7e1308ed7b3305b0eb Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Thu, 18 Jun 2020 09:45:59 -0700 Subject: [PATCH 0503/1390] Disable special_math_ops_test on asan build. PiperOrigin-RevId: 317121277 Change-Id: I902f6c046c2766638b5dbbb90a0ec2942a3c536f --- tensorflow/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index f53859b2915..a4e72bf2460 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5277,6 +5277,7 @@ cuda_py_test( shard_count = 10, tags = [ "no_windows_gpu", + "noasan", # b/159332048 "nomsan", # b/148630708 ], deps = [ From 89b80c5fb98ba57175626a9df490dbdd2bc8776b Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Thu, 18 Jun 2020 09:59:14 -0700 Subject: [PATCH 0504/1390] Add banded triangular solve op. 
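
The new op solves `A x = rhs` where the triangular matrix `A` is given in
band storage: a `[..., K, M]` tensor whose `K` rows hold the diagonals of a
batch of `M` x `M` triangular matrices, with 'LEFT_RIGHT' alignment as in
`tf.linalg.set_diag`. A minimal usage sketch of the Python wrapper added
here (the values are illustrative only):

    import tensorflow as tf

    # Band storage of the lower triangular matrix
    #   [[2., 0., 0.],
    #    [1., 3., 0.],
    #    [0., 4., 5.]]
    # Row 0 is the main diagonal; row 1 is the subdiagonal, padded on the
    # left, so its first entry is ignored.
    bands = tf.constant([[2., 3., 5.],
                         [0., 1., 4.]])
    rhs = tf.ones([3, 1])
    x = tf.linalg.banded_triangular_solve(bands, rhs, lower=True)
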
PiperOrigin-RevId: 317124054 Change-Id: I54f090d7583b21fa18788a2deb02262d9c8231be --- .../api_def_BandedTriangularSolve.pbtxt | 4 + .../api_def_BandedTriangularSolve.pbtxt | 4 + tensorflow/core/kernels/BUILD | 27 ++ .../kernels/banded_triangular_solve_op.cc | 293 ++++++++++++++++++ .../banded_triangular_solve_op_test.cc | 180 +++++++++++ tensorflow/core/ops/linalg_ops.cc | 54 ++++ tensorflow/python/kernel_tests/BUILD | 11 + .../banded_triangular_solve_op_test.py | 232 ++++++++++++++ .../python/kernel_tests/linalg_grad_test.py | 52 ++++ tensorflow/python/ops/linalg/linalg_impl.py | 96 ++++++ tensorflow/python/ops/linalg_grad.py | 33 ++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.linalg.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 14 files changed, 998 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_BandedTriangularSolve.pbtxt create mode 100644 tensorflow/core/kernels/banded_triangular_solve_op.cc create mode 100644 tensorflow/core/kernels/banded_triangular_solve_op_test.cc create mode 100644 tensorflow/python/kernel_tests/banded_triangular_solve_op_test.py diff --git a/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt new file mode 100644 index 00000000000..ba5e1bdcaf2 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "BandedTriangularSolve" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/python_api/api_def_BandedTriangularSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_BandedTriangularSolve.pbtxt new file mode 100644 index 00000000000..ba5e1bdcaf2 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_BandedTriangularSolve.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "BandedTriangularSolve" + visibility: HIDDEN +} diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index b2b54adbcf9..b4730dad96c 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3577,6 +3577,7 @@ tf_cc_tests( cc_library( name = "linalg", deps = [ + ":banded_triangular_solve_op", ":cholesky_grad", ":cholesky_op", ":determinant_op", @@ -3750,6 +3751,12 @@ tf_kernel_library( deps = LINALG_DEPS, ) +tf_kernel_library( + name = "banded_triangular_solve_op", + prefix = "banded_triangular_solve_op", + deps = LINALG_DEPS + [":fill_functor"], +) + tf_kernel_library( name = "matrix_triangular_solve_op", hdrs = ["matrix_triangular_solve_op_impl.h"], @@ -4425,6 +4432,26 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "banded_triangular_solve_op_test", + size = "small", + srcs = ["banded_triangular_solve_op_test.cc"], + deps = [ + ":banded_triangular_solve_op", + ":matrix_set_diag_op", + ":matrix_triangular_solve_op", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_cuda_cc_test( name = "matrix_triangular_solve_op_test", size = "small", diff --git a/tensorflow/core/kernels/banded_triangular_solve_op.cc b/tensorflow/core/kernels/banded_triangular_solve_op.cc new file mode 100644 index 00000000000..d01a015502a --- /dev/null +++ b/tensorflow/core/kernels/banded_triangular_solve_op.cc @@ -0,0 +1,293 @@ +/* 
Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/linalg_ops.cc. + +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +Scalar eigen_conj(const Scalar& scalar) { + return Eigen::numext::conj(scalar); +} + +// Sequential batch matrix triangular solve kernel that calls Eigen's +// matrix triangular solve. +template +struct SequentialBandedTriangularSolveKernel { + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + using RealScalar = typename Eigen::NumTraits::Real; + + static ConstMatrixMap ConstTensorSliceToEigenMatrix(const Tensor& t, + int slice) { + return ConstMatrixMap( + t.flat().data() + slice * t.dim_size(1) * t.dim_size(2), + t.dim_size(1), t.dim_size(2)); + } + + static MatrixMap TensorSliceToEigenMatrix(Tensor* t, int slice) { + return MatrixMap( + t->flat().data() + slice * t->dim_size(1) * t->dim_size(2), + t->dim_size(1), t->dim_size(2)); + } + + static void Run(const Tensor& in_x, const Tensor& in_y, bool lower, + bool adjoint, const MatMulBCast& bcast, Tensor* out, + int start, int limit) { + const bool should_bcast = bcast.IsBroadcastingRequired(); + const auto& x_batch_indices = bcast.x_batch_indices(); + const auto& y_batch_indices = bcast.y_batch_indices(); + int num_bands = in_x.dim_size(1); + int matrix_size = in_x.dim_size(2); + + for (int64 i = start; i < limit; ++i) { + const int64 x_batch_index = should_bcast ? x_batch_indices[i] : i; + const int64 y_batch_index = should_bcast ? y_batch_indices[i] : i; + auto matrix = ConstTensorSliceToEigenMatrix(in_x, x_batch_index); + auto rhs = ConstTensorSliceToEigenMatrix(in_y, y_batch_index); + auto output = TensorSliceToEigenMatrix(out, i); + // Below, we use the standard algorithm for computing a triangular solve, + // except we band limit it. + // Given A x = b, where A is lower triangular, + // x_i = (b_i - sum a_ij * x_j) / a_ii, where the sum is from + // j = 0 to i - 1. + // + // Now, in a banded triangular matrix, when i exceeds the band size, + // then the sum goes from j = i - band_size to i - 1, since the other + // elements are zero. + // + // Finally, given the band storage format, we'll need to change the + // indexing. 
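+      // Concretely (illustrative values only): with num_bands = 2, the
+      // 3 x 3 lower triangular matrix
+      //   [[2, 0, 0],
+      //    [1, 3, 0],
+      //    [0, 4, 5]]
+      // is stored as the bands
+      //   [[2, 3, 5],
+      //    [_, 1, 4]]  (the leading subdiagonal entry is padding),
+      // so matrix(0, i) holds the diagonal entry A(i, i) and, in general,
+      // matrix(j, i) holds A(i, i - j).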
+ if (lower) { + if (!adjoint) { + output.row(0) = rhs.row(0) / matrix(0, 0); + for (int i = 1; i < matrix_size; ++i) { + if (i < num_bands) { + output.row(i).noalias() = + (rhs.row(i) - matrix.block(1, i, i, 1).reverse().transpose() * + output.topRows(i)) / + matrix(0, i); + } else { + output.row(i).noalias() = + (rhs.row(i) - + matrix.block(1, i, num_bands - 1, 1).reverse().transpose() * + output.middleRows(i - (num_bands - 1), num_bands - 1)) / + matrix(0, i); + } + } + } else { + // In the adjoint case, here and below, we now have an upper (lower) + // triangular matrix, and thus need to work through with the other + // case. We can't simply conjugate `matrix` and use the upper (lower) + // algorithm because the band storage format for upper and lower + // triangular matrices are different (in the lower case, we pad + // entries on the left, and in the upper case we pad entries on the + // right. + output.row(matrix_size - 1) = + rhs.row(matrix_size - 1) / eigen_conj(matrix(0, matrix_size - 1)); + for (int i = matrix_size - 1; i >= 0; --i) { + output.row(i).noalias() = rhs.row(i); + for (int j = i + 1; j < std::min(matrix_size, i + num_bands); ++j) { + output.row(i).noalias() -= + eigen_conj(matrix(j - i, j)) * output.row(j); + } + output.row(i) /= eigen_conj(matrix(0, i)); + } + } + } else { + if (!adjoint) { + output.row(matrix_size - 1) = + rhs.row(matrix_size - 1) / matrix(num_bands - 1, matrix_size - 1); + for (int i = 1; i < matrix_size; ++i) { + int k = matrix_size - 1 - i; + if (i < num_bands) { + output.row(k).noalias() = + (rhs.row(k) - matrix.block(num_bands - 1 - i, k, i, 1) + .reverse() + .transpose() * + output.bottomRows(i)) / + matrix(num_bands - 1, k); + } else { + output.row(k).noalias() = + (rhs.row(k) - + matrix.block(0, k, num_bands - 1, 1).reverse().transpose() * + output.middleRows(k + 1, num_bands - 1)) / + matrix(num_bands - 1, k); + } + } + } else { + output.row(0) = rhs.row(0) / eigen_conj(matrix(num_bands - 1, 0)); + for (int i = 1; i < matrix_size; ++i) { + output.row(i).noalias() = rhs.row(i); + for (int j = std::max(0, i - (num_bands - 1)); j < i; ++j) { + output.row(i).noalias() -= + eigen_conj(matrix(num_bands - 1 - (i - j), j)) * + output.row(j); + } + output.row(i) /= eigen_conj(matrix(num_bands - 1, i)); + } + } + } + } + } +}; + +template +struct LaunchBatchBandedTriangularSolve; + +template +struct LaunchBatchBandedTriangularSolve { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adjoint, bool lower, + const MatMulBCast& bcast, Tensor* out) { + // Number of banded matrix triangular solves i.e. size of the batch. + const int64 batch_size = bcast.output_batch_size(); + const int64 cost_per_unit = + in_x.dim_size(1) * in_x.dim_size(2) * in_y.dim_size(2); + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using RealScalar = typename Eigen::NumTraits::Real; + // Check diagonal before doing any solves. This is the first row in the + // lower case and else is the last row. 
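+    // In band storage that row is the full main diagonal of the triangular
+    // matrix, so a zero anywhere in it means the matrix is singular and the
+    // solve would divide by zero.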
+ auto matrix = ConstMatrixMap(in_x.flat().data(), in_x.dim_size(1), + in_x.dim_size(2)); + RealScalar min_abs_pivot; + if (lower) { + min_abs_pivot = matrix.row(0).cwiseAbs().minCoeff(); + } else { + min_abs_pivot = matrix.row(in_x.dim_size(1) - 1).cwiseAbs().minCoeff(); + } + OP_REQUIRES(context, min_abs_pivot > RealScalar(0), + errors::InvalidArgument("Input matrix is not invertible.")); + + Shard(worker_threads.num_threads, worker_threads.workers, batch_size, + cost_per_unit, + [&in_x, &in_y, adjoint, lower, &bcast, out](int start, int limit) { + SequentialBandedTriangularSolveKernel::Run( + in_x, in_y, lower, adjoint, bcast, out, start, limit); + }); + } +}; + +template +class BandedTriangularSolveOpCpu : public OpKernel { + public: + explicit BandedTriangularSolveOpCpu(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_)); + OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_)); + } + + ~BandedTriangularSolveOpCpu() override {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + + ValidateInputTensors(ctx, in0, in1); + + MatMulBCast bcast(in0.shape().dim_sizes(), in1.shape().dim_sizes()); + OP_REQUIRES( + ctx, bcast.IsValid(), + errors::InvalidArgument( + "In[0] and In[1] must have compatible batch dimensions: ", + in0.shape().DebugString(), " vs. ", in1.shape().DebugString())); + + TensorShape out_shape = bcast.output_batch_shape(); + auto batch_size = bcast.output_batch_size(); + auto d0 = in0.dim_size(in0.dims() - 2); // Band size. + auto d1 = in0.dim_size(in0.dims() - 1); + Tensor in0_reshaped; + OP_REQUIRES( + ctx, + in0_reshaped.CopyFrom(in0, TensorShape({bcast.x_batch_size(), d0, d1})), + errors::Internal("Failed to reshape In[0] from ", + in0.shape().DebugString())); + auto d2 = in1.dim_size(in1.dims() - 2); + auto d3 = in1.dim_size(in1.dims() - 1); + Tensor in1_reshaped; + OP_REQUIRES( + ctx, + in1_reshaped.CopyFrom(in1, TensorShape({bcast.y_batch_size(), d2, d3})), + errors::Internal("Failed to reshape In[1] from ", + in1.shape().DebugString())); + OP_REQUIRES(ctx, d1 == d2, + errors::InvalidArgument( + "In[0] mismatch In[1] shape: ", d1, " vs. 
", d2, ": ", + in0.shape().DebugString(), " ", in1.shape().DebugString(), + " ", lower_, " ", adjoint_)); + out_shape.AddDim(d1); + out_shape.AddDim(d3); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + if (out->NumElements() == 0) { + return; + } + Tensor out_reshaped; + OP_REQUIRES(ctx, + out_reshaped.CopyFrom(*out, TensorShape({batch_size, d1, d3})), + errors::Internal("Failed to reshape output from ", + out->shape().DebugString())); + LaunchBatchBandedTriangularSolve::Launch( + ctx, in0_reshaped, in1_reshaped, adjoint_, lower_, bcast, + &out_reshaped); + } + + private: + void ValidateInputTensors(OpKernelContext* ctx, const Tensor& in0, + const Tensor& in1) { + OP_REQUIRES( + ctx, in0.dims() >= 2, + errors::InvalidArgument("In[0] ndims must be >= 2: ", in0.dims())); + + OP_REQUIRES( + ctx, in1.dims() >= 2, + errors::InvalidArgument("In[1] ndims must be >= 2: ", in1.dims())); + } + bool lower_; + bool adjoint_; +}; + +#define REGISTER_BANDED_TRIANGULAR_SOLVE_CPU(TYPE) \ + REGISTER_KERNEL_BUILDER(Name("BandedTriangularSolve") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + BandedTriangularSolveOpCpu); + +REGISTER_BANDED_TRIANGULAR_SOLVE_CPU(float); +REGISTER_BANDED_TRIANGULAR_SOLVE_CPU(double); +REGISTER_BANDED_TRIANGULAR_SOLVE_CPU(complex64); +REGISTER_BANDED_TRIANGULAR_SOLVE_CPU(complex128); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/banded_triangular_solve_op_test.cc b/tensorflow/core/kernels/banded_triangular_solve_op_test.cc new file mode 100644 index 00000000000..37e904a3e0e --- /dev/null +++ b/tensorflow/core/kernels/banded_triangular_solve_op_test.cc @@ -0,0 +1,180 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/matrix_set_diag_op.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +Node* SetDiag(int num_bands, Graph* g, Node* bands, Node* triangular) { + Node* ret; + Tensor bandwidth(DT_INT32, TensorShape({2})); + bandwidth.flat()(0) = -(num_bands - 1); + bandwidth.flat()(1) = 0; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "MatrixSetDiagV3") + .Input(triangular) + .Input(bands) + .Input(test::graph::Constant(g, bandwidth)) + .Attr("align", "RIGHT_LEFT") + .Finalize(g, &ret)); + return ret; +} + +Node* BandedTriangularSolve(Graph* g, Node* in0, Node* in1) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BandedTriangularSolve") + .Input(in0) + .Input(in1) + .Attr("lower", true) + .Attr("adjoint", false) + .Finalize(g, &ret)); + return ret; +} + +Node* MatrixTriangularSolve(Graph* g, Node* in0, Node* in1) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "MatrixTriangularSolve") + .Input(in0) + .Input(in1) + .Attr("lower", true) + .Attr("adjoint", false) + .Finalize(g, &ret)); + return ret; +} + +template +static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m, + bool use_banded_solver, DataType type) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor in0(type, TensorShape({num_bands, n})); + // Set diagonal to nonzero to guarantee invertibility. + in0.flat().setRandom(); + in0.flat() = + in0.flat().abs() + in0.flat().constant(static_cast(0.5)); + Tensor in1(type, TensorShape({n, m})); + in1.flat().setRandom(); + if (use_banded_solver) { + BandedTriangularSolve(g, test::graph::Constant(g, in0), + test::graph::Constant(g, in1)); + } else { + // Create a zero tensor. + Tensor in2(type, TensorShape({n, n})); + in2.flat().setZero(); + Node* triangular_matrix = + SetDiag(num_bands, g, test::graph::Constant(g, in0), + test::graph::Constant(g, in2)); + MatrixTriangularSolve(g, triangular_matrix, test::graph::Constant(g, in1)); + } + return g; +} + +// Macro arguments names: --------------------------------------------------- // +// K: Number of bands +// N: Inner dimension of LHS, Inner dimension of RHS. +// M: Outer dimensions of RHS +// BS: boolean indicating whether to use the banded solver +// T: C++ type of scalars (e.g. float, std::complex) +// TT: TensorFlow type of scalars (e.g. 
DT_FLOAT, DT_COMPLEX128 +#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \ + static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \ + int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast(iters) * K * N + N * M); \ + test::Benchmark(#D, BandedTriangularSolve(K, N, M, BS, TT)).Run(iters); \ + } \ + BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT); + +#define BM_BandedTriangularSolve(K, N, M, BS, D) \ + BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \ + BM_BandedTriangularSolveDev(K, N, M, BS, double, DT_DOUBLE, D); + +// Small number of bands, few rhs +BM_BandedTriangularSolve(2, 32, 1, true, cpu); +BM_BandedTriangularSolve(2, 32, 1, false, cpu); +BM_BandedTriangularSolve(4, 32, 1, true, cpu); +BM_BandedTriangularSolve(4, 32, 1, false, cpu); +BM_BandedTriangularSolve(8, 32, 1, true, cpu); +BM_BandedTriangularSolve(8, 32, 1, false, cpu); +BM_BandedTriangularSolve(16, 32, 1, true, cpu); +BM_BandedTriangularSolve(16, 32, 1, false, cpu); +BM_BandedTriangularSolve(2, 128, 1, true, cpu); +BM_BandedTriangularSolve(2, 128, 1, false, cpu); +BM_BandedTriangularSolve(4, 128, 1, true, cpu); +BM_BandedTriangularSolve(4, 128, 1, false, cpu); +BM_BandedTriangularSolve(8, 128, 1, true, cpu); +BM_BandedTriangularSolve(8, 128, 1, false, cpu); +BM_BandedTriangularSolve(16, 128, 1, true, cpu); +BM_BandedTriangularSolve(16, 128, 1, false, cpu); +BM_BandedTriangularSolve(2, 512, 1, true, cpu); +BM_BandedTriangularSolve(2, 512, 1, false, cpu); +BM_BandedTriangularSolve(4, 512, 1, true, cpu); +BM_BandedTriangularSolve(4, 512, 1, false, cpu); +BM_BandedTriangularSolve(8, 512, 1, true, cpu); +BM_BandedTriangularSolve(8, 512, 1, false, cpu); +BM_BandedTriangularSolve(16, 512, 1, true, cpu); +BM_BandedTriangularSolve(16, 512, 1, false, cpu); + +// Larger # rhs +BM_BandedTriangularSolve(2, 32, 32, true, cpu); +BM_BandedTriangularSolve(2, 32, 32, false, cpu); +BM_BandedTriangularSolve(4, 32, 32, true, cpu); +BM_BandedTriangularSolve(4, 32, 32, false, cpu); +BM_BandedTriangularSolve(8, 32, 32, true, cpu); +BM_BandedTriangularSolve(8, 32, 32, false, cpu); +BM_BandedTriangularSolve(16, 32, 32, true, cpu); +BM_BandedTriangularSolve(16, 32, 32, false, cpu); +BM_BandedTriangularSolve(2, 128, 128, true, cpu); +BM_BandedTriangularSolve(2, 128, 128, false, cpu); +BM_BandedTriangularSolve(4, 128, 128, true, cpu); +BM_BandedTriangularSolve(4, 128, 128, false, cpu); +BM_BandedTriangularSolve(8, 128, 128, true, cpu); +BM_BandedTriangularSolve(8, 128, 128, false, cpu); +BM_BandedTriangularSolve(16, 128, 128, true, cpu); +BM_BandedTriangularSolve(16, 128, 128, false, cpu); +BM_BandedTriangularSolve(2, 512, 512, true, cpu); +BM_BandedTriangularSolve(2, 512, 512, false, cpu); +BM_BandedTriangularSolve(4, 512, 512, true, cpu); +BM_BandedTriangularSolve(4, 512, 512, false, cpu); +BM_BandedTriangularSolve(8, 512, 512, true, cpu); +BM_BandedTriangularSolve(8, 512, 512, false, cpu); +BM_BandedTriangularSolve(16, 512, 512, true, cpu); +BM_BandedTriangularSolve(16, 512, 512, false, cpu); + +BM_BandedTriangularSolve(2, 2048, 2048, true, cpu); +BM_BandedTriangularSolve(2, 2048, 2048, false, cpu); +BM_BandedTriangularSolve(4, 2048, 2048, true, cpu); +BM_BandedTriangularSolve(4, 2048, 2048, false, cpu); +BM_BandedTriangularSolve(8, 2048, 2048, true, cpu); +BM_BandedTriangularSolve(8, 2048, 2048, false, cpu); +BM_BandedTriangularSolve(16, 2048, 2048, true, cpu); +BM_BandedTriangularSolve(16, 2048, 2048, false, cpu); +BM_BandedTriangularSolve(32, 2048, 2048, 
true, cpu); +BM_BandedTriangularSolve(32, 2048, 2048, false, cpu); +BM_BandedTriangularSolve(64, 2048, 2048, true, cpu); +BM_BandedTriangularSolve(64, 2048, 2048, false, cpu); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc index 75340b28eb0..a05231834b7 100644 --- a/tensorflow/core/ops/linalg_ops.cc +++ b/tensorflow/core/ops/linalg_ops.cc @@ -47,6 +47,49 @@ Status BatchUnchangedSquareShapeFn(InferenceContext* c) { return Status::OK(); } +// The first input is [...,K,M] and second input is [...,M,N]. +Status BandedTriangularSolveShapeFn(InferenceContext* c) { + ShapeHandle lhs; + ShapeHandle rhs; + + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &lhs)); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 2, &rhs)); + + // Check K > 0. + DimensionHandle num_bands = c->Dim(lhs, -2); + DimensionHandle m = c->Dim(lhs, -1); + if (c->ValueKnown(num_bands) && c->Value(num_bands) <= 0) { + return errors::InvalidArgument("Number of bands must be positive, but is ", + c->Value(num_bands)); + } + if (c->ValueKnown(num_bands) && c->ValueKnown(m) && + c->Value(num_bands) > c->Value(m)) { + return errors::InvalidArgument("Number of bands ", c->Value(num_bands), + " cannot exceed the size of the matrix ", + c->Value(m)); + } + + ShapeHandle lhs_batch_shape; + ShapeHandle rhs_batch_shape; + ShapeHandle output_batch_shape; + // Make the common batch subshape. + TF_RETURN_IF_ERROR(c->Subshape(lhs, 0, -2, &lhs_batch_shape)); + TF_RETURN_IF_ERROR(c->Subshape(rhs, 0, -2, &rhs_batch_shape)); + TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper( + c, lhs_batch_shape, rhs_batch_shape, true, &output_batch_shape)); + + // lhs and rhs have the same value for M to be compatible. + TF_RETURN_IF_ERROR(c->Merge(m, c->Dim(rhs, -2), &m)); + + // Build final shape (batch_shape + m + n) in . + ShapeHandle out; + TF_RETURN_IF_ERROR( + c->Concatenate(output_batch_shape, c->Matrix(m, c->Dim(rhs, -1)), &out)); + + c->set_output(0, out); + return Status::OK(); +} + // The first input is [...,M,N] and second input is either [...,M,K] or [...,M]. // Output is [...,N,K] or [...,N]. If , then input is [...,M,M]. 
Status MatrixSolveShapeFn(InferenceContext* c, bool square) { @@ -446,6 +489,17 @@ REGISTER_OP("MatrixSolve") return MatrixSolveShapeFn(c, true /* square (*/); }); +REGISTER_OP("BandedTriangularSolve") + .Input("matrix: T") + .Input("rhs: T") + .Output("output: T") + .Attr("lower: bool = True") + .Attr("adjoint: bool = False") + .Attr("T: {double, float, half, complex64, complex128}") + .SetShapeFn([](InferenceContext* c) { + return BandedTriangularSolveShapeFn(c); + }); + REGISTER_OP("MatrixTriangularSolve") .Input("matrix: T") .Input("rhs: T") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 846c582737f..f2c614974f5 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -762,6 +762,17 @@ cuda_py_test( ], ) +cuda_py_test( + name = "banded_triangular_solve_op_test", + size = "small", + srcs = ["banded_triangular_solve_op_test.py"], + deps = [ + "//tensorflow/python:client_testlib", + "//tensorflow/python:linalg_ops", + "//third_party/py/numpy", + ], +) + cuda_py_test( name = "matrix_triangular_solve_op_test", size = "medium", diff --git a/tensorflow/python/kernel_tests/banded_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/banded_triangular_solve_op_test.py new file mode 100644 index 00000000000..bd0fdae03c5 --- /dev/null +++ b/tensorflow/python/kernel_tests/banded_triangular_solve_op_test.py @@ -0,0 +1,232 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.math_ops.banded_triangular_solve.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.platform import test + + +class BandedTriangularSolveOpTest(test.TestCase): + + def _verifySolveAllWays(self, x, y, dtypes, batch_dims=None): + for lower in (False,): + for adjoint in (False, True): + for use_placeholder in True, False: + self._verifySolve( + x, + y, + lower=lower, + adjoint=adjoint, + batch_dims=batch_dims, + use_placeholder=use_placeholder, + dtypes=dtypes) + + def _verifySolveAllWaysReal(self, x, y, batch_dims=None): + self._verifySolveAllWays(x, y, (np.float32, np.float64), batch_dims) + + def _verifySolveAllWaysComplex(self, x, y, batch_dims=None): + self._verifySolveAllWays(x, y, (np.complex64, np.complex128), batch_dims) + + def _verifySolve(self, + x, + y, + lower=True, + adjoint=False, + batch_dims=None, + use_placeholder=False, + dtypes=(np.float32, np.float64)): + for np_type in dtypes: + a = x.astype(np_type) + b = y.astype(np_type) + + # Now we need to convert a to a dense triangular matrix. 
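+      # make_diags below expands the [K, M] band storage into a dense M x M
+      # triangular matrix so that np.linalg.solve can produce the reference
+      # answer to compare against.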
+ def make_diags(diags, lower=True): + n = len(diags[0]) + a = np.zeros(n * n, dtype=diags.dtype) + if lower: + for i, diag in enumerate(diags): + a[n * i:n * n:n + 1] = diag[i:] + else: + diags_flip = np.flip(diags, 0) + for i, diag in enumerate(diags_flip): + a[i:(n - i) * n:n + 1] = diag[:(n - i)] + return a.reshape(n, n) + + # For numpy.solve we have to explicitly zero out the strictly + # upper or lower triangle. + if a.size > 0: + a_np = make_diags(a, lower=lower) + else: + a_np = a + if adjoint: + a_np = np.conj(np.transpose(a_np)) + + if batch_dims is not None: + a = np.tile(a, batch_dims + [1, 1]) + a_np = np.tile(a_np, batch_dims + [1, 1]) + b = np.tile(b, batch_dims + [1, 1]) + + with self.cached_session(use_gpu=True): + a_tf = a + b_tf = b + if use_placeholder: + a_tf = array_ops.placeholder_with_default(a_tf, shape=None) + b_tf = array_ops.placeholder_with_default(b_tf, shape=None) + tf_ans = linalg_ops.banded_triangular_solve( + a_tf, b_tf, lower=lower, adjoint=adjoint) + tf_val = self.evaluate(tf_ans) + np_ans = np.linalg.solve(a_np, b) + self.assertEqual(np_ans.shape, tf_val.shape) + self.assertAllClose(np_ans, tf_val) + + @test_util.run_deprecated_v1 + def testSolve(self): + # 1x1 matrix, single rhs. + matrix = np.array([[0.1]]) + rhs0 = np.array([[1.]]) + self._verifySolveAllWaysReal(matrix, rhs0) + # 2x2 matrix with 2 bands, single right-hand side. + # Corresponds to the lower triangular + # [[1., 0.], [3., 4.]] + # and upper triangular + # [[2., 1.], [0., 3.]] + matrix = np.array([[1., 4.], [2., 3.]]) + rhs0 = np.array([[1.], [1.]]) + self._verifySolveAllWaysReal(matrix, rhs0) + # 2x2 matrix with 2 bands, 3 right-hand sides. + rhs1 = np.array([[1., 0., 1.], [0., 1., 1.]]) + self._verifySolveAllWaysReal(matrix, rhs1) + # 4 x 4 matrix with 2 bands, 3 right hand sides. + # Corresponds to the lower triangular + # [[1., 0., 0., 0.], + # [-1., 2., 0., 0.], + # [0., -2., 3., 0.], + # [0., 0., -3., 4.]] + # and upper triangular + # [[1., 1., 0., 0.], + # [0., -1., 2., 0.], + # [0., 0., -2., 3.], + # [0., 0., 0., -3.]] + matrix = np.array([[1., 2., 3., 4.], [1., -1., -2., -3.]]) + rhs0 = np.array([[1., 0., 1.], [0., 1., 1.], [-1., 2., 1.], [0., -1., -1.]]) + self._verifySolveAllWaysReal(matrix, rhs0) + + def testSolveBandSizeSmaller(self): + rhs0 = np.random.randn(6, 4) + + # 6 x 6 matrix with 2 bands. Ensure all non-zero entries. + matrix = 2. * np.random.uniform(size=[3, 6]) + 1. + self._verifySolveAllWaysReal(matrix, rhs0) + + # 6 x 6 matrix with 3 bands. Ensure all non-zero entries. + matrix = 2. * np.random.uniform(size=[3, 6]) + 1. + self._verifySolveAllWaysReal(matrix, rhs0) + + @test_util.run_deprecated_v1 + def testSolveComplex(self): + if test.is_built_with_rocm(): + self.skipTest("ROCm does not support BLAS operations for complex types") + # 1x1 matrix, single rhs. + matrix = np.array([[0.1 + 1j * 0.1]]) + rhs0 = np.array([[1. + 1j]]) + self._verifySolveAllWaysComplex(matrix, rhs0) + # 2x2 matrix with 2 bands, single right-hand side. + # Corresponds to + # [[1. + 1j, 0.], [4 + 1j, 2 + 1j]] + matrix = np.array([[1., 2.], [3., 4.]]).astype(np.complex64) + matrix += 1j * matrix + rhs0 = np.array([[1.], [1.]]).astype(np.complex64) + rhs0 += 1j * rhs0 + self._verifySolveAllWaysComplex(matrix, rhs0) + # 2x2 matrix with 2 bands, 3 right-hand sides. 
+ rhs1 = np.array([[1., 0., 1.], [0., 1., 1.]]).astype(np.complex64) + rhs1 += 1j * rhs1 + self._verifySolveAllWaysComplex(matrix, rhs1) + + @test_util.run_deprecated_v1 + def testSolveBatch(self): + matrix = np.array([[1., 2.], [3., 4.]]) + rhs = np.array([[1., 0., 1.], [0., 1., 1.]]) + # Batch of 2x3x2x2 matrices, 2x3x2x3 right-hand sides. + self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[2, 3]) + # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides. + self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[3, 2]) + + matrix = np.array([[1., 2., 3., 4.], [-1., -2., -3., -4.], + [-1., 1., 2., 3.]]) + rhs = np.array([[-1., 2.], [1., 1.], [0., 1.], [2., 3.]]) + # Batch of 2x3x4x4 matrices with 3 bands, 2x3x4x2 right-hand sides. + self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[2, 3]) + # Batch of 3x2x4x4 matrices with 3 bands, 3x2x4x2 right-hand sides. + self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[3, 2]) + + @test_util.run_deprecated_v1 + def testSolveBatchComplex(self): + if test.is_built_with_rocm(): + self.skipTest("ROCm does not support BLAS operations for complex types") + matrix = np.array([[1., 2.], [3., 4.]]).astype(np.complex64) + matrix += 1j * matrix + rhs = np.array([[1., 0., 1.], [0., 1., 1.]]).astype(np.complex64) + rhs += 1j * rhs + # Batch of 2x3x2x2 matrices, 2x3x2x3 right-hand sides. + self._verifySolveAllWaysComplex(matrix, rhs, batch_dims=[2, 3]) + # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides. + self._verifySolveAllWaysComplex(matrix, rhs, batch_dims=[3, 2]) + + @test_util.run_deprecated_v1 + def testWrongDimensions(self): + # The matrix should have the same number of rows as the + # right-hand sides. + matrix = np.array([[1., 1.], [1., 1.]]) + rhs = np.array([[1., 0.]]) + with self.cached_session(use_gpu=True): + with self.assertRaises(ValueError): + self._verifySolve(matrix, rhs) + with self.assertRaises(ValueError): + self._verifySolve(matrix, rhs, batch_dims=[2, 3]) + + # Number of bands exceeds the dimension of the matrix. + matrix = np.ones((6, 4)) + rhs = np.ones((4, 2)) + with self.cached_session(use_gpu=True): + with self.assertRaises(ValueError): + self._verifySolve(matrix, rhs) + with self.assertRaises(ValueError): + self._verifySolve(matrix, rhs, batch_dims=[2, 3]) + + @test_util.run_deprecated_v1 + @test_util.disable_xla("XLA cannot throw assertion errors during a kernel.") + def testNotInvertible(self): + # The input should be invertible. + # The matrix is singular because it has a zero on the diagonal. + # FIXME(rmlarsen): The GPU kernel does not check for singularity. 
+ singular_matrix = np.array([[1., 0., -1.], [-1., 0., 1.], [0., -1., 1.]]) + with self.cached_session(): + with self.assertRaisesOpError("Input matrix is not invertible."): + self._verifySolve(singular_matrix, singular_matrix) + with self.assertRaisesOpError("Input matrix is not invertible."): + self._verifySolve(singular_matrix, singular_matrix, batch_dims=[2, 3]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py index 36e58bee829..3aceddf4d5f 100644 --- a/tensorflow/python/kernel_tests/linalg_grad_test.py +++ b/tensorflow/python/kernel_tests/linalg_grad_test.py @@ -132,6 +132,44 @@ def _GetMatrixBinaryFunctorGradientTest(functor_, return Test +def _GetBandedTriangularSolveGradientTest( + functor_, + dtype_, + shape_, + float32_tol_fudge=1.0, # pylint: disable=redefined-outer-name + **kwargs_): + + @test_util.run_in_graph_and_eager_modes(use_gpu=True) + def Test(self): + n = shape_[-1] + + np.random.seed(1) + # Make sure invertible. + a_np = np.random.uniform(low=1.0, high=2.0, size=shape_).astype(dtype_) + a = constant_op.constant(a_np) + + b_np = np.random.uniform(low=-1.0, high=1.0, size=[n, n]).astype(dtype_) + b = constant_op.constant(b_np) + + epsilon = np.finfo(dtype_).eps + delta = epsilon**(1.0 / 3.0) + # tolerance obtained by looking at actual differences using + # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build + tol = 1e-6 if dtype_ == np.float64 else float32_tol_fudge * 0.05 + + # check gradient w.r.t. left argument. + theoretical, numerical = gradient_checker_v2.compute_gradient( + lambda x: functor_(x, b, **kwargs_), [a], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + + # check gradient w.r.t. right argument. + theoretical, numerical = gradient_checker_v2.compute_gradient( + lambda y: functor_(a, y, **kwargs_), [b], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + + return Test + + if __name__ == '__main__': # Tests for gradients of binary matrix operations. for dtype in np.float32, np.float64: @@ -166,6 +204,20 @@ if __name__ == '__main__': adjoint=adjoint, lower=lower)) + band_shape = extra + (size // 2 + 1, size) + name = '%s_%s_adj_%s_low_%s' % (dtype.__name__, '_'.join( + map(str, band_shape)), str(adjoint), lower) + _AddTest( + MatrixBinaryFunctorGradientTest, + 'BandedTriangularSolveGradient', name, + _GetBandedTriangularSolveGradientTest( + linalg_ops.banded_triangular_solve, + dtype, + band_shape, + float32_tol_fudge=4.0, + adjoint=adjoint, + lower=lower)) + # Tests for gradients of unary matrix operations. for dtype in np.float32, np.float64: for size in 2, 5, 10: diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index 82acd09caec..9ddf7b5e8b8 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -340,6 +340,102 @@ def matrix_exponential(input, name=None): # pylint: disable=redefined-builtin return array_ops.reshape(result, batch_shape.concatenate(result.shape[-2:])) +@tf_export('linalg.banded_triangular_solve', v1=[]) +def banded_triangular_solve( + bands, + rhs, + lower=True, + adjoint=False, # pylint: disable=redefined-outer-name + name=None): + r"""Solve triangular systems of equations with a banded solver. + + `bands` is a tensor of shape `[..., K, M]`, where `K` represents the number + of bands stored. 
This corresponds to a batch of `M` by `M` matrices, whose + `K` subdiagonals (when `lower` is `True`) are stored. + + This operator broadcasts the batch dimensions of `bands` and the batch + dimensions of `rhs`. + + + Examples: + + Storing 2 bands of a 3x3 matrix. + Note that first element in the second row is ignored due to + the 'LEFT_RIGHT' padding. + + >>> x = [[2., 3., 4.], [1., 2., 3.]] + >>> x2 = [[2., 3., 4.], [10000., 2., 3.]] + >>> y = tf.zeros([3, 3]) + >>> z = tf.linalg.set_diag(y, x, align='LEFT_RIGHT', k=(-1, 0)) + >>> z + + >>> soln = tf.linalg.banded_triangular_solve(x, tf.ones([3, 1])) + >>> soln + + >>> are_equal = soln == tf.linalg.banded_triangular_solve(x2, tf.ones([3, 1])) + >>> tf.reduce_all(are_equal).numpy() + True + >>> are_equal = soln == tf.linalg.triangular_solve(z, tf.ones([3, 1])) + >>> tf.reduce_all(are_equal).numpy() + True + + Storing 2 superdiagonals of a 4x4 matrix. Because of the 'LEFT_RIGHT' padding + the last element of the first row is ignored. + + >>> x = [[2., 3., 4., 5.], [-1., -2., -3., -4.]] + >>> y = tf.zeros([4, 4]) + >>> z = tf.linalg.set_diag(y, x, align='LEFT_RIGHT', k=(0, 1)) + >>> z + + >>> soln = tf.linalg.banded_triangular_solve(x, tf.ones([4, 1]), lower=False) + >>> soln + + >>> are_equal = (soln == tf.linalg.triangular_solve( + ... z, tf.ones([4, 1]), lower=False)) + >>> tf.reduce_all(are_equal).numpy() + True + + + Args: + bands: A `Tensor` describing the bands of the left hand side, with shape + `[..., K, M]`. The `K` rows correspond to the diagonal to the `K - 1`-th + diagonal (the diagonal is the top row) when `lower` is `True` and + otherwise the `K - 1`-th superdiagonal to the diagonal (the diagonal is + the bottom row) when `lower` is `False`. The bands are stored with + 'LEFT_RIGHT' alignment, where the superdiagonals are padded on the right + and subdiagonals are padded on the left. This is the alignment cuSPARSE + uses. See `tf.linalg.set_diag` for more details. + rhs: A `Tensor` of shape [..., M] or [..., M, N] and with the same dtype as + `diagonals`. Note that if the shape of `rhs` and/or `diags` isn't known + statically, `rhs` will be treated as a matrix rather than a vector. + lower: An optional `bool`. Defaults to `True`. Boolean indicating whether + `bands` represents a lower or upper triangular matrix. + adjoint: An optional `bool`. Defaults to `False`. Boolean indicating whether + to solve with the matrix's block-wise adjoint. + name: A name to give this `Op` (optional). + + Returns: + A `Tensor` of shape [..., M] or [..., M, N] containing the solutions. 
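+
+  Raises:
+    tf.errors.InvalidArgumentError: If the matrix is not invertible, i.e. if
+      any entry on its main diagonal (the first row of `bands` when `lower`
+      is `True`, otherwise the last row) is zero.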
+ """ + with ops.name_scope(name, 'banded_triangular_solve', [bands, rhs]): + return gen_linalg_ops.banded_triangular_solve( + bands, rhs, lower=lower, adjoint=adjoint) + + @tf_export('linalg.tridiagonal_solve') @dispatch.add_dispatch_support def tridiagonal_solve(diagonals, diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py index 8d3664144a1..437e28e7e6b 100644 --- a/tensorflow/python/ops/linalg_grad.py +++ b/tensorflow/python/ops/linalg_grad.py @@ -607,6 +607,39 @@ def _MatrixSolveLsGrad(op, grad): lambda: _Underdetermined(op, grad)) +@ops.RegisterGradient("BandedTriangularSolve") +def _BandedTriangularSolveGrad(op, grad): + """Gradient for BandedTriangularSolve.""" + a = op.inputs[0] + b = op.inputs[1] + num_bands = array_ops.shape(a)[-2] + adjoint_a = op.get_attr("adjoint") + lower_a = op.get_attr("lower") + c = op.outputs[0] + grad_b = linalg_ops.banded_triangular_solve( + a, grad, lower=lower_a, adjoint=not adjoint_a) + if adjoint_a: + grad_a = -math_ops.matmul(c, grad_b, adjoint_b=True) + else: + grad_a = -math_ops.matmul(grad_b, c, adjoint_b=True) + if lower_a: + grad_a = array_ops.matrix_diag_part( + grad_a, k=(-(num_bands - 1), 0), align="LEFT_RIGHT") + else: + grad_a = array_ops.matrix_diag_part( + grad_a, k=(0, num_bands - 1), align="LEFT_RIGHT") + # If the static batch shapes are equal, we don't need to unbroadcast. + if (a.shape.is_fully_defined() and b.shape.is_fully_defined() and + a.shape[:-2] == b.shape[:-2]): + return grad_a, grad_b + a_shape = array_ops.shape(a) + b_shape = array_ops.shape(b) + ra, rb = array_ops.broadcast_gradient_args(a_shape[:-2], b_shape[:-2]) + grad_a = array_ops.reshape(math_ops.reduce_sum(grad_a, axis=ra), a_shape) + grad_b = array_ops.reshape(math_ops.reduce_sum(grad_b, axis=rb), b_shape) + return grad_a, grad_b + + @ops.RegisterGradient("MatrixTriangularSolve") def _MatrixTriangularSolveGrad(op, grad): """Gradient for MatrixTriangularSolve.""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 3d298e928e9..62969b5a0dd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -284,6 +284,10 @@ tf_module { name: "AvgPoolGrad" argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], " } + member_method { + name: "BandedTriangularSolve" + argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], " + } member_method { name: "Barrier" argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt index 734837b99cb..4f62af20dc0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt @@ -96,6 +96,10 @@ tf_module { name: "band_part" argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "banded_triangular_solve" + argspec: "args=[\'bands\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], " + 
} member_method { name: "cholesky" argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 3d298e928e9..62969b5a0dd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -284,6 +284,10 @@ tf_module { name: "AvgPoolGrad" argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], " } + member_method { + name: "BandedTriangularSolve" + argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], " + } member_method { name: "Barrier" argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], " From 455750f3623b15a3b5d46c11d4c5102e9388dbda Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Thu, 18 Jun 2020 10:23:23 -0700 Subject: [PATCH 0505/1390] Add StatelessParameterizedTruncatedNormal sampler. This sampler supports broadcasting of its input parameters as well as puts the # samples at the left of the output shape, rather than the right. PiperOrigin-RevId: 317129622 Change-Id: I4b62ad2e89a9637ae8b30b73af4b662ad0caa943 --- ...tatelessParameterizedTruncatedNormal.pbtxt | 54 +++ tensorflow/core/kernels/BUILD | 1 + .../parameterized_truncated_normal_op.cc | 435 +++++++++++++++++- .../parameterized_truncated_normal_op.h | 16 + tensorflow/core/ops/stateless_random_ops.cc | 35 ++ .../eager/pywrap_gradient_exclusions.cc | 3 +- tensorflow/python/kernel_tests/BUILD | 18 - tensorflow/python/kernel_tests/random/BUILD | 18 + .../parameterized_truncated_normal_op_test.py | 198 ++++++-- tensorflow/python/ops/random_grad.py | 120 +++++ tensorflow/python/ops/stateless_random_ops.py | 70 +++ .../api/golden/v1/tensorflow.random.pbtxt | 4 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.random.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 15 files changed, 913 insertions(+), 71 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt rename tensorflow/python/kernel_tests/{ => random}/parameterized_truncated_normal_op_test.py (63%) diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt new file mode 100644 index 00000000000..15bd4670cef --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt @@ -0,0 +1,54 @@ +op { + graph_op_name: "StatelessParameterizedTruncatedNormal" + visibility: HIDDEN + in_arg { + name: "shape" + description: < { const T kStdDevsInsideBoundsToUseRandnSampler = T(1.3); auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); - auto DoWork = [samples_per_batch, num_elements, &ctx, &means, &stddevs, - &minvals, &maxvals, &gen, &output, - kStdDevsInsideBoundsToUseRandnSampler](int start_batch, - int limit_batch) { + auto do_work = [samples_per_batch, num_elements, &ctx, &means, &stddevs, + &minvals, &maxvals, &gen, &output, + kStdDevsInsideBoundsToUseRandnSampler](int start_batch, + int limit_batch) { // Capturing "gen" by-value would 
only make a copy for the _shared_ // lambda. Since we want to let each worker have its own copy, we pass // "gen" by reference and explicitly do a copy assignment here. @@ -80,9 +81,9 @@ struct TruncatedNormalFunctor { // The sample from each iteration uses 2 random numbers. gen_copy.Skip(start_batch * 2 * kMaxIterations * (samples_per_batch + 3) / 4); - typedef random::UniformDistribution Uniform; + using Uniform = random::UniformDistribution; Uniform dist; - typedef random::NormalDistribution Normal; + using Normal = random::NormalDistribution; Normal normal_dist; // Vectorized intermediate calculations for uniform rejection sampling. @@ -112,7 +113,7 @@ struct TruncatedNormalFunctor { Eigen::numext::isfinite(maxval)), errors::InvalidArgument("Invalid parameters")); - int numIterations = 0; + int num_iterations = 0; // If possible, make one-sided bound be the lower bound, or make both // bounds positive. Otherwise, the bounds are on either side of the @@ -160,10 +161,10 @@ struct TruncatedNormalFunctor { if (sample >= limit_sample) { break; } - numIterations = 0; + num_iterations = 0; } else { - numIterations++; - if (numIterations > kMaxIterations) { + num_iterations++; + if (num_iterations > kMaxIterations) { // This should never occur because this sampler should // (by the selection criteria above) be used if at least 3 // standard deviations of one side of the distribution @@ -201,7 +202,7 @@ struct TruncatedNormalFunctor { const auto u = dist(&gen_copy); for (int i = 0; i < size; i++) { auto accept = u[i] <= Eigen::numext::exp(g[i]); - if (accept || numIterations + 1 >= kMaxIterations) { + if (accept || num_iterations + 1 >= kMaxIterations) { // Accept the sample z. // If we run out of iterations, just use the current uniform // sample, but emit a warning. @@ -223,9 +224,9 @@ struct TruncatedNormalFunctor { if (sample >= limit_sample) { break; } - numIterations = 0; + num_iterations = 0; } else { - numIterations++; + num_iterations++; } } } @@ -248,7 +249,7 @@ struct TruncatedNormalFunctor { const T u = rand[i]; i++; auto accept = (u <= g && z < normMax); - if (accept || numIterations + 1 >= kMaxIterations) { + if (accept || num_iterations + 1 >= kMaxIterations) { if (!accept) { LOG(ERROR) << "TruncatedNormal exponential distribution " << "rejection sampler exceeds max iterations. " @@ -263,9 +264,9 @@ struct TruncatedNormalFunctor { if (sample >= limit_sample) { break; } - numIterations = 0; + num_iterations = 0; } else { - numIterations++; + num_iterations++; } } } @@ -305,7 +306,297 @@ struct TruncatedNormalFunctor { const int64 batchCost = batchInitCost + uniformRejectionSamplingCost * 2 * samples_per_batch; Shard(worker_threads.num_threads, worker_threads.workers, num_batches, - batchCost, DoWork); + batchCost, do_work); + } +}; + +template +struct TruncatedNormalFunctorV2 { + void operator()(OpKernelContext* ctx, const CPUDevice& d, int64 num_batches, + int64 samples_per_batch, int64 num_elements, + const BCastList<4>& bcast, + typename TTypes::ConstFlat means, + typename TTypes::ConstFlat stddevs, + typename TTypes::ConstFlat minvals, + typename TTypes::ConstFlat maxvals, + const random::PhiloxRandom& gen, + typename TTypes::Flat output) { + // The randn rejection sampling is used when the mean and at least this many + // standard deviations are inside the bounds. + // The uniform proposal samplers become less efficient as the bounds are + // further from the mean, the reverse is true for the randn sampler. + // This number was chosen by empirical benchmarking. 
If modified, the + // benchmarks in parameterized_truncated_normal_op_test should also be + // changed. + const T kStdDevsInsideBoundsToUseRandnSampler = T(1.3); + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + + auto do_work = [num_batches, samples_per_batch, &ctx, &bcast, &means, + &stddevs, &minvals, &maxvals, &gen, &output, + kStdDevsInsideBoundsToUseRandnSampler](int start_output, + int limit_output) { + // Capturing "gen" by-value would only make a copy for the _shared_ + // lambda. Since we want to let each worker have its own copy, we pass + // "gen" by reference and explicitly do a copy assignment here. + random::PhiloxRandom gen_copy = gen; + using Uniform = random::UniformDistribution; + Uniform dist; + using Normal = random::NormalDistribution; + Normal normal_dist; + // Skip takes units of 128 bits. The Uniform::kResultElementCount - 1 + // is so rounding doesn't lead to + // us using the same state in different workloads. + // The sample from each iteration uses 2 random numbers. + gen_copy.Skip((start_output * 2 * kMaxIterations + + Uniform::kResultElementCount - 1) / + Uniform::kResultElementCount); + + // Vectorized intermediate calculations for uniform rejection sampling. + // We always generate at most 4 samples. + Eigen::array z; + Eigen::array g; + + const bool should_bcast = bcast.IsBroadcastingRequired(); + const auto& means_batch_indices = bcast.batch_indices(0); + const auto& stddevs_batch_indices = bcast.batch_indices(1); + const auto& minvals_batch_indices = bcast.batch_indices(2); + const auto& maxvals_batch_indices = bcast.batch_indices(3); + auto output_flat = output.data(); + + // We partition work across batches and then across samples + // per batch member, to avoid extra work. + for (int64 output_idx = start_output; output_idx < limit_output; + // output_idx is incremented with the inner loops below. + ) { + int64 batch_idx = output_idx / samples_per_batch; + // The output layout is [samples_per_batch, num_batches]. Thus + // the output address is sample_idx * num_batches + batch_idx. + // Below, code will index at output_batch_offset[sample_idx * + // num_batches] matching this. + T* const output_batch_offset = output_flat + batch_idx; + // Generate batch counts from BCast, as it has the right indices to loop + // over. + T mean, stddev, minval, maxval; + if (should_bcast) { + mean = means(means_batch_indices[batch_idx]); + stddev = stddevs(stddevs_batch_indices[batch_idx]); + minval = minvals(minvals_batch_indices[batch_idx]); + maxval = maxvals(maxvals_batch_indices[batch_idx]); + } else { + mean = means(batch_idx); + stddev = stddevs(batch_idx); + minval = minvals(batch_idx); + maxval = maxvals(batch_idx); + } + + // On GPU, this check will just fill samples with NAN if it fails. + OP_REQUIRES(ctx, + stddev > T(0) && minval < maxval && + (Eigen::numext::isfinite(minval) || + Eigen::numext::isfinite(maxval)), + errors::InvalidArgument("Invalid parameters")); + + int num_iterations = 0; + + // If possible, make one-sided bound be the lower bound, or make both + // bounds positive. Otherwise, the bounds are on either side of the + // mean. + if ((Eigen::numext::isinf(minval) && minval < T(0)) || maxval < mean) { + // Reverse all calculations. normMin and normMax will be flipped. + std::swap(minval, maxval); + stddev = -stddev; + } + + // Calculate normalized samples, then convert them. + const T normMin = (minval - mean) / stddev; + const T normMax = (maxval - mean) / stddev; + + // Determine the method to use. 
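+        // Three proposals are used below: rejection sampling directly from
+        // the normal distribution when at least one bound is far enough
+        // from the mean (kStdDevsInsideBoundsToUseRandnSampler), a uniform
+        // proposal on [normMin, normMax] when that interval is narrow
+        // (diff < cutoff), and an exponential proposal otherwise.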
+ const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4)); + const T cutoff = + T(2) * + Eigen::numext::exp(T(0.5) + + (normMin * (normMin - sqrtFactor)) / T(4)) / + (normMin + sqrtFactor); + const T diff = normMax - normMin; + + if (((normMin < -kStdDevsInsideBoundsToUseRandnSampler) && + (normMax >= T(0.))) || + ((normMax > kStdDevsInsideBoundsToUseRandnSampler) && + (normMin <= T(0.)))) { + // If the bounds are a least 3 standard deviations from the mean + // on at least one side then we rejection sample by sampling + // from the normal distribution and rejecting samples outside + // the bounds. + // Under this condition the acceptance rate per iteration should + // always be ~ 50%. This sampler is more efficient (and more + // numerically stable when one or both bounds is far from the mean). + for (int64 sample_idx = output_idx % samples_per_batch; + sample_idx < samples_per_batch && output_idx < limit_output;) { + const auto randn_sample = normal_dist(&gen_copy); + const int size = randn_sample.size(); + for (int i = 0; i < size; ++i) { + if ((randn_sample[i] >= normMin) && + (randn_sample[i] <= normMax)) { + output_batch_offset[sample_idx * num_batches] = + randn_sample[i] * stddev + mean; + ++sample_idx; + ++output_idx; + if (sample_idx >= samples_per_batch || + output_idx >= limit_output) { + break; + } + num_iterations = 0; + } else { + ++num_iterations; + if (num_iterations > kMaxIterations) { + // This should never occur because this sampler should + // (by the selection criteria above) be used if at least 3 + // standard deviations of one side of the distribution + // is within the limits (so acceptance probability per + // iterations >~ 1/2 per iteration). + LOG(ERROR) << "TruncatedNormal randn rejection sampler " + << "exceeded maximum iterations for " + << "normMin=" << normMin << " normMax=" << normMax + << " kMaxIterations=" << kMaxIterations; + ctx->SetStatus(errors::Internal( + "TruncatedNormal randn rejection sampler failed to accept" + " a sample.")); + return; + } + } + } + } + } else if (diff < cutoff) { + // Sample from a uniform distribution on [normMin, normMax]. + + const T plusFactor = (normMin < T(0)) ? T(0) : normMin * normMin; + + for (int64 sample_idx = output_idx % samples_per_batch; + sample_idx < samples_per_batch && output_idx < limit_output;) { + const auto rand = dist(&gen_copy); + const int size = rand.size(); + // NOTE(ringwalt): These loops seem to only generate packed AVX + // instructions for float32. + for (int i = 0; i < size; i++) { + z[i] = rand[i] * diff + normMin; + g[i] = (plusFactor - z[i] * z[i]) / T(2.0); + } + + const auto u = dist(&gen_copy); + for (int i = 0; i < size; i++) { + auto accept = u[i] <= Eigen::numext::exp(g[i]); + if (accept || num_iterations + 1 >= kMaxIterations) { + // Accept the sample z. + // If we run out of iterations, just use the current uniform + // sample, but emit a warning. + // TODO(jjhunt) For small entropies (relative to the bounds), + // this sampler is poor and may take many iterations since + // the proposal distribution is the uniform distribution + // U(lower_bound, upper_bound). + if (!accept) { + LOG(ERROR) << "TruncatedNormal uniform rejection sampler " + << "exceeded max iterations. 
Sample may contain " + << "outliers."; + ctx->SetStatus(errors::Internal( + "TruncatedNormal uniform rejection sampler failed to " + " accept a sample.")); + return; + } + output_batch_offset[sample_idx * num_batches] = + z[i] * stddev + mean; + ++sample_idx; + ++output_idx; + if (sample_idx >= samples_per_batch || + output_idx >= limit_output) { + break; + } + num_iterations = 0; + } else { + num_iterations++; + } + } + } + } else { + // Sample from an exponential distribution with alpha maximizing + // acceptance probability, offset by normMin from the origin. + // Accept only if less than normMax. + const T alpha = + (normMin + Eigen::numext::sqrt((normMin * normMin) + T(4))) / + T(2); + for (int64 sample_idx = output_idx % samples_per_batch; + sample_idx < samples_per_batch && output_idx < limit_output;) { + auto rand = dist(&gen_copy); + const int size = rand.size(); + int i = 0; + while (i < size) { + const T z = -Eigen::numext::log(rand[i]) / alpha + normMin; + i++; + const T x = normMin < alpha ? alpha - z : normMin - alpha; + const T g = Eigen::numext::exp(-x * x / T(2.0)); + const T u = rand[i]; + i++; + auto accept = (u <= g && z < normMax); + if (accept || num_iterations + 1 >= kMaxIterations) { + if (!accept) { + LOG(ERROR) << "TruncatedNormal exponential distribution " + << "rejection sampler exceeds max iterations. " + << "Sample may contain outliers."; + ctx->SetStatus(errors::Internal( + "TruncatedNormal exponential distribution rejection" + " sampler failed to accept a sample.")); + return; + } + output_batch_offset[sample_idx * num_batches] = + z * stddev + mean; + ++sample_idx; + ++output_idx; + if (sample_idx >= samples_per_batch || + output_idx >= limit_output) { + break; + } + num_iterations = 0; + } else { + num_iterations++; + } + } + } + } + } + }; + // The cost of the initial calculations for the batch. + const int64 batchInitCost = + // normMin, normMax + (Eigen::TensorOpCost::AddCost() + + Eigen::TensorOpCost::MulCost()) * + 2 + // sqrtFactor + + Eigen::TensorOpCost::AddCost() + + Eigen::TensorOpCost::MulCost() + + Eigen::internal::functor_traits< + Eigen::internal::scalar_sqrt_op>::Cost + // cutoff + + Eigen::TensorOpCost::MulCost() * 4 + + Eigen::internal::functor_traits>::Cost + // diff + + Eigen::TensorOpCost::AddCost(); + const int64 uniformSampleCost = + random::PhiloxRandom::kElementCost + + random::UniformDistribution::kElementCost; + // The cost of a single uniform sampling round. + const int64 uniformRejectionSamplingCost = + uniformSampleCost + Eigen::TensorOpCost::MulCost() + + Eigen::TensorOpCost::AddCost() + + Eigen::TensorOpCost::MulCost() * 2 + + Eigen::TensorOpCost::AddCost() + uniformSampleCost + + Eigen::internal::functor_traits< + Eigen::internal::scalar_exp_op>::Cost + + Eigen::TensorOpCost::MulCost() + Eigen::TensorOpCost::AddCost(); + // Estimate the cost for an entire batch. + // Assume we use uniform sampling, and accept the 2nd sample on average. + const int64 batchCost = batchInitCost + uniformRejectionSamplingCost * 2; + Shard(worker_threads.num_threads, worker_threads.workers, num_elements, + batchCost, do_work); } }; @@ -436,13 +727,113 @@ class ParameterizedTruncatedNormalOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ParameterizedTruncatedNormalOp); }; +// Samples from a truncated normal distribution, using the given parameters. +template +class StatelessParameterizedTruncatedNormal : public OpKernel { + // Reshape batches so each batch is this size if possible. 
+ static const int32 kDesiredBatchSize = 100; + + public: + explicit StatelessParameterizedTruncatedNormal(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& shape_tensor = ctx->input(0); + const Tensor& seed_tensor = ctx->input(1); + const Tensor& means_tensor = ctx->input(2); + const Tensor& stddevs_tensor = ctx->input(3); + const Tensor& minvals_tensor = ctx->input(4); + const Tensor& maxvals_tensor = ctx->input(5); + + OP_REQUIRES(ctx, seed_tensor.dims() == 1 && seed_tensor.dim_size(0) == 2, + errors::InvalidArgument("seed must have shape [2], not ", + seed_tensor.shape().DebugString())); + + tensorflow::BCastList<4> bcast( + {means_tensor.shape().dim_sizes(), stddevs_tensor.shape().dim_sizes(), + minvals_tensor.shape().dim_sizes(), + maxvals_tensor.shape().dim_sizes()}, + /*fewer_dims_optimization=*/false, + /*return_flattened_batch_indices=*/true); + + OP_REQUIRES(ctx, bcast.IsValid(), + errors::InvalidArgument( + "means, stddevs, minvals, maxvals must have compatible " + "batch dimensions: ", + means_tensor.shape().DebugString(), " vs. ", + stddevs_tensor.shape().DebugString(), " vs. ", + minvals_tensor.shape().DebugString(), " vs. ", + maxvals_tensor.shape().DebugString())); + + // Let's check that the shape tensor dominates the broadcasted tensor. + TensorShape bcast_shape = BCast::ToShape(bcast.output_shape()); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(shape_tensor.shape()), + errors::InvalidArgument("Input shape should be a vector, got shape: ", + shape_tensor.shape().DebugString())); + TensorShape output_shape; + if (shape_tensor.dtype() == DataType::DT_INT32) { + OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec(), + &output_shape)); + } else { + OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec(), + &output_shape)); + } + OP_REQUIRES(ctx, TensorShapeUtils::EndsWith(output_shape, bcast_shape), + errors::InvalidArgument( + "Shape passed in must end with broadcasted shape.")); + + int64 samples_per_batch = 1; + const int64 num_sample_dims = + (shape_tensor.dim_size(0) - bcast.output_shape().size()); + for (int64 i = 0; i < num_sample_dims; ++i) { + samples_per_batch *= output_shape.dim_size(i); + } + int64 num_batches = 1; + for (int64 i = num_sample_dims; i < shape_tensor.dim_size(0); ++i) { + num_batches *= output_shape.dim_size(i); + } + const int64 num_elements = num_batches * samples_per_batch; + + Tensor* samples_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &samples_tensor)); + + auto truncFunctor = functor::TruncatedNormalFunctorV2(); + // Each worker has the same fudge factor, so use it here. 
+ random::PhiloxRandom::Key key; + random::PhiloxRandom::ResultType counter; + OP_REQUIRES_OK(ctx, GenerateKey(seed_tensor, &key, &counter)); + + auto philox = random::PhiloxRandom(counter, key); + + truncFunctor(ctx, ctx->eigen_device(), num_batches, + samples_per_batch, num_elements, bcast, means_tensor.flat(), + stddevs_tensor.flat(), minvals_tensor.flat(), + maxvals_tensor.flat(), philox, samples_tensor->flat()); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(StatelessParameterizedTruncatedNormal); +}; + } // namespace -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("ParameterizedTruncatedNormal") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("dtype"), \ - ParameterizedTruncatedNormalOp) +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER(Name("ParameterizedTruncatedNormal") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ + ParameterizedTruncatedNormalOp) \ + REGISTER_KERNEL_BUILDER( \ + Name("StatelessParameterizedTruncatedNormal") \ + .HostMemory("shape") \ + .HostMemory("seed") \ + .HostMemory("means") \ + .HostMemory("stddevs") \ + .HostMemory("minvals") \ + .HostMemory("maxvals") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ + StatelessParameterizedTruncatedNormal) TF_CALL_half(REGISTER); TF_CALL_float(REGISTER); diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.h b/tensorflow/core/kernels/parameterized_truncated_normal_op.h index c919a22c7b0..ee7fb7bf605 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op.h +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/util/bcast.h" namespace tensorflow { @@ -44,6 +45,21 @@ struct TruncatedNormalFunctor { typename TTypes::Flat output); }; +// This version supports broadcasting of the arguments, as well as puts +// the sample dimension on the left. 
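+// `num_elements` must equal `num_batches * samples_per_batch`, and `bcast`
+// supplies the flattened batch indices used to broadcast means, stddevs,
+// minvals and maxvals onto the common batch dimension.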
+template +struct TruncatedNormalFunctorV2 { + void operator()(OpKernelContext* ctx, const Device& d, int64 num_batches, + int64 samples_per_batch, int64 num_elements, + const BCastList<4>& bcast, + typename TTypes::ConstFlat means, + typename TTypes::ConstFlat stddevs, + typename TTypes::ConstFlat minvals, + typename TTypes::ConstFlat maxvals, + const random::PhiloxRandom& gen, + typename TTypes::Flat output); +}; + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc index d540b9a04d9..e1820ea4feb 100644 --- a/tensorflow/core/ops/stateless_random_ops.cc +++ b/tensorflow/core/ops/stateless_random_ops.cc @@ -124,6 +124,41 @@ REGISTER_OP("StatelessRandomBinomial") .Attr("dtype: {half, float, double, int32, int64} = DT_INT64") .SetShapeFn(StatelessShape); +REGISTER_OP("StatelessParameterizedTruncatedNormal") + .Input("shape: S") + .Input("seed: Tseed") + .Input("means: dtype") + .Input("stddevs: dtype") + .Input("minvals: dtype") + .Input("maxvals: dtype") + .Output("output: dtype") + .Attr("S: {int32, int64}") + .Attr("Tseed: {int32, int64} = DT_INT64") + .Attr("dtype: {float16, float32, float64}") + .SetShapeFn([](InferenceContext* c) { + // Check seed shape + ShapeHandle seed; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &seed)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused_dim)); + + ShapeHandle bcast_means_stddevs; + ShapeHandle bcast_except_maxvals; + ShapeHandle bcast_all; + TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper( + c, c->input(2), c->input(3), true, &bcast_means_stddevs)); + TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper( + c, c->input(4), bcast_means_stddevs, true, &bcast_except_maxvals)); + TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper( + c, c->input(5), bcast_except_maxvals, true, &bcast_all)); + + // Set output shape + ShapeHandle out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out)); + c->set_output(0, out); + return Status::OK(); + }); + REGISTER_OP("StatelessRandomPoisson") .Input("shape: T") .Input("seed: Tseed") diff --git a/tensorflow/python/eager/pywrap_gradient_exclusions.cc b/tensorflow/python/eager/pywrap_gradient_exclusions.cc index a7c7ab7abc7..7da45e36118 100644 --- a/tensorflow/python/eager/pywrap_gradient_exclusions.cc +++ b/tensorflow/python/eager/pywrap_gradient_exclusions.cc @@ -50,7 +50,7 @@ auto OpGradientInfoInit(const T &a) { absl::optional> OpGradientUnusedInputIndices( const tensorflow::string &op_name) { - static std::array a = {{ + static std::array a = {{ {"Acosh"}, {"AllToAll", 1, {0}}, {"ApproximateEqual"}, @@ -326,6 +326,7 @@ absl::optional> OpGradientUnusedInputIndices( {"StackPop"}, {"StackPush"}, {"StatelessMultinomial"}, + {"StatelessParameterizedTruncatedNormal", 1, {1}}, {"StatelessRandomBinomial"}, {"StatelessRandomGammaV2", 1, {1}}, {"StatelessRandomNormal"}, diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index f2c614974f5..f93bf5cd1ae 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -785,24 +785,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "parameterized_truncated_normal_op_test", - size = "medium", - srcs = ["parameterized_truncated_normal_op_test.py"], - deps = [ - "//tensorflow/core:protos_all_py", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework", - 
"//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:platform", - "//tensorflow/python:random_ops", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - ], -) - tf_py_test( name = "parsing_ops_test", size = "medium", diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD index b5d291d2973..6e404b4cd5f 100644 --- a/tensorflow/python/kernel_tests/random/BUILD +++ b/tensorflow/python/kernel_tests/random/BUILD @@ -20,6 +20,24 @@ py_library( ], ) +cuda_py_test( + name = "parameterized_truncated_normal_op_test", + size = "medium", + srcs = ["parameterized_truncated_normal_op_test.py"], + deps = [ + "//tensorflow/core:protos_all_py", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform", + "//tensorflow/python:random_ops", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "random_shuffle_queue_test", size = "small", diff --git a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py b/tensorflow/python/kernel_tests/random/parameterized_truncated_normal_op_test.py similarity index 63% rename from tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py rename to tensorflow/python/kernel_tests/random/parameterized_truncated_normal_op_test.py index ac8ad7a2bd4..309c3e404db 100644 --- a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py +++ b/tensorflow/python/kernel_tests/random/parameterized_truncated_normal_op_test.py @@ -27,11 +27,15 @@ from six.moves import range # pylint: disable=redefined-builtin from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session +from tensorflow.python.eager import backprop from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops as stateless +from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging @@ -91,13 +95,8 @@ class TruncatedNormalMoments(object): def calculate_moments(samples, max_moment): moments = [0.0] * (max_moment + 1) - for sample in samples: - value = 1.0 - for k in range(len(moments)): - moments[k] += value - value *= sample - for i in range(len(moments)): - moments[i] /= len(samples) + for k in range(len(moments)): + moments[k] = np.mean(samples**k, axis=0) return moments @@ -118,16 +117,31 @@ class ParameterizedTruncatedNormalTest(test.TestCase): # Stop at moment 10 to avoid numerical errors in the theoretical moments. max_moment = 10 - def validateMoments(self, shape, mean, stddev, minval, maxval, seed=1618): + def validateMoments(self, + shape, + mean, + stddev, + minval, + maxval, + use_stateless=False, + seed=1618): try: # TruncatedNormalMoments requires scipy.stats. # Give up early if we are unable to import it. 
- import scipy.stats # pylint: disable=g-import-not-at-top,unused-variable random_seed.set_random_seed(seed) with self.cached_session(use_gpu=True): - samples = random_ops.parameterized_truncated_normal(shape, mean, stddev, - minval, - maxval).eval() + if use_stateless: + # Generate a seed that stateless ops can use. + new_seed = random_ops.random_uniform([2], + seed=seed, + minval=0, + maxval=(2**31 - 1), + dtype=np.int32) + samples = stateless.stateless_parameterized_truncated_normal( + shape, new_seed, mean, stddev, minval, maxval).eval() + else: + samples = random_ops.parameterized_truncated_normal( + shape, mean, stddev, minval, maxval).eval() assert (~np.isnan(samples)).all() moments = calculate_moments(samples, self.max_moment) expected_moments = TruncatedNormalMoments(mean, stddev, minval, maxval) @@ -144,14 +158,24 @@ class ParameterizedTruncatedNormalTest(test.TestCase): stddev, minval, maxval, + use_stateless=False, seed=1618): try: import scipy.stats # pylint: disable=g-import-not-at-top random_seed.set_random_seed(seed) with self.cached_session(use_gpu=True): - samples = random_ops.parameterized_truncated_normal(shape, mean, stddev, - minval, - maxval).eval() + if use_stateless: + new_seed = random_ops.random_uniform([2], + seed=seed, + minval=0, + maxval=(2**31 - 1), + dtype=np.int32) + samples = stateless.stateless_parameterized_truncated_normal( + shape, new_seed, mean, stddev, minval, maxval).eval() + else: + samples = random_ops.parameterized_truncated_normal( + shape, mean, stddev, minval, maxval).eval() + assert (~np.isnan(samples)).all() minval = max(mean - stddev * 10, minval) maxval = min(mean + stddev * 10, maxval) @@ -169,61 +193,160 @@ class ParameterizedTruncatedNormalTest(test.TestCase): @test_util.run_deprecated_v1 def testDefaults(self): - self.validateMoments([10**5], 0.0, 1.0, -2.0, 2.0) + self.validateMoments([int(1e5)], 0.0, 1.0, -2.0, 2.0) + self.validateMoments([int(1e5)], 0.0, 1.0, -2.0, 2.0, use_stateless=True) @test_util.run_deprecated_v1 def testShifted(self): - self.validateMoments([10**5], -1.0, 1.0, -2.0, 2.0) + self.validateMoments([int(1e5)], -1.0, 1.0, -2.0, 2.0) + self.validateMoments([int(1e5)], -1.0, 1.0, -2.0, 2.0, use_stateless=True) @test_util.run_deprecated_v1 def testRightTail(self): - self.validateMoments([10**5], 0.0, 1.0, 4.0, np.infty) + self.validateMoments([int(1e5)], 0.0, 1.0, 4.0, np.infty) + self.validateMoments([int(1e5)], + 0.0, + 1.0, + 4.0, + np.infty, + use_stateless=True) @test_util.run_deprecated_v1 def testLeftTail(self): - self.validateMoments([10**5], 0.0, 1.0, -np.infty, -4.0) + self.validateMoments([int(1e5)], 0.0, 1.0, -np.infty, -4.0) + self.validateMoments([int(1e5)], + 0.0, + 1.0, + -np.infty, + -4.0, + use_stateless=True) @test_util.run_deprecated_v1 def testLeftTailTwoSidedBounds(self): - self.validateMoments([10**5], 0.0, 1.0, -6.0, -3.0) + self.validateMoments([int(1e5)], 0.0, 1.0, -6.0, -3.0) + self.validateMoments([int(1e5)], 0.0, 1.0, -6.0, -3.0, use_stateless=True) @test_util.run_deprecated_v1 @test_util.disable_xla("Low probability region") def testTwoSidedLeftTailShifted(self): - self.validateKolmogorovSmirnov([10**5], 6.0, 1.0, -1.0, 1.0) + self.validateKolmogorovSmirnov([int(1e5)], 6.0, 1.0, -1.0, 1.0) + self.validateKolmogorovSmirnov([int(1e5)], + 6.0, + 1.0, + -1.0, + 1.0, + use_stateless=True) @test_util.run_deprecated_v1 @test_util.disable_xla("Low probability region") def testRightTailShifted(self): - self.validateMoments([10**5], -5.0, 1.0, 2.0, np.infty) + self.validateMoments([int(1e5)], 
-5.0, 1.0, 2.0, np.infty) + self.validateMoments([int(1e5)], + -5.0, + 1.0, + 2.0, + np.infty, + use_stateless=True) # Take the normal distribution around the mean, but truncating the left tail # far from the mean. @test_util.run_deprecated_v1 def testTruncateOnLeft_entireTailOnRight(self): - self.validateKolmogorovSmirnov([10**5], 10.0, 1.0, 4.0, np.infty) + self.validateKolmogorovSmirnov([int(1e5)], 10.0, 1.0, 4.0, np.infty) + self.validateKolmogorovSmirnov([int(1e5)], + 10.0, + 1.0, + 4.0, + np.infty, + use_stateless=True) # Take the normal distribution around the mean, but truncating the right tail. @test_util.run_deprecated_v1 def testTruncateOnRight_entireTailOnLeft(self): - self.validateKolmogorovSmirnov([10**5], -8, 1.0, -np.infty, -4.0) + self.validateKolmogorovSmirnov([int(1e5)], -8, 1.0, -np.infty, -4.0) + self.validateKolmogorovSmirnov([int(1e5)], + -8., + 1.0, + -np.infty, + -4.0, + use_stateless=True) @test_util.run_deprecated_v1 def testSmallStddev(self): - self.validateKolmogorovSmirnov([10**5], 0.0, 0.1, 0.05, 0.10) + self.validateKolmogorovSmirnov([int(1e5)], 0.0, 0.1, 0.05, 0.10) + self.validateKolmogorovSmirnov([int(1e5)], + 0.0, + 0.1, + 0.05, + 0.10, + use_stateless=True) @test_util.run_deprecated_v1 def testSamplingWithSmallStdDevFarFromBound(self): sample_op = random_ops.parameterized_truncated_normal( shape=(int(1e5),), means=0.8, stddevs=0.05, minvals=-1., maxvals=1.) + new_seed = random_ops.random_uniform([2], + seed=1234, + minval=0, + maxval=(2**31 - 1), + dtype=np.int32) + sample_op_stateless = stateless.stateless_parameterized_truncated_normal( + shape=(int(1e5),), + seed=new_seed, + means=0.8, + stddevs=0.05, + minvals=-1., + maxvals=1.) with self.session(use_gpu=True) as sess: - samples = sess.run(sample_op) + samples, samples_stateless = sess.run([sample_op, sample_op_stateless]) # 0. is more than 16 standard deviations from the mean, and # should have a likelihood < 1e-57. assert (~np.isnan(samples)).all() - no_neg_samples = np.sum(samples < 0.) - self.assertEqual(no_neg_samples, 0.) + assert (~np.isnan(samples_stateless)).all() + self.assertAllGreater(samples, 0.) + self.assertAllGreater(samples_stateless, 0.) + + def testStatelessParameterizedTruncatedNormalHasGrads(self): + mean = variables.Variable(0.01) + stddev = variables.Variable(1.) + minval = variables.Variable(-1.) + maxval = variables.Variable(1.) + + with self.cached_session(use_gpu=True) as sess: + with backprop.GradientTape(persistent=True) as tape: + samples = stateless.stateless_parameterized_truncated_normal( + [1], [1, 2], mean, stddev, minval, maxval) + + sess.run(variables.variables_initializer([mean, stddev, minval, maxval])) + [mean_grad, std_grad], mean_actual_grad, std_actual_grad = sess.run([ + tape.gradient(samples, [mean, stddev]), + array_ops.ones_like(mean), + (samples - mean) / stddev]) + self.assertAllClose(mean_grad, mean_actual_grad) + self.assertAllClose(std_grad, std_actual_grad[0]) + + try: + import scipy.stats # pylint:disable=g-import-not-at-top + truncnorm = scipy.stats.truncnorm(a=-1., b=1., loc=0., scale=1.) + samples_np, [minval_grad, maxval_grad] = sess.run([ + samples, tape.gradient(samples, [minval, maxval])]) + + sample_cdf = truncnorm.cdf(samples_np) + # These come from the implicit reparameterization trick. + scipy_maxval_grad = np.exp( + 0.5 * (samples_np ** 2 - ((1. - 0.01) / 1.) ** 2) + + np.log(sample_cdf)) + + scipy_minval_grad = np.exp( + 0.5 * (samples_np ** 2 - ((-1. - 0.01) / 1.) 
** 2) + + np.log1p(-sample_cdf)) + + self.assertAllClose(minval_grad, scipy_minval_grad[0], rtol=1e-2) + self.assertAllClose(maxval_grad, scipy_maxval_grad[0], rtol=1e-2) + + except ImportError as e: + tf_logging.warn("Cannot test truncated normal op: %s" % str(e)) @test_util.run_deprecated_v1 def testSamplingAtRandnSwitchover(self): @@ -239,18 +362,33 @@ class ParameterizedTruncatedNormalTest(test.TestCase): epsilon = 0.001 self.validateMoments( - shape=[10**6], + shape=[int(1e6)], mean=0., stddev=1.0, minval=-epsilon, maxval=stddev_inside_bounds_before_using_randn - epsilon) self.validateMoments( - shape=[10**6], + shape=[int(1e6)], mean=0., stddev=1.0, minval=-epsilon, maxval=stddev_inside_bounds_before_using_randn + epsilon) + self.validateMoments( + shape=[int(1e6)], + mean=0., + stddev=1.0, + minval=-epsilon, + maxval=stddev_inside_bounds_before_using_randn - epsilon, + use_stateless=True) + self.validateMoments( + shape=[int(1e6)], + mean=0., + stddev=1.0, + minval=-epsilon, + maxval=stddev_inside_bounds_before_using_randn + epsilon, + use_stateless=True) + # Benchmarking code def parameterized_vs_naive(shape, num_iters, use_gpu=False): diff --git a/tensorflow/python/ops/random_grad.py b/tensorflow/python/ops/random_grad.py index 771980932cb..3caa08d96f9 100644 --- a/tensorflow/python/ops/random_grad.py +++ b/tensorflow/python/ops/random_grad.py @@ -18,9 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import math_ops @@ -114,3 +119,118 @@ def _StatelessRandomGammaV2Grad(op, grad): # pylint: disable=invalid-name return (None, None, math_ops.reduce_sum( grad * partial_a, axis=math_ops.range(num_sample_dimensions))) + + +def _Ndtr(x): + """Normal distribution function.""" + half_sqrt_2 = constant_op.constant( + 0.5 * np.sqrt(2.), dtype=x.dtype, name="half_sqrt_2") + w = x * half_sqrt_2 + z = math_ops.abs(w) + y = array_ops.where( + z < half_sqrt_2, + 1. + math_ops.erf(w), + array_ops.where( + w > 0., 2. - math_ops.erfc(z), math_ops.erfc(z))) + return 0.5 * y + + +@ops.RegisterGradient("StatelessParameterizedTruncatedNormal") +def _StatelessParameterizedTruncatedNormalGrad(op, grad): # pylint: disable=invalid-name + """Returns the gradient of a TruncatedNormal sample w.r.t. parameters. + + The gradient is computed using implicit differentiation + (Figurnov et al., 2018). + + Args: + op: A `StatelessParameterizedTruncatedNormal` operation. We assume that the + inputs to the operation are `shape`, `seed`, `mean`, `stddev`, `minval`, + and `maxval` tensors, and the output is the `sample` tensor. + grad: The incoming gradient `dloss / dsample` of the same shape as + `op.outputs[0]`. + + Returns: + A list of `Tensor` with derivates with respect to each parameter. 
+ + References: + Implicit Reparameterization Gradients: + [Figurnov et al., 2018] + (http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients) + ([pdf] + (http://papers.nips.cc/paper/7326-implicit-reparameterization-gradients.pdf)) + """ + shape = op.inputs[0] + mean = op.inputs[2] + stddev = op.inputs[3] + minval = op.inputs[4] + maxval = op.inputs[5] + sample = op.outputs[0] + + with ops.control_dependencies([grad]): + minval_std = (minval - mean) / stddev + maxval_std = (maxval - mean) / stddev + sample_std = (sample - mean) / stddev + + cdf_sample = (_Ndtr(sample_std) - _Ndtr(minval_std)) / ( + _Ndtr(maxval_std) - _Ndtr(minval_std)) + + # Clip to avoid zero argument for log_cdf expression + tiny = np.finfo(mean.dtype.as_numpy_dtype).tiny + eps = np.finfo(mean.dtype.as_numpy_dtype).eps + cdf_sample = clip_ops.clip_by_value(cdf_sample, tiny, 1 - eps) + + dmaxval = math_ops.exp(0.5 * (sample_std ** 2 - maxval_std ** 2) + + math_ops.log(cdf_sample)) + dminval = math_ops.exp(0.5 * (sample_std ** 2 - minval_std ** 2) + + math_ops.log1p(-cdf_sample)) + dmean = array_ops.ones_like(sample_std) + dstddev = sample_std + + # Reduce over extra dimensions caused by `shape`. We need to get the + # difference in rank from shape vs. the broadcasted rank. + + mean_shape = array_ops.shape(mean) + stddev_shape = array_ops.shape(stddev) + minval_shape = array_ops.shape(minval) + maxval_shape = array_ops.shape(maxval) + + broadcast_shape = array_ops.broadcast_dynamic_shape( + mean_shape, stddev_shape) + broadcast_shape = array_ops.broadcast_dynamic_shape( + minval_shape, broadcast_shape) + broadcast_shape = array_ops.broadcast_dynamic_shape( + maxval_shape, broadcast_shape) + extra_dims = math_ops.range( + array_ops.size(shape) - array_ops.size(broadcast_shape)) + + grad_mean = math_ops.reduce_sum(grad * dmean, axis=extra_dims) + grad_stddev = math_ops.reduce_sum(grad * dstddev, axis=extra_dims) + grad_minval = math_ops.reduce_sum(grad * dminval, axis=extra_dims) + grad_maxval = math_ops.reduce_sum(grad * dmaxval, axis=extra_dims) + + _, rmean = gen_array_ops.broadcast_gradient_args( + broadcast_shape, mean_shape) + _, rstddev = gen_array_ops.broadcast_gradient_args( + broadcast_shape, stddev_shape) + _, rminval = gen_array_ops.broadcast_gradient_args( + broadcast_shape, minval_shape) + _, rmaxval = gen_array_ops.broadcast_gradient_args( + broadcast_shape, maxval_shape) + + grad_mean = array_ops.reshape( + math_ops.reduce_sum(grad_mean, axis=rmean, keepdims=True), mean_shape) + + grad_stddev = array_ops.reshape( + math_ops.reduce_sum(grad_stddev, axis=rstddev, keepdims=True), + stddev_shape) + + grad_minval = array_ops.reshape( + math_ops.reduce_sum(grad_minval, axis=rminval, keepdims=True), + minval_shape) + + grad_maxval = array_ops.reshape( + math_ops.reduce_sum(grad_maxval, axis=rmaxval, keepdims=True), + maxval_shape) + + # The first two inputs are shape. 
+ return (None, None, grad_mean, grad_stddev, grad_minval, grad_maxval) diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py index 25fefcc514c..3e825cc4775 100644 --- a/tensorflow/python/ops/stateless_random_ops.py +++ b/tensorflow/python/ops/stateless_random_ops.py @@ -618,3 +618,73 @@ def stateless_multinomial_categorical_impl(logits, num_samples, dtype, seed): logits = ops.convert_to_tensor(logits, name="logits") return gen_stateless_random_ops.stateless_multinomial( logits, num_samples, seed, output_dtype=dtype) + + +@dispatch.add_dispatch_support +@tf_export("random.stateless_parameterized_truncated_normal") +def stateless_parameterized_truncated_normal(shape, + seed, + means=0.0, + stddevs=1.0, + minvals=-2.0, + maxvals=2.0, + name=None): + """Outputs random values from a truncated normal distribution. + + The generated values follow a normal distribution with specified mean and + standard deviation, except that values whose magnitude is more than 2 standard + deviations from the mean are dropped and re-picked. + + + Examples: + + Sample from a Truncated normal, with deferring shape parameters that + broadcast. + + >>> means = 0. + >>> stddevs = tf.math.exp(tf.random.uniform(shape=[2, 3])) + >>> minvals = [-1., -2., -1000.] + >>> maxvals = [[10000.], [1.]] + >>> y = tf.random.stateless_parameterized_truncated_normal( + ... shape=[10, 2, 3], seed=[7, 17], + ... means=means, stddevs=stddevs, minvals=minvals, maxvals=maxvals) + >>> y.shape + TensorShape([10, 2, 3]) + + Args: + shape: A 1-D integer `Tensor` or Python array. The shape of the output + tensor. + seed: A shape [2] Tensor, the seed to the random number generator. Must have + dtype `int32` or `int64`. (When using XLA, only `int32` is allowed.) + means: A `Tensor` or Python value of type `dtype`. The mean of the truncated + normal distribution. This must broadcast with `stddevs`, `minvals` and + `maxvals`, and the broadcasted shape must be dominated by `shape`. + stddevs: A `Tensor` or Python value of type `dtype`. The standard deviation + of the truncated normal distribution. This must broadcast with `means`, + `minvals` and `maxvals`, and the broadcasted shape must be dominated by + `shape`. + minvals: A `Tensor` or Python value of type `dtype`. The minimum value of + the truncated normal distribution. This must broadcast with `means`, + `stddevs` and `maxvals`, and the broadcasted shape must be dominated by + `shape`. + maxvals: A `Tensor` or Python value of type `dtype`. The maximum value of + the truncated normal distribution. This must broadcast with `means`, + `stddevs` and `minvals`, and the broadcasted shape must be dominated by + `shape`. + name: A name for the operation (optional). + + Returns: + A tensor of the specified shape filled with random truncated normal values. 
+ """ + with ops.name_scope(name, "stateless_parameterized_truncated_normal", + [shape, means, stddevs, minvals, maxvals]) as name: + shape_tensor = tensor_util.shape_tensor(shape) + means_tensor = ops.convert_to_tensor(means, name="means") + stddevs_tensor = ops.convert_to_tensor(stddevs, name="stddevs") + minvals_tensor = ops.convert_to_tensor(minvals, name="minvals") + maxvals_tensor = ops.convert_to_tensor(maxvals, name="maxvals") + rnd = gen_stateless_random_ops.stateless_parameterized_truncated_normal( + shape_tensor, seed, means_tensor, stddevs_tensor, minvals_tensor, + maxvals_tensor) + tensor_util.maybe_set_static_shape(rnd, shape) + return rnd diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt index 9c6fa7154a3..f5963f1324c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt @@ -92,6 +92,10 @@ tf_module { name: "stateless_normal" argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"\", \'None\'], " } + member_method { + name: "stateless_parameterized_truncated_normal" + argspec: "args=[\'shape\', \'seed\', \'means\', \'stddevs\', \'minvals\', \'maxvals\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'-2.0\', \'2.0\', \'None\'], " + } member_method { name: "stateless_poisson" argspec: "args=[\'shape\', \'seed\', \'lam\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 62969b5a0dd..8e5303cbea4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -4492,6 +4492,10 @@ tf_module { name: "StatelessMultinomial" argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "StatelessParameterizedTruncatedNormal" + argspec: "args=[\'shape\', \'seed\', \'means\', \'stddevs\', \'minvals\', \'maxvals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "StatelessRandomBinomial" argspec: "args=[\'shape\', \'seed\', \'counts\', \'probs\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt index e3a11ee4610..d1b8c90bfae 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt @@ -80,6 +80,10 @@ tf_module { name: "stateless_normal" argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"\", \'None\'], " } + member_method { + name: "stateless_parameterized_truncated_normal" + argspec: "args=[\'shape\', \'seed\', \'means\', \'stddevs\', \'minvals\', \'maxvals\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'-2.0\', \'2.0\', \'None\'], " + } member_method { name: "stateless_poisson" argspec: "args=[\'shape\', \'seed\', \'lam\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 62969b5a0dd..8e5303cbea4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -4492,6 +4492,10 @@ tf_module { name: "StatelessMultinomial" argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "StatelessParameterizedTruncatedNormal" + argspec: "args=[\'shape\', \'seed\', \'means\', \'stddevs\', \'minvals\', \'maxvals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "StatelessRandomBinomial" argspec: "args=[\'shape\', \'seed\', \'counts\', \'probs\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " From 81041bcd8267056741c8b4766bd8018a7270f454 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 18 Jun 2020 10:29:58 -0700 Subject: [PATCH 0506/1390] Update the documentation for Model metadata PiperOrigin-RevId: 317131186 Change-Id: Ida1cd7fc729d7d601c32d646bd650fe1aa539591 --- tensorflow/lite/g3doc/convert/metadata.md | 72 +++++++++++++++++++---- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/metadata.md b/tensorflow/lite/g3doc/convert/metadata.md index 29b2c5ce2b3..cd86333b305 100644 --- a/tensorflow/lite/g3doc/convert/metadata.md +++ b/tensorflow/lite/g3doc/convert/metadata.md @@ -9,7 +9,8 @@ input / output information. The metadata consists of both * human readable parts which convey the best practice when using the model, and * machine readable parts that can be leveraged by code generators, such as - [the TensorFlow Lite Android code generator](../guide/codegen.md). + [the TensorFlow Lite Android code generator](../guide/codegen.md) and + [the Android Studio ML Binding feature](https://developer.android.com/studio/preview/features#tensor-flow-lite-models). ## Setup the metadata tools @@ -32,11 +33,21 @@ There are three parts to the [model metadata](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs): 1. **Model information** - Overall description of the model as well as items - such as licence terms. + such as licence terms. See + [ModelMetadata](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs#L515). 2. **Input information** - Description of the inputs and pre-processing - required such as normalization. + required such as normalization. See + [SubGraphMetadata.input_tensor_metadata](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs#L500). 3. **Output information** - Description of the output and post-processing - required such as mapping to labels. + required such as mapping to labels. See + [SubGraphMetadata.output_tensor_metadata](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs#L509). + +Since TensorFlow Lite only supports single subgraph at this point, the +[TensorFlow Lite code generator](../guide/codegen.md) and +[the Android Studio ML Binding feature](https://developer.android.com/studio/preview/features#tensor-flow-lite-models) +will use `ModelMetadata.name` and `ModelMetadata.description`, instead of +`SubGraphMetadata.name` and `SubGraphMetadata.description`, when displaying +metadata and generating code. 
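For instance, the model-information part can be expressed with the Python object API generated from the schema. The snippet below is only an illustrative sketch: it assumes the generated schema module is imported as `_metadata_fb` (the exact import path depends on the release; the writer script linked under Examples below shows the imports it actually uses), and all field values are placeholders.

```python
# Illustrative sketch; `_metadata_fb` is assumed to be the Python module
# generated from metadata_schema.fbs (import path varies by release).
model_meta = _metadata_fb.ModelMetadataT()
model_meta.name = "MobileNetV1 image classifier"  # placeholder name
model_meta.description = ("Identify the most prominent object in the image "
                          "from a known set of categories.")  # placeholder
model_meta.version = "v1"
model_meta.author = "TensorFlow"
model_meta.license = ("Apache License. Version 2.0 "
                      "http://www.apache.org/licenses/LICENSE-2.0.")
```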
### Supported Input / Output types

@@ -51,6 +62,29 @@ Lite metadata:
 * Bounding box - Rectangular shape bounding boxes. The schema supports [a variety of numbering schemes](https://github.com/tensorflow/tensorflow/blob/268853ee81edab09e07f455cc918f7ef9a421485/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs#L165).

+### Pack the associated files
+
+TensorFlow Lite models may come with different associated files. For example,
+natural language models usually have vocab files that map word pieces to word
+IDs; classification models may have label files that indicate object categories.
+Without the associated files (if any), a model will not function well.
+
+The associated files can now be bundled with the model through the metadata
+Python library. The new TensorFlow Lite model becomes a zip file that contains
+both the model and the associated files. It can be unpacked with common zip
+tools. This new model format keeps using the same file extension, `.tflite`. It
+is compatible with the existing TFLite framework and Interpreter. See
+[Pack metadata and associated files into the model](#pack-metadata-and-associated-files-into-the-model)
+for more details.
+
+The associated file information can be recorded in the metadata. Depending on
+the file type and where the file is attached (i.e. `ModelMetadata`,
+`SubGraphMetadata`, and `TensorMetadata`),
+[the TensorFlow Lite Android code generator](../guide/codegen.md) may apply
+corresponding pre/post processing automatically to the object. See
+[the relevant section of each associated file type](https://github.com/tensorflow/tensorflow/blob/268853ee81edab09e07f455cc918f7ef9a421485/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs#L37-L77)
+in the schema for more details.
+
 ### Examples

Note: The export directory specified has to exist before you run the script; it
@@ -63,7 +97,9 @@ types of models here:
 Download the script
 [here](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/metadata/metadata_writer_for_image_classifier.py)
-and run the script like this:
+, which populates metadata into
+[mobilenet_v1_0.75_160_quantized.tflite](https://tfhub.dev/tensorflow/lite-model/mobilenet_v1_0.75_160_quantized/1/default/1).
+Run the script like this:

 ```sh
 python ./metadata_writer_for_image_classifier.py \
@@ -72,8 +108,11 @@ python ./metadata_writer_for_image_classifier.py \
     --export_directory=model_with_metadata
 ```

-The rest of this guide will highlight some of the key sections in the image
-classification example to illustrate the key elements.
+To populate metadata for other image classification models, add the model specs
+like
+[this](https://github.com/tensorflow/examples/blob/master/lite/examples/image_classification/metadata/metadata_writer_for_image_classifier.py#L63-L74)
+into the script. The rest of this guide will highlight some of the key sections
+in the image classification example to illustrate the key elements.
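Because the packed model described in the "Pack the associated files" section above is a standard zip archive, its contents can be listed with ordinary zip tooling. Below is a minimal sketch using Python's built-in `zipfile` module; the file name is a placeholder for a model that has already been packed with metadata and associated files.

```python
import zipfile

# "model_with_metadata.tflite" is a placeholder name; any model that already
# has metadata and associated files packed into it can be inspected this way.
with zipfile.ZipFile("model_with_metadata.tflite") as zf:
  print(zf.namelist())  # e.g. lists the label file packed alongside the model
```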
### Deep dive into the image classification example @@ -173,7 +212,7 @@ label_file.type = _metadata_fb.AssociatedFileType.TENSOR_AXIS_LABELS output_meta.associatedFiles = [label_file] ``` -#### Put it all together +#### Create the metadata Flatbuffers The following code combines the model information with the input and output information: @@ -192,8 +231,10 @@ b.Finish( metadata_buf = b.Output() ``` -Once the data structure is ready, the metadata is written into the TFLite file -via the `populate` method: +#### Pack metadata and associated files into the model + +Once the metadata Flatbuffers is created, the metadata and the label file are +written into the TFLite file via the `populate` method: ```python populator = _metadata.MetadataPopulator.with_model_file(model_file) @@ -202,9 +243,16 @@ populator.load_associated_files(["your_path_to_label_file"]) populator.populate() ``` -#### Verify the metadata +You can pack as many associated files as you want into the model through +`load_associated_files`. However, it is required to pack at least those files +documented in the metadata. In this example, packing the lable file is +mandatory. -You can read the metadata in a TFLite file using the `MetadataDisplayer`: +### Visualize the metadata + +You can use [Netron](https://github.com/lutzroeder/netron) to visualize your +metadata, or you can read the metadata from a TensorFlow Lite model into a json +format using the `MetadataDisplayer`: ```python displayer = _metadata.MetadataDisplayer.with_model_file(export_model_path) From 47582983cb1064b5bb81233db4f0adeeaa10b74d Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 18 Jun 2020 10:30:56 -0700 Subject: [PATCH 0507/1390] Fix save model issue for stateless ConvLSTM2D layer. The root cause is that ConvLSTM2D.state is a tuple rather than a list. When converting the state for save_model, the tuple is not converted to trackable objects since the states are (None, None). On the other hand, save_model requires all objects to be trackable when saving. We didn't hit this issue for keras.LSTM since its state is a list, rather than tuple. The list is auto convert to ListWrapper since list itself is mutable. This should fix https://github.com/tensorflow/tensorflow/issues/40328 and partly https://github.com/tensorflow/tensorflow/issues/38220 PiperOrigin-RevId: 317131403 Change-Id: I202d4dbdb29accc7a047d5f5a2fef08d24d05c7c --- .../saving/saved_model/layer_serialization.py | 10 ++++++++-- .../saving/saved_model/saved_model_test.py | 20 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/saving/saved_model/layer_serialization.py b/tensorflow/python/keras/saving/saved_model/layer_serialization.py index 559b6158d87..4216457bf28 100644 --- a/tensorflow/python/keras/saving/saved_model/layer_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/layer_serialization.py @@ -159,6 +159,12 @@ class RNNSavedModelSaver(LayerSavedModelSaver): objects, functions = ( super(RNNSavedModelSaver, self)._get_serialized_attributes_internal( serialization_cache)) - - objects['states'] = data_structures.wrap_or_unwrap(self.obj.states) + states = data_structures.wrap_or_unwrap(self.obj.states) + # Force the tuple into TupleWrapper which is a trackable object. The + # save/load code requires all the objects to be trackable. + # Tuple is not converted to TupleWrapper by data_structures.wrap_or_unwrap() + # if it doesn't contains any trackable objects. 
+ if isinstance(states, tuple): + states = data_structures._TupleWrapper(states) # pylint: disable=protected-access + objects['states'] = states return objects, functions diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 8d4d27e2357..3f55d5f40b5 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -773,6 +773,26 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.assertAllClose(layer.states, loaded_layer.states) self.assertAllClose(model(input_arr), loaded(input_arr)) + def testSaveStatelessConvLSTM2D(self): + data_format = 'channels_first' + batch, timesteps, channels, rows, cols = 12, 10, 8, 4, 4 + input_arr = np.ones( + (batch, timesteps, channels, rows, cols)).astype('float32') + layer = keras.layers.ConvLSTM2D( + filters=16, kernel_size=(1, 1), data_format=data_format) + x = keras.Input(batch_shape=(batch, timesteps, channels, rows, cols)) + y = layer(x) + model = keras.Model(x, y) + + predict_1 = model(input_arr) + saved_model_dir = self._save_model_dir() + tf_save.save(model, saved_model_dir) + del model + + loaded = keras_load.load(saved_model_dir) + predict_2 = loaded(input_arr) + self.assertAllClose(predict_1, predict_2) + def testSaveWithRaggedInputs(self): class EmbeddingMerger(keras.layers.Layer): From 69467b92630bb37c077eda3e411903ccddd906ff Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Thu, 18 Jun 2020 10:32:53 -0700 Subject: [PATCH 0508/1390] Make control_dep test mlir-only, since tf2 behavior introduces control-flow ops that are not supported by TOCO PiperOrigin-RevId: 317131879 Change-Id: I8cf51927a3d4985cec7d2b03930d2f553a6a3286 --- tensorflow/lite/build_def.bzl | 1 - .../lite/testing/generate_examples_lib.py | 1 - .../lite/testing/op_tests/control_dep.py | 61 ------------------- 3 files changed, 63 deletions(-) delete mode 100644 tensorflow/lite/testing/op_tests/control_dep.py diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index e6c92691b15..ad43b56743a 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -251,7 +251,6 @@ def generated_test_models(): "ceil", "concat", "constant", - # "control_dep", # b/150647401 "conv", "conv_relu", "conv_relu1", diff --git a/tensorflow/lite/testing/generate_examples_lib.py b/tensorflow/lite/testing/generate_examples_lib.py index fc92991bd57..fce2beabf45 100644 --- a/tensorflow/lite/testing/generate_examples_lib.py +++ b/tensorflow/lite/testing/generate_examples_lib.py @@ -52,7 +52,6 @@ from tensorflow.lite.testing.op_tests.cast import make_cast_tests from tensorflow.lite.testing.op_tests.ceil import make_ceil_tests from tensorflow.lite.testing.op_tests.concat import make_concat_tests from tensorflow.lite.testing.op_tests.constant import make_constant_tests -from tensorflow.lite.testing.op_tests.control_dep import make_control_dep_tests from tensorflow.lite.testing.op_tests.conv import make_conv_tests from tensorflow.lite.testing.op_tests.conv2d_transpose import make_conv2d_transpose_tests from tensorflow.lite.testing.op_tests.conv_activation import make_conv_relu_tests, make_conv_relu1_tests, make_conv_relu6_tests diff --git a/tensorflow/lite/testing/op_tests/control_dep.py b/tensorflow/lite/testing/op_tests/control_dep.py deleted file mode 100644 index bd9e369303b..00000000000 --- a/tensorflow/lite/testing/op_tests/control_dep.py +++ /dev/null @@ -1,61 +0,0 @@ -# 
Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Test configs for control_dep.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow.compat.v1 as tf -from tensorflow.lite.testing.zip_test_utils import create_tensor_data -from tensorflow.lite.testing.zip_test_utils import ExtraTocoOptions -from tensorflow.lite.testing.zip_test_utils import make_zip_of_tests -from tensorflow.lite.testing.zip_test_utils import register_make_test_function - -TEST_INPUT_DEPTH = 3 - - -@register_make_test_function() -def make_control_dep_tests(options): - """Make a set of tests that use control dependencies.""" - - test_parameters = [{ - "input_shape": [[], [1, 1, 1, 1], [1, 15, 14, 1], [3, 15, 14, 3]], - }] - - def build_graph(parameters): - input_tensor = tf.compat.v1.placeholder( - dtype=tf.float32, name="input", shape=parameters["input_shape"]) - filter_value = tf.zeros((3, 3, TEST_INPUT_DEPTH, 8), tf.float32) - assert_op = tf.compat.v1.assert_greater_equal(input_tensor, - input_tensor - 1) - with tf.control_dependencies([assert_op]): - out = tf.nn.conv2d( - input_tensor, filter_value, strides=(1, 1, 1, 1), padding="SAME") - return [input_tensor], [out] - - def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(tf.float32, parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) - - extra_toco_options = ExtraTocoOptions() - extra_toco_options.drop_control_dependency = True - make_zip_of_tests( - options, - test_parameters, - build_graph, - build_inputs, - extra_toco_options, - expected_tf_failures=3) From d1157c976bd605b36506151493439ebd2f80bded Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Thu, 18 Jun 2020 10:34:15 -0700 Subject: [PATCH 0509/1390] Adding missing kernel dependency to fix test for windows. PiperOrigin-RevId: 317132189 Change-Id: Ia59020db1c96301af4b9723170eb7808b66ef5b5 --- tensorflow/c/experimental/saved_model/core/ops/BUILD | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index 8c4c41c6d75..332b92bec45 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -78,14 +78,12 @@ tf_cc_test( srcs = [ "variable_ops_test.cc", ], - tags = [ - "no_windows", # TODO(b/159210739): Remove this tag after fixing the bug. 
- ], deps = [ ":owned_eager_context", ":owned_tensor", ":owned_tensor_handle", ":variable_ops", + "//tensorflow/core:all_kernels", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", From cf00e559d7e9649714be26173c44f9d6697e702e Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 18 Jun 2020 10:39:58 -0700 Subject: [PATCH 0510/1390] Add TPU configuration ops to default TensorFlow build PiperOrigin-RevId: 317133514 Change-Id: I33bc6d7fdbba5915bd0d1291d4e086139c07eb14 --- .bazelrc | 4 + tensorflow/BUILD | 7 + tensorflow/core/BUILD | 3 + tensorflow/core/tpu/BUILD | 14 ++ tensorflow/core/tpu/kernels/BUILD | 30 +++- .../core/tpu/kernels/tpu_compile_c_api.h | 27 +-- .../core/tpu/kernels/tpu_compile_op_common.cc | 2 +- .../core/tpu/kernels/tpu_configuration_ops.cc | 29 +-- .../core/tpu/kernels/tpu_mesh_state_c_api.h | 15 +- .../tpu/kernels/tpu_mesh_state_interface.h | 11 +- tensorflow/core/tpu/kernels/tpu_util_c_api.h | 6 + tensorflow/core/tpu/tpu_library_init_fns.inc | 166 ++++++++++++++++++ tensorflow/core/tpu/tpu_library_loader.cc | 75 ++++---- tensorflow/core/tpu/tpu_library_loader.h | 16 ++ .../core/tpu/tpu_library_loader_windows.cc | 10 ++ tensorflow/stream_executor/tpu/BUILD | 14 +- .../stream_executor/tpu/tpu_executor.cc | 129 ++++++++------ .../stream_executor/tpu/tpu_executor_c_api.h | 92 +++++++++- .../stream_executor/tpu/tpu_node_context.cc | 14 +- .../tpu/tpu_node_context_c_api.h | 12 ++ .../stream_executor/tpu/tpu_platform.cc | 64 ++++--- tensorflow/stream_executor/tpu/tpu_platform.h | 6 +- tensorflow/stream_executor/tpu/tpu_stream.h | 25 ++- tensorflow/stream_executor/tpu/tpu_timer.h | 13 +- .../tpu/tpu_transfer_manager.cc | 20 ++- tensorflow/tensorflow.bzl | 7 + 26 files changed, 627 insertions(+), 184 deletions(-) create mode 100644 tensorflow/core/tpu/tpu_library_init_fns.inc diff --git a/.bazelrc b/.bazelrc index e21a1a32917..e67c3eecc3b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -39,6 +39,7 @@ # # Feature and Third party library support options: # xla: Build TF with XLA +# tpu: Build TF with TPU support # using_cuda: CUDA is available to build system. # cuda: Build with full cuda support. # rocm: Build with AMD GPU support (rocm). @@ -180,6 +181,9 @@ build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON # AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498 build:dbg --copt -DDEBUG_BUILD +# Config to build TPU backend +build:tpu --define=with_tpu_support=true + build:tensorrt --action_env TF_NEED_TENSORRT=1 build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain diff --git a/tensorflow/BUILD b/tensorflow/BUILD index bd0619b0c05..d00608ccc98 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -467,6 +467,13 @@ config_setting( visibility = ["//visibility:public"], ) +# This flag enables experimental TPU support +config_setting( + name = "with_tpu_support", + values = {"define": "with_tpu_support=true"}, + visibility = ["//visibility:public"], +) + # Specifies via a config setting if this is a mobile build or not, makes # it easier to combine settings later. 
selects.config_setting_group( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 50f1f2527a5..7f1c1bd549b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -72,6 +72,7 @@ load( "if_ios", "if_mobile", "if_not_windows", + "if_tpu", "tf_android_core_proto_headers", "tf_cc_test", "tf_cc_test_mkl", @@ -1093,6 +1094,8 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_op_kernels", "//tensorflow/compiler/tf2tensorrt:trt_op_kernels", + ]) + if_tpu([ + "//tensorflow/core/tpu/kernels", ]), ) diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index d4bcdfd52c5..9e89cd69235 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -103,6 +103,7 @@ cc_library( ":libtftpu_header", "//tensorflow/c:tf_status", ], + alwayslink = True, ) cc_library( @@ -116,7 +117,20 @@ cc_library( deps = [ ":libtftpu_header", ":tpu_config_c_api", + ":tpu_library_init_fns", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", + "//tensorflow/core/tpu/kernels:tpu_compile_c_api_hdrs", + "//tensorflow/core/tpu/kernels:tpu_mesh_state_c_api_hdrs", + "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", + "//tensorflow/stream_executor/tpu:tpu_executor_c_api_hdrs", + "//tensorflow/stream_executor/tpu:tpu_node_context_c_api_hdrs", + "//tensorflow/stream_executor/tpu:tpu_platform_hdrs", ], ) + +cc_library( + name = "tpu_library_init_fns", + hdrs = ["tpu_library_init_fns.inc"], + visibility = ["//visibility:public"], +) diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 9ba9ad61aa0..94d3c8edf2b 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -3,6 +3,10 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc", ) +load( + "//tensorflow:tensorflow.bzl", + "tf_kernel_library", +) package( default_visibility = [ @@ -12,6 +16,12 @@ package( licenses = ["notice"], # Apache 2.0 ) +tf_kernel_library( + name = "kernels", + visibility = ["//visibility:public"], + deps = [":tpu_configuration_ops"], +) + cc_library( name = "tpu_compile_op_common", srcs = ["tpu_compile_op_common.cc"], @@ -50,7 +60,7 @@ cc_library( hdrs = ["tpu_compile_op_options.h"], ) -cc_library( +tf_kernel_library( name = "tpu_configuration_ops", srcs = ["tpu_configuration_ops.cc"], hdrs = ["tpu_configuration_ops.h"], @@ -75,12 +85,13 @@ cc_library( name = "tpu_compile_c_api_hdrs", hdrs = ["tpu_compile_c_api.h"], deps = [ - ":tpu_mesh_state_c_api", + ":tpu_mesh_state_c_api_hdrs", ":tpu_ops_common_c_api_hdrs", ":tpu_program_c_api_hdrs", - "//tensorflow/c:tf_datatype", + "//tensorflow/core/tpu:libtftpu_header", "//tensorflow/stream_executor/tpu:proto_helper", ], + alwayslink = True, ) tf_proto_library_cc( @@ -197,8 +208,10 @@ cc_library( ) cc_library( - name = "tpu_mesh_state_c_api", + name = "tpu_mesh_state_c_api_hdrs", hdrs = ["tpu_mesh_state_c_api.h"], + deps = ["//tensorflow/core/tpu:libtftpu_header"], + alwayslink = True, ) cc_library( @@ -207,12 +220,11 @@ cc_library( hdrs = ["tpu_mesh_state_interface.h"], deps = [ ":tpu_compile_c_api_hdrs", - ":tpu_mesh_state_c_api", + ":tpu_mesh_state_c_api_hdrs", "//tensorflow/compiler/xla/service", "//tensorflow/core:framework", - "//tensorflow/core/platform:errors", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", - "//tensorflow/core/tpu:tpu_config_c_api", + "//tensorflow/core/tpu:tpu_library_loader", ], ) @@ -371,13 +383,16 @@ cc_library( name = "tpu_util_c_api_hdrs", hdrs = ["tpu_util_c_api.h"], deps = [ 
+ "//tensorflow/core/tpu:libtftpu_header", "//tensorflow/stream_executor/tpu:proto_helper", ], + alwayslink = True, ) cc_library( name = "tpu_ops_common_c_api_hdrs", hdrs = ["tpu_ops_common_c_api.h"], + alwayslink = True, ) cc_library( @@ -387,6 +402,7 @@ cc_library( ":tpu_ops_common_c_api_hdrs", "//tensorflow/stream_executor/tpu:proto_helper", ], + alwayslink = True, ) cc_library( diff --git a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h index 70e3a7d2340..d1546ed9610 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_program_c_api.h" +#include "tensorflow/core/tpu/libtftpu.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" enum TpuCoreTypeEnum { @@ -44,35 +45,41 @@ struct CompilationCacheKeyProperty { extern "C" { // Returns the number of available TPU core count. -int TpuTopology_AvailableCoreCount(const XLA_TpuMeshState* mesh_state, - TpuCoreTypeEnum tpu_core_type); +TFTPU_CAPI_EXPORT int TpuTopology_AvailableCoreCount( + const XLA_TpuMeshState* mesh_state, TpuCoreTypeEnum tpu_core_type); // Creates a unique compilation cache `key` used for `put` and `get` operations. // Returned buffer is heap-allocated and must be owned. -const char* TpuCompile_CreateCompilationCacheKey( +TFTPU_CAPI_EXPORT const char* TpuCompile_CreateCompilationCacheKey( CompilationCacheKeyProperty property); // Creates a guaranteed const fingerprint. Guarantee const is normally used in // TPU inference to avoid re-copying unchanged variables onto the TPU device. // It promises the value is identical for every execution in the same session // even if the actual value changes in later executions. -uint64_t TpuCompile_CreateGuaranteedConstFingerprint(uint64_t fingerprint, - const char* data, - size_t size); +TFTPU_CAPI_EXPORT uint64_t TpuCompile_CreateGuaranteedConstFingerprint( + uint64_t fingerprint, const char* data, size_t size); // Executes the computations using XLA TPU compiler and returns TPU programs // ready for execution. -void TpuCompile_CompileAheadOfTime( - TpuSerializedProto aot_compilation_request, - XLA_TpuProgram** tpu_programs[], +TFTPU_CAPI_EXPORT void TpuCompile_CompileAheadOfTime( + TpuSerializedProto aot_compilation_request, XLA_TpuProgram** tpu_programs[], size_t* count, SE_Status* status); // Builds `DeviceAssignment` from `TpuCompileMetadata` serialized proto. 
-void TpuCompile_BuildXLADeviceAssignment( +TFTPU_CAPI_EXPORT void TpuCompile_BuildXLADeviceAssignment( TpuSerializedProto serialized_tpu_compile_metadata, const XLA_TpuMeshState* mesh_state, TpuSerializedProto* serialized_device_assignment, SE_Status* status); +struct TfTpu_CompileApiFn { + TFTPU_ADD_FN_IN_STRUCT(TpuTopology_AvailableCoreCount); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateCompilationCacheKey); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateGuaranteedConstFingerprint); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CompileAheadOfTime); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_BuildXLADeviceAssignment); +}; + } // extern "C" #endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_C_API_H_ diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc index 7ab1c9b8027..92d1fa1337e 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc @@ -353,7 +353,7 @@ Status TpuCompileOpKernelCommon::CompileTFFunctionToHlo( return; } - LogAndExit(42); + std::quick_exit(42); } /* static */ Status TpuCompileOpKernelCommon::GetDynamicShapes( diff --git a/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc b/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc index 7fa345d735c..12a3256a44f 100644 --- a/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc +++ b/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/tpu/tpu_config_c_api.h" #include "tensorflow/core/tpu/tpu_configuration.h" #include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" namespace tensorflow { @@ -97,13 +98,14 @@ void ConfigureDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, DeleteIfExists( rmgr, tpu::kTpuMeshCommonStateResourceName)); - ConfigureDistributedTpuOp_DoWork( + tpu::ConfigApiFn()->ConfigureDistributedTpuOp_DoWorkFn( num_devices_per_host.size(), num_devices_per_host.data(), &host_config_output_size, &host_config_output, status); - OP_REQUIRES_OK(ctx, rmgr->Create(rmgr->default_container(), - tpu::kTpuMeshCommonStateResourceName, - tpu::TpuMeshStateInterface::Create())); + auto* tpu_mesh = tpu::TpuMeshStateInterface::Create(); + OP_REQUIRES_OK(ctx, + rmgr->Create(rmgr->default_container(), + tpu::kTpuMeshCommonStateResourceName, tpu_mesh)); Tensor* ctx_output; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &ctx_output)); @@ -112,7 +114,8 @@ void ConfigureDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, StatusFromTF_Status(status)); TF_DeleteStatus(status); - TpuConfigurationApi_FreeCharArray(host_config_output); + + tpu::ConfigApiFn()->TpuConfigurationApi_FreeCharArrayFn(host_config_output); VLOG(1) << "ConfigureDistributedTpuOp done"; } @@ -171,7 +174,7 @@ void WaitForDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, GetTpuMeshStateInterface(rmgr, &mesh_state)); core::ScopedUnref mesh_state_unref(mesh_state); - WaitForDistributedTpuOp_DoWork( + tpu::ConfigApiFn()->WaitForDistributedTpuOp_DoWorkFn( num_hosts, num_devices_per_host, const_cast(mapping_arg.data()), mesh_state, &tpu_topology_output_size, &tpu_topology_output, status); @@ -183,7 +186,7 @@ void WaitForDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, StatusFromTF_Status(status)); TF_DeleteStatus(status); - TpuConfigurationApi_FreeCharArray(tpu_topology_output); + 
tpu::ConfigApiFn()->TpuConfigurationApi_FreeCharArrayFn(tpu_topology_output); VLOG(1) << "WaitForDistributedTpuOp done"; } @@ -196,7 +199,7 @@ void ShutdownDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, DeleteIfExists( GetTPUConfigResourceMgr(), tpu::kTpuMeshCommonStateResourceName)); - ShutdownDistributedTpuOp_DoWork(status); + tpu::ConfigApiFn()->ShutdownDistributedTpuOp_DoWorkFn(status); OP_REQUIRES_OK(ctx, StatusFromTF_Status(status)); TF_DeleteStatus(status); @@ -213,7 +216,7 @@ void InitializeHostForDistributedTpuOp::Compute(OpKernelContext* ctx) { int32_t* device_id_output; TF_Status* status = TF_NewStatus(); - InitializeHostForDistributedTpuOp_DoWork( + tpu::ConfigApiFn()->InitializeHostForDistributedTpuOp_DoWorkFn( tpu_host_config.size(), tpu_host_config.data(), enable_whole_mesh_compilations_, &device_id_output_size, &device_id_output, status); @@ -230,7 +233,7 @@ void InitializeHostForDistributedTpuOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, StatusFromTF_Status(status)); TF_DeleteStatus(status); - TpuConfigurationApi_FreeInt32Array(device_id_output); + tpu::ConfigApiFn()->TpuConfigurationApi_FreeInt32ArrayFn(device_id_output); VLOG(1) << "InitializeHostForDistributedTpuOp done"; } @@ -242,7 +245,8 @@ void SetGlobalTPUArrayOp::Compute(OpKernelContext* ctx) { auto tpu_topology = ctx->input(0).scalar()(); TF_Status* status = TF_NewStatus(); - SetGlobalTPUArrayOp_DoWork(tpu_topology.size(), tpu_topology.data(), status); + tpu::ConfigApiFn()->SetGlobalTPUArrayOp_DoWorkFn(tpu_topology.size(), + tpu_topology.data(), status); OP_REQUIRES_OK(ctx, StatusFromTF_Status(status)); TF_DeleteStatus(status); @@ -257,7 +261,8 @@ void DisconnectDistributedTpuChipsOp::Compute(OpKernelContext* ctx) { TF_Status* status = TF_NewStatus(); int32_t number_of_chips_output = 0; - DisconnectDistributedTpuChipsOp_DoWork(&number_of_chips_output, status); + tpu::ConfigApiFn()->DisconnectDistributedTpuChipsOp_DoWorkFn( + &number_of_chips_output, status); Tensor* ctx_output; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &ctx_output)); diff --git a/tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h b/tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h index 3ed65fe5cc4..a6434d7d2fd 100644 --- a/tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h @@ -15,20 +15,29 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_C_API_H_ #define TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_C_API_H_ +#include "tensorflow/core/tpu/libtftpu.h" + typedef struct XLA_TpuMeshState XLA_TpuMeshState; extern "C" { // Creates a new TPU mesh state object. -XLA_TpuMeshState* TpuMeshState_Create(); +TFTPU_CAPI_EXPORT XLA_TpuMeshState* TpuMeshState_Create(); // Deletes the given TPU `mesh_state` object. Once deleted the object is // unusable. -void TpuMeshState_Free(XLA_TpuMeshState* mesh_state); +TFTPU_CAPI_EXPORT void TpuMeshState_Free(XLA_TpuMeshState* mesh_state); // Returns a pointer to an opaque mesh data structure used internally. 
-void* TpuMeshState_MeshCommonState(XLA_TpuMeshState* mesh_state); +TFTPU_CAPI_EXPORT void* TpuMeshState_MeshCommonState( + XLA_TpuMeshState* mesh_state); } // extern "C" +struct TfTpu_MeshStateApiFn { + TFTPU_ADD_FN_IN_STRUCT(TpuMeshState_Create); + TFTPU_ADD_FN_IN_STRUCT(TpuMeshState_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuMeshState_MeshCommonState); +}; + #endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_MESH_STATE_C_API_H_ diff --git a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h index 34202a78718..3eff3be4915 100644 --- a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h +++ b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" namespace tensorflow { @@ -38,19 +39,19 @@ class TpuMeshStateInterface : public tensorflow::ResourceBase { ~TpuMeshStateInterface() override { if (mesh_state_ != nullptr) { - TpuMeshState_Free(mesh_state_); + MeshStateApiFn()->TpuMeshState_FreeFn(mesh_state_); } } static TpuMeshStateInterface* Create() { - return new TpuMeshStateInterface(TpuMeshState_Create()); + return new TpuMeshStateInterface(MeshStateApiFn()->TpuMeshState_CreateFn()); } const XLA_TpuMeshState* data() const { return mesh_state_; } tensorflow::TpuMeshCommonState* mesh_common_state() const { return static_cast( - TpuMeshState_MeshCommonState(mesh_state_)); + MeshStateApiFn()->TpuMeshState_MeshCommonStateFn(mesh_state_)); } // Returns whether we should include the device assignment as a static field @@ -62,8 +63,8 @@ class TpuMeshStateInterface : public tensorflow::ResourceBase { // Static device assignment enables XLA to perform certain optimization when // all cores are used in the replicated computation. return metadata.num_cores_per_replica() * metadata.num_replicas() == - TpuTopology_AvailableCoreCount(mesh_state_, - tpu_core_type); + CompileApiFn()->TpuTopology_AvailableCoreCountFn(mesh_state_, + tpu_core_type); } string DebugString() const override { return "TpuMeshStateInterface"; } diff --git a/tensorflow/core/tpu/kernels/tpu_util_c_api.h b/tensorflow/core/tpu/kernels/tpu_util_c_api.h index 4d992449cfc..32b946d56c9 100644 --- a/tensorflow/core/tpu/kernels/tpu_util_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_util_c_api.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_C_API_H_ #define TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_C_API_H_ +#include "tensorflow/core/tpu/libtftpu.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" typedef struct SE_Status SE_Status; @@ -32,4 +33,9 @@ void TpuCompile_ToTpuShapeRepresentation( } // extern "C" +struct TfTpu_UtilApiFn { + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_IsTpuCompilationEnabled); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_ToTpuShapeRepresentation); +}; + #endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_UTIL_C_API_H_ diff --git a/tensorflow/core/tpu/tpu_library_init_fns.inc b/tensorflow/core/tpu/tpu_library_init_fns.inc new file mode 100644 index 00000000000..e21d7f195ad --- /dev/null +++ b/tensorflow/core/tpu/tpu_library_init_fns.inc @@ -0,0 +1,166 @@ +namespace { + +tensorflow::Status SetTpuConfigStructFns(void* library_handle) { + auto* config_fn = tensorflow::tpu::ConfigApiFn(); + + TFTPU_SET_FN(config_fn, ConfigureDistributedTpuOp_DoWork); + TFTPU_SET_FN(config_fn, WaitForDistributedTpuOp_DoWork); + TFTPU_SET_FN(config_fn, ShutdownDistributedTpuOp_DoWork); + TFTPU_SET_FN(config_fn, InitializeHostForDistributedTpuOp_DoWork); + TFTPU_SET_FN(config_fn, SetGlobalTPUArrayOp_DoWork); + TFTPU_SET_FN(config_fn, DisconnectDistributedTpuChipsOp_DoWork); + TFTPU_SET_FN(config_fn, TpuConfigurationApi_FreeCharArray); + TFTPU_SET_FN(config_fn, TpuConfigurationApi_FreeInt32Array); + + return tensorflow::Status::OK(); +} + +tensorflow::Status SetTpuMeshStateStructFns(void* library_handle) { + auto* mesh_state_fn = tensorflow::tpu::MeshStateApiFn(); + + TFTPU_SET_FN(mesh_state_fn, TpuMeshState_Create); + TFTPU_SET_FN(mesh_state_fn, TpuMeshState_Free); + TFTPU_SET_FN(mesh_state_fn, TpuMeshState_MeshCommonState); + + return tensorflow::Status::OK(); +} + +tensorflow::Status SetCompileStructFn(void* library_handle) { + auto* compile_fn = tensorflow::tpu::CompileApiFn(); + + TFTPU_SET_FN(compile_fn, TpuTopology_AvailableCoreCount); + TFTPU_SET_FN(compile_fn, TpuCompile_CreateCompilationCacheKey); + TFTPU_SET_FN(compile_fn, TpuCompile_CreateGuaranteedConstFingerprint); + TFTPU_SET_FN(compile_fn, TpuCompile_CompileAheadOfTime); + TFTPU_SET_FN(compile_fn, TpuCompile_BuildXLADeviceAssignment); + + return tensorflow::Status::OK(); +} + +tensorflow::Status SetExecutorStructFn(void* library_handle) { + auto* executor_fn = tensorflow::tpu::ExecutorApiFn(); + + TFTPU_SET_FN(executor_fn, TpuPlatform_New); + TFTPU_SET_FN(executor_fn, TpuPlatform_Free); + TFTPU_SET_FN(executor_fn, TpuPlatform_Initialize); + TFTPU_SET_FN(executor_fn, TpuPlatform_Initialized); + TFTPU_SET_FN(executor_fn, TpuPlatform_GetExecutor); + TFTPU_SET_FN(executor_fn, TpuPlatform_Id); + TFTPU_SET_FN(executor_fn, TpuPlatform_VisibleDeviceCount); + TFTPU_SET_FN(executor_fn, TpuPlatform_TpuMemoryLimit); + TFTPU_SET_FN(executor_fn, TpuPlatform_ShouldRegisterTpuDeviceToDeviceCopy); + + TFTPU_SET_FN(executor_fn, TpuExecutor_Init); + TFTPU_SET_FN(executor_fn, TpuExecutor_Free); + TFTPU_SET_FN(executor_fn, TpuExecutor_PlatformDeviceCount); + TFTPU_SET_FN(executor_fn, TpuExecutor_Allocate); + TFTPU_SET_FN(executor_fn, TpuExecutor_Deallocate); + TFTPU_SET_FN(executor_fn, TpuExecutor_GetAllocatorStats); + TFTPU_SET_FN(executor_fn, TpuExecutor_DeviceMemoryUsage); + TFTPU_SET_FN(executor_fn, TpuExecutor_AllocateStream); + TFTPU_SET_FN(executor_fn, TpuExecutor_DeallocateStream); + TFTPU_SET_FN(executor_fn, TpuExecutor_CreateStreamDependency); + TFTPU_SET_FN(executor_fn, TpuExecutor_GetStatus); + TFTPU_SET_FN(executor_fn, TpuExecutor_AllocateEvent); + 
TFTPU_SET_FN(executor_fn, TpuExecutor_DeallocateEvent); + TFTPU_SET_FN(executor_fn, TpuExecutor_PollForEventStatus); + TFTPU_SET_FN(executor_fn, TpuExecutor_RecordEvent); + TFTPU_SET_FN(executor_fn, TpuExecutor_WaitForEvent); + TFTPU_SET_FN(executor_fn, TpuExecutor_AllocateTimer); + TFTPU_SET_FN(executor_fn, TpuExecutor_DeallocateTimer); + TFTPU_SET_FN(executor_fn, TpuExecutor_StartTimer); + TFTPU_SET_FN(executor_fn, TpuExecutor_StopTimer); + TFTPU_SET_FN(executor_fn, TpuExecutor_SynchronousMemcpyToHost); + TFTPU_SET_FN(executor_fn, TpuExecutor_SynchronousMemcpyFromHost); + TFTPU_SET_FN(executor_fn, TpuExecutor_MemcpyToHost); + TFTPU_SET_FN(executor_fn, TpuExecutor_MemcpyFromHost); + TFTPU_SET_FN(executor_fn, TpuExecutor_EnqueueInfeed); + TFTPU_SET_FN(executor_fn, TpuExecutor_DequeueOutfeed); + TFTPU_SET_FN(executor_fn, TpuExecutor_WaitForInfeedReady); + TFTPU_SET_FN(executor_fn, TpuExecutor_WaitForOutfeedReady); + TFTPU_SET_FN(executor_fn, TpuExecutor_BlockHostUntilDone); + TFTPU_SET_FN(executor_fn, TpuExecutor_BlockUntilDoneOrFailed); + TFTPU_SET_FN(executor_fn, TpuExecutor_SyncAndForgetFailedStreams); + TFTPU_SET_FN(executor_fn, TpuExecutor_SynchronizeAllActivity); + + TFTPU_SET_FN(executor_fn, TpuStream_New); + TFTPU_SET_FN(executor_fn, TpuStream_Free); + TFTPU_SET_FN(executor_fn, TpuStream_Stream); + TFTPU_SET_FN(executor_fn, TpuStream_Status); + TFTPU_SET_FN(executor_fn, TpuStream_IsSameSharedMemoryLocation); + TFTPU_SET_FN(executor_fn, TpuStream_TpuEnqueueOnDeviceSendRecvLocal); + + TFTPU_SET_FN(executor_fn, TpuEvent_New); + TFTPU_SET_FN(executor_fn, TpuEvent_Free); + + TFTPU_SET_FN(executor_fn, TpuTimer_New); + TFTPU_SET_FN(executor_fn, TpuTimer_Free); + TFTPU_SET_FN(executor_fn, TpuTimer_Nanoseconds); + TFTPU_SET_FN(executor_fn, TpuTimer_Microseconds); + + TFTPU_SET_FN(executor_fn, TpuStatus_New); + TFTPU_SET_FN(executor_fn, TpuStatus_Create); + TFTPU_SET_FN(executor_fn, TpuStatus_Free); + TFTPU_SET_FN(executor_fn, TpuStatus_Message); + TFTPU_SET_FN(executor_fn, TpuStatus_Code); + TFTPU_SET_FN(executor_fn, TpuStatus_Ok); + + TFTPU_SET_FN(executor_fn, TpuStreamExecutorConfig_Default); + TFTPU_SET_FN(executor_fn, TpuStreamExecutorConfig_SetOrdinal); + TFTPU_SET_FN(executor_fn, TpuStreamExecutorConfig_Free); + + TFTPU_SET_FN(executor_fn, TpuDeviceDescription_New); + TFTPU_SET_FN(executor_fn, TpuDeviceDescription_Free); + + TFTPU_SET_FN(executor_fn, TpuExecutor_CreateDeviceDescription); + TFTPU_SET_FN(executor_fn, TpuExecutor_NewDeviceOptions); + TFTPU_SET_FN(executor_fn, TpuExecutor_FreeDeviceOptions); + TFTPU_SET_FN(executor_fn, TpuExecutor_HostCallback); + + TFTPU_SET_FN(executor_fn, TpuTransferManager_New); + TFTPU_SET_FN(executor_fn, TpuTransferManager_Free); + TFTPU_SET_FN(executor_fn, TpuTransferManager_PlatformId); + TFTPU_SET_FN(executor_fn, TpuTransferManager_HostShapeToDeviceShape); + TFTPU_SET_FN(executor_fn, TpuTransferManager_TransferLiteralToDeviceAsync); + TFTPU_SET_FN(executor_fn, TpuTransferManager_TransferLiteralFromDevice); + TFTPU_SET_FN(executor_fn, TpuTransferManager_GetByteSizeRequirement); + TFTPU_SET_FN(executor_fn, TpuTransferManager_WriteSingleTupleIndexTable); + + TFTPU_SET_FN(executor_fn, TpuComputationPlacer_New); + TFTPU_SET_FN(executor_fn, TpuComputationPlacer_Free); + + return tensorflow::Status::OK(); +} + +tensorflow::Status SetTpuNodeContextStructFns(void* library_handle) { + auto* node_context_fn = tensorflow::tpu::NodeContextApiFn(); + + TFTPU_SET_FN(node_context_fn, TpuNodeContext_Create); + TFTPU_SET_FN(node_context_fn, TpuNodeContext_Free); + 
TFTPU_SET_FN(node_context_fn, TpuNodeContext_StopChipHeartbeats); + TFTPU_SET_FN(node_context_fn, TpuNodeContext_CloseTpuHost); + + return tensorflow::Status::OK(); +} + +tensorflow::Status SetTpuUtilStructFns(void* library_handle) { + auto* util_fn = tensorflow::tpu::UtilApiFn(); + + TFTPU_SET_FN(util_fn, TpuCompile_IsTpuCompilationEnabled); + TFTPU_SET_FN(util_fn, TpuCompile_ToTpuShapeRepresentation); + + return tensorflow::Status::OK(); +} + +tensorflow::Status InitializeTpuStructFns(void* library_handle) { + TF_RETURN_IF_ERROR(SetTpuConfigStructFns(library_handle)); + TF_RETURN_IF_ERROR(SetTpuMeshStateStructFns(library_handle)); + TF_RETURN_IF_ERROR(SetCompileStructFn(library_handle)); + TF_RETURN_IF_ERROR(SetExecutorStructFn(library_handle)); + TF_RETURN_IF_ERROR(SetTpuNodeContextStructFns(library_handle)); + TF_RETURN_IF_ERROR(SetTpuUtilStructFns(library_handle)); + + return tensorflow::Status::OK(); +} + +} // namespace \ No newline at end of file diff --git a/tensorflow/core/tpu/tpu_library_loader.cc b/tensorflow/core/tpu/tpu_library_loader.cc index c89de142a9f..834b86e68a7 100644 --- a/tensorflow/core/tpu/tpu_library_loader.cc +++ b/tensorflow/core/tpu/tpu_library_loader.cc @@ -13,16 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// TODO(frankchn): Rename to `tpu_api_dlsym_initializer` or similar. + #include "tensorflow/core/tpu/tpu_library_loader.h" #include -#define TFTPU_SET_FN(Struct, FnName) \ - Struct->FnName##Fn = \ - reinterpret_cast(dlsym(library_handle, #FnName)); - #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" +#include "tensorflow/stream_executor/tpu/tpu_platform.h" + +#define TFTPU_SET_FN(Struct, FnName) \ + Struct->FnName##Fn = \ + reinterpret_cast(dlsym(library_handle, #FnName)); \ + if (!(Struct->FnName##Fn)) { \ + LOG(ERROR) << #FnName " not available in this library."; \ + } // Reminder: Update tpu_library_loader_windows.cc if you are adding new publicly // visible methods. @@ -30,28 +37,7 @@ limitations under the License. 
namespace tensorflow { namespace tpu { -Status SetTpuInitializeStructFns(void* library_handle) { - auto* base_fn = InitializeApiFn(); - - TFTPU_SET_FN(base_fn, TfTpu_Initialize); - - return Status::OK(); -} - -Status SetTpuConfigStructFns(void* library_handle) { - auto* config_fn = ConfigApiFn(); - - TFTPU_SET_FN(config_fn, ConfigureDistributedTpuOp_DoWork); - TFTPU_SET_FN(config_fn, WaitForDistributedTpuOp_DoWork); - TFTPU_SET_FN(config_fn, ShutdownDistributedTpuOp_DoWork); - TFTPU_SET_FN(config_fn, InitializeHostForDistributedTpuOp_DoWork); - TFTPU_SET_FN(config_fn, SetGlobalTPUArrayOp_DoWork); - TFTPU_SET_FN(config_fn, DisconnectDistributedTpuChipsOp_DoWork); - TFTPU_SET_FN(config_fn, TpuConfigurationApi_FreeCharArray); - TFTPU_SET_FN(config_fn, TpuConfigurationApi_FreeInt32Array); - - return Status::OK(); -} +#include "tensorflow/core/tpu/tpu_library_init_fns.inc" TfTpu_BaseFn* InitializeApiFn() { static TfTpu_BaseFn base_fn; @@ -63,19 +49,48 @@ TfTpu_ConfigApiFn* ConfigApiFn() { return &config_api_fn; } +TfTpu_MeshStateApiFn* MeshStateApiFn() { + static TfTpu_MeshStateApiFn mesh_state_api_fn; + return &mesh_state_api_fn; +} + +TfTpu_CompileApiFn* CompileApiFn() { + static TfTpu_CompileApiFn compile_api_fn; + return &compile_api_fn; +} + +TfTpu_ExecutorApiFn* ExecutorApiFn() { + static TfTpu_ExecutorApiFn executor_api_fn; + return &executor_api_fn; +} + +TfTpu_NodeContextApiFn* NodeContextApiFn() { + static TfTpu_NodeContextApiFn node_context_api_fn; + return &node_context_api_fn; +} + +TfTpu_UtilApiFn* UtilApiFn() { + static TfTpu_UtilApiFn util_api_fn; + return &util_api_fn; +} + Status InitializeTpuLibrary(void* library_handle) { bool shared_object_loaded = true; if (library_handle == nullptr) { - library_handle = dlopen(nullptr, RTLD_LAZY); + library_handle = dlopen(nullptr, RTLD_NOW); shared_object_loaded = false; } - TF_RETURN_IF_ERROR(SetTpuInitializeStructFns(library_handle)); - TF_RETURN_IF_ERROR(SetTpuConfigStructFns(library_handle)); + TF_RETURN_IF_ERROR(InitializeTpuStructFns(library_handle)); if (shared_object_loaded) { + // TODO(frankchn): Make initialization actually work // Initialize TPU platform when the platform code is loaded from a library. - InitializeApiFn()->TfTpu_InitializeFn(); + // InitializeApiFn()->TfTpu_InitializeFn(); + + // We should only register the TPU platform when the library is loaded. + // TODO(frankchn): Resolve the circular dependency and register the platform + // RegisterTpuPlatform(); } return Status::OK(); diff --git a/tensorflow/core/tpu/tpu_library_loader.h b/tensorflow/core/tpu/tpu_library_loader.h index a51948cf719..ba6c324707d 100644 --- a/tensorflow/core/tpu/tpu_library_loader.h +++ b/tensorflow/core/tpu/tpu_library_loader.h @@ -17,8 +17,13 @@ limitations under the License. #define TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/core/tpu/libtftpu.h" #include "tensorflow/core/tpu/tpu_config_c_api.h" +#include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" +#include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" // LINT.IfChange namespace tensorflow { @@ -26,10 +31,21 @@ namespace tpu { Status InitializeTpuLibrary(void* library_handle); +// TODO(frankchn): Separate out API functions from the loader. 
TfTpu_BaseFn* InitializeApiFn(); TfTpu_ConfigApiFn* ConfigApiFn(); +TfTpu_MeshStateApiFn* MeshStateApiFn(); + +TfTpu_CompileApiFn* CompileApiFn(); + +TfTpu_ExecutorApiFn* ExecutorApiFn(); + +TfTpu_NodeContextApiFn* NodeContextApiFn(); + +TfTpu_UtilApiFn* UtilApiFn(); + } // namespace tpu } // namespace tensorflow // LINT.ThenChange(//tensorflow/core/tpu/tpu_library_loader_windows.cc) diff --git a/tensorflow/core/tpu/tpu_library_loader_windows.cc b/tensorflow/core/tpu/tpu_library_loader_windows.cc index e7c25df415e..7cf1b5cdb1d 100644 --- a/tensorflow/core/tpu/tpu_library_loader_windows.cc +++ b/tensorflow/core/tpu/tpu_library_loader_windows.cc @@ -27,6 +27,16 @@ TfTpu_BaseFn* InitializeApiFn() { return nullptr; } TfTpu_ConfigApiFn* ConfigApiFn() { return nullptr; } +TfTpu_MeshStateApiFn* MeshStateApiFn() { return nullptr; } + +TfTpu_CompileApiFn* CompileApiFn() { return nullptr; } + +TfTpu_ExecutorApiFn* ExecutorApiFn() { return nullptr; } + +TfTpu_NodeContextApiFn* NodeContextApiFn() { return nullptr; } + +TfTpu_UtilApiFn* UtilApiFn() { return nullptr; } + Status InitializeTpuLibrary(void* library_handle) { return errors::Unimplemented( "Loading TPU library is not supported on Windows."); diff --git a/tensorflow/stream_executor/tpu/BUILD b/tensorflow/stream_executor/tpu/BUILD index 964f36b82c7..bf88e9809d0 100644 --- a/tensorflow/stream_executor/tpu/BUILD +++ b/tensorflow/stream_executor/tpu/BUILD @@ -11,20 +11,25 @@ package( cc_library( name = "tpu_executor_c_api_hdrs", hdrs = ["tpu_executor_c_api.h"], + visibility = ["//visibility:public"], deps = [ "//tensorflow/c:tf_attrtype", - "//tensorflow/c:tf_datatype", "//tensorflow/c:tf_status", + "//tensorflow/core/tpu:libtftpu_header", "//tensorflow/core/tpu/kernels:tpu_ops_common_c_api_hdrs", ], + alwayslink = True, ) cc_library( name = "tpu_node_context_c_api_hdrs", hdrs = ["tpu_node_context_c_api.h"], + visibility = ["//visibility:public"], deps = [ ":tpu_executor_c_api_hdrs", + "//tensorflow/core/tpu:libtftpu_header", ], + alwayslink = True, ) cc_library( @@ -65,6 +70,7 @@ cc_library( ":status_helper", ":tpu_executor_c_api_hdrs", ":tpu_stream_interface", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor:stream", ], ) @@ -75,6 +81,7 @@ cc_library( deps = [ ":tpu_executor_c_api_hdrs", "//tensorflow/core/platform:types", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor:stream", ], ) @@ -94,6 +101,7 @@ cc_library( ":tpu_timer", "//tensorflow/c:tf_status", "//tensorflow/core:lib", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/container:flat_hash_map", @@ -143,6 +151,7 @@ cc_library( "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:framework", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/memory", @@ -160,6 +169,7 @@ cc_library( ":tpu_platform_interface", "//tensorflow/c:tf_status", "//tensorflow/core/platform:types", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor:stream", "@com_google_absl//absl/container:flat_hash_map", ], @@ -191,6 +201,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", + "//tensorflow/core/tpu:tpu_library_loader", 
"//tensorflow/stream_executor:stream", ], ) @@ -217,6 +228,7 @@ cc_library( "//tensorflow/core/platform:types", "//tensorflow/stream_executor:multi_platform_manager", "//tensorflow/stream_executor:stream_executor_headers", + "//tensorflow/stream_executor:stream_executor_pimpl", ], ) diff --git a/tensorflow/stream_executor/tpu/tpu_executor.cc b/tensorflow/stream_executor/tpu/tpu_executor.cc index 03cab5801e6..cb1410880eb 100644 --- a/tensorflow/stream_executor/tpu/tpu_executor.cc +++ b/tensorflow/stream_executor/tpu/tpu_executor.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" @@ -33,63 +34,68 @@ namespace { using ::stream_executor::port::Status; } // namespace -TpuExecutor::~TpuExecutor() { TpuExecutor_Free(executor_); } +TpuExecutor::~TpuExecutor() { + tpu::ExecutorApiFn()->TpuExecutor_FreeFn(executor_); +} Status TpuExecutor::Init(int device_ordinal, ::stream_executor::DeviceOptions device_options) { StatusHelper status; SE_DeviceOptions* options = - TpuExecutor_NewDeviceOptions(device_options.flags()); - TpuExecutor_Init(executor_, device_ordinal, options, status.c_status); - TpuExecutor_FreeDeviceOptions(options); + tpu::ExecutorApiFn()->TpuExecutor_NewDeviceOptionsFn( + device_options.flags()); + tpu::ExecutorApiFn()->TpuExecutor_InitFn(executor_, device_ordinal, options, + status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_FreeDeviceOptionsFn(options); return status.status(); } int TpuExecutor::PlatformDeviceCount() { - return TpuExecutor_PlatformDeviceCount(executor_); + return tpu::ExecutorApiFn()->TpuExecutor_PlatformDeviceCountFn(executor_); } void TpuExecutor::SyncAndForgetFailedStreams() { - TpuExecutor_SyncAndForgetFailedStreams(executor_); + tpu::ExecutorApiFn()->TpuExecutor_SyncAndForgetFailedStreamsFn(executor_); } bool TpuExecutor::SynchronizeAllActivity() { - return TpuExecutor_SynchronizeAllActivity(executor_); + return tpu::ExecutorApiFn()->TpuExecutor_SynchronizeAllActivityFn(executor_); } Status TpuExecutor::BlockHostUntilDone(Stream* stream) { StatusHelper status; - TpuExecutor_BlockHostUntilDone( + tpu::ExecutorApiFn()->TpuExecutor_BlockHostUntilDoneFn( executor_, stream_map().at(stream->implementation()), status.c_status); return status.status(); } Status TpuExecutor::BlockUntilDoneOrFailed() { StatusHelper status; - TpuExecutor_BlockUntilDoneOrFailed(executor_, status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_BlockUntilDoneOrFailedFn(executor_, + status.c_status); return status.status(); } Status TpuExecutor::GetStatus(Stream* stream) { StatusHelper status; - TpuExecutor_GetStatus(executor_, stream_map().at(stream->implementation()), - status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_GetStatusFn( + executor_, stream_map().at(stream->implementation()), status.c_status); return status.status(); } bool TpuExecutor::AllocateStream(Stream* stream) { - return TpuExecutor_AllocateStream(executor_, - stream_map().at(stream->implementation())); + return tpu::ExecutorApiFn()->TpuExecutor_AllocateStreamFn( + executor_, stream_map().at(stream->implementation())); } void TpuExecutor::DeallocateStream(Stream* stream) { - TpuExecutor_DeallocateStream(executor_, - stream_map().at(stream->implementation())); + tpu::ExecutorApiFn()->TpuExecutor_DeallocateStreamFn( + 
executor_, stream_map().at(stream->implementation())); stream_map().erase(stream->implementation()); } bool TpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) { - return TpuExecutor_CreateStreamDependency( + return tpu::ExecutorApiFn()->TpuExecutor_CreateStreamDependencyFn( executor_, stream_map().at(dependent->implementation()), stream_map().at(other->implementation())); } @@ -104,15 +110,15 @@ bool TpuExecutor::AllocateTimer(Timer* timer) { return true; } void TpuExecutor::DeallocateTimer(Timer* timer) {} bool TpuExecutor::StartTimer(Stream* stream, ::stream_executor::Timer* timer) { - return TpuExecutor_StartTimer(executor_, - stream_map().at(stream->implementation()), - timer_map_.at(timer->implementation())); + return tpu::ExecutorApiFn()->TpuExecutor_StartTimerFn( + executor_, stream_map().at(stream->implementation()), + timer_map_.at(timer->implementation())); } bool TpuExecutor::StopTimer(Stream* stream, ::stream_executor::Timer* timer) { - return TpuExecutor_StopTimer(executor_, - stream_map().at(stream->implementation()), - timer_map_.at(timer->implementation())); + return tpu::ExecutorApiFn()->TpuExecutor_StopTimerFn( + executor_, stream_map().at(stream->implementation()), + timer_map_.at(timer->implementation())); } stream_executor::Event::Status TpuExecutor::PollForEventStatus( @@ -148,7 +154,7 @@ Status TpuExecutor::WaitForEvent(Stream* stream, // Called by Timer::Timer std::unique_ptr<::stream_executor::internal::TimerInterface> TpuExecutor::GetTimerImplementation() { - SE_Timer* tpu_timer = TpuTimer_New(executor_); + SE_Timer* tpu_timer = tpu::ExecutorApiFn()->TpuTimer_NewFn(executor_); auto ptr = absl::make_unique(tpu_timer); timer_map_[ptr.get()] = tpu_timer; return ptr; @@ -157,7 +163,7 @@ TpuExecutor::GetTimerImplementation() { // Called by Stream::Stream std::unique_ptr<::stream_executor::internal::StreamInterface> TpuExecutor::GetStreamImplementation() { - SE_Stream* tpu_stream = TpuStream_New(executor_); + SE_Stream* tpu_stream = tpu::ExecutorApiFn()->TpuStream_NewFn(executor_); auto ptr = absl::make_unique(tpu_stream); stream_map()[ptr.get()] = tpu_stream; return ptr; @@ -166,34 +172,35 @@ TpuExecutor::GetStreamImplementation() { // Called by Event::Event std::unique_ptr<::stream_executor::internal::EventInterface> TpuExecutor::CreateEventImplementation() { - SE_Event* tpu_event = TpuEvent_New(executor_); + SE_Event* tpu_event = tpu::ExecutorApiFn()->TpuEvent_NewFn(executor_); auto ptr = absl::make_unique(tpu_event); event_map()[ptr.get()] = tpu_event; return ptr; } DeviceMemoryBase TpuExecutor::Allocate(uint64 size, int64 memory_space) { - SE_DeviceMemoryBase se_base = - TpuExecutor_Allocate(executor_, size, memory_space); + SE_DeviceMemoryBase se_base = tpu::ExecutorApiFn()->TpuExecutor_AllocateFn( + executor_, size, memory_space); return TpuConversions::SE_DeviceMemoryBaseToDeviceMemoryBase(se_base); } void TpuExecutor::Deallocate(const DeviceMemoryBase& memory) { SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(memory); - TpuExecutor_Deallocate(executor_, &se_base); + tpu::ExecutorApiFn()->TpuExecutor_DeallocateFn(executor_, &se_base); } void TpuExecutor::Deallocate(DeviceMemoryBase* memory) { SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(*memory); - TpuExecutor_Deallocate(executor_, &se_base); + tpu::ExecutorApiFn()->TpuExecutor_DeallocateFn(executor_, &se_base); } bool TpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const { int64_t _free; int64_t _total; - if 
(TpuExecutor_DeviceMemoryUsage(executor_, &_free, &_total)) { + if (tpu::ExecutorApiFn()->TpuExecutor_DeviceMemoryUsageFn(executor_, &_free, + &_total)) { *free = _free; *total = _total; return true; @@ -204,7 +211,8 @@ bool TpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const { absl::optional TpuExecutor::GetAllocatorStats() { SE_AllocatorStats c_stats; - if (TpuExecutor_GetAllocatorStats(executor_, &c_stats)) { + if (tpu::ExecutorApiFn()->TpuExecutor_GetAllocatorStatsFn(executor_, + &c_stats)) { ::stream_executor::AllocatorStats stats; stats.num_allocs = c_stats.num_allocs; stats.bytes_in_use = c_stats.bytes_in_use; @@ -226,31 +234,33 @@ TpuExecutor::GetAllocatorStats() { Status TpuExecutor::WaitForInfeedReady(int32 infeed_queue_index) { StatusHelper status; - TpuExecutor_WaitForInfeedReady(executor_, infeed_queue_index, - status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_WaitForInfeedReadyFn( + executor_, infeed_queue_index, status.c_status); return status.status(); } Status TpuExecutor::WaitForOutfeedReady(int32 outfeed_queue_index) { StatusHelper status; - TpuExecutor_WaitForOutfeedReady(executor_, outfeed_queue_index, - status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_WaitForOutfeedReadyFn( + executor_, outfeed_queue_index, status.c_status); return status.status(); } void TpuExecutor::DequeueOutfeed(int32 outfeed_queue_index, absl::Span bytes, StatusCallback done) { StatusHelper status; - TpuExecutor_DequeueOutfeed(executor_, outfeed_queue_index, bytes.data(), - bytes.size(), status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_DequeueOutfeedFn( + executor_, outfeed_queue_index, bytes.data(), bytes.size(), + status.c_status); done(status.status()); } Status TpuExecutor::EnqueueInfeed(int32 infeed_queue_index, absl::Span bytes) { StatusHelper status; - TpuExecutor_EnqueueInfeed(executor_, infeed_queue_index, bytes.data(), - bytes.size(), status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_EnqueueInfeedFn( + executor_, infeed_queue_index, bytes.data(), bytes.size(), + status.c_status); return status.status(); } @@ -259,9 +269,9 @@ bool TpuExecutor::Memcpy(Stream* stream, void* host_dst, uint64 size) { SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(device_src); - return TpuExecutor_MemcpyToHost(executor_, - stream_map().at(stream->implementation()), - host_dst, &se_base, size); + return tpu::ExecutorApiFn()->TpuExecutor_MemcpyToHostFn( + executor_, stream_map().at(stream->implementation()), host_dst, &se_base, + size); } bool TpuExecutor::Memcpy(Stream* stream, @@ -269,9 +279,9 @@ bool TpuExecutor::Memcpy(Stream* stream, const void* host_src, uint64 size) { SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(*device_dst); - return TpuExecutor_MemcpyFromHost(executor_, - stream_map().at(stream->implementation()), - &se_base, host_src, size); + return tpu::ExecutorApiFn()->TpuExecutor_MemcpyFromHostFn( + executor_, stream_map().at(stream->implementation()), &se_base, host_src, + size); } Status TpuExecutor::SynchronousMemcpy( @@ -280,8 +290,8 @@ Status TpuExecutor::SynchronousMemcpy( StatusHelper status; SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(*device_dst); - TpuExecutor_SynchronousMemcpyFromHost(executor_, &se_base, host_src, size, - status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_SynchronousMemcpyFromHostFn( + executor_, &se_base, host_src, size, status.c_status); return status.status(); } @@ -291,8 +301,8 @@ Status TpuExecutor::SynchronousMemcpy( 
StatusHelper status; SE_DeviceMemoryBase se_base = TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(device_src); - TpuExecutor_SynchronousMemcpyToHost(executor_, host_dst, &se_base, size, - status.c_status); + tpu::ExecutorApiFn()->TpuExecutor_SynchronousMemcpyToHostFn( + executor_, host_dst, &se_base, size, status.c_status); return status.status(); } @@ -316,8 +326,8 @@ struct HostCallbackContext { SE_Status* HostCallbackTrampoline(void* ctx) { HostCallbackContext* host_ctx = reinterpret_cast(ctx); Status status = host_ctx->callback(); - SE_Status* c_status = - TpuStatus_Create(status.code(), status.error_message().c_str()); + SE_Status* c_status = tpu::ExecutorApiFn()->TpuStatus_CreateFn( + status.code(), status.error_message().c_str()); delete host_ctx; return c_status; } @@ -325,18 +335,21 @@ SE_Status* HostCallbackTrampoline(void* ctx) { bool TpuExecutor::HostCallback(Stream* stream, std::function callback) { HostCallbackContext* ctx = new HostCallbackContext{callback}; - return TpuExecutor_HostCallback(executor_, - stream_map().at(stream->implementation()), - &HostCallbackTrampoline, ctx); + return tpu::ExecutorApiFn()->TpuExecutor_HostCallbackFn( + executor_, stream_map().at(stream->implementation()), + &HostCallbackTrampoline, ctx); } TpuExecutor::StatusOr> TpuExecutor::CreateDeviceDescription() const { StatusHelper status; - SE_DeviceDescription* description = TpuDeviceDescription_New(); - auto cleanup = tensorflow::gtl::MakeCleanup( - [description]() { TpuDeviceDescription_Free(description); }); - TpuExecutor_CreateDeviceDescription(executor_, description, status.c_status); + SE_DeviceDescription* description = + tpu::ExecutorApiFn()->TpuDeviceDescription_NewFn(); + auto cleanup = tensorflow::gtl::MakeCleanup([description]() { + tpu::ExecutorApiFn()->TpuDeviceDescription_FreeFn(description); + }); + tpu::ExecutorApiFn()->TpuExecutor_CreateDeviceDescriptionFn( + executor_, description, status.c_status); if (status.status().ok()) { stream_executor::internal::DeviceDescriptionBuilder builder; CHECK_NE(description->device_vendor, nullptr); diff --git a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h index e77e09bb911..eee69a35b23 100644 --- a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h +++ b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h @@ -20,9 +20,9 @@ limitations under the License. 
#include #include "tensorflow/c/tf_attrtype.h" -#include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h" +#include "tensorflow/core/tpu/libtftpu.h" typedef struct SE_Platform SE_Platform; typedef struct SE_StreamExecutor SE_StreamExecutor; @@ -292,6 +292,96 @@ void TpuTransferManager_WriteSingleTupleIndexTable( XLA_ComputationPlacer* TpuComputationPlacer_New(); void TpuComputationPlacer_Free(XLA_ComputationPlacer* placer); + +struct TfTpu_ExecutorApiFn { + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_New); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_Initialize); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_Initialized); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_GetExecutor); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_Id); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_VisibleDeviceCount); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_TpuMemoryLimit); + TFTPU_ADD_FN_IN_STRUCT(TpuPlatform_ShouldRegisterTpuDeviceToDeviceCopy); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_Init); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_PlatformDeviceCount); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_Allocate); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_Deallocate); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_GetAllocatorStats); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_DeviceMemoryUsage); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_AllocateStream); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_DeallocateStream); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_CreateStreamDependency); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_GetStatus); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_AllocateEvent); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_DeallocateEvent); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_PollForEventStatus); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_RecordEvent); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_WaitForEvent); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_AllocateTimer); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_DeallocateTimer); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_StartTimer); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_StopTimer); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_SynchronousMemcpyToHost); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_SynchronousMemcpyFromHost); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_MemcpyToHost); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_MemcpyFromHost); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_EnqueueInfeed); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_DequeueOutfeed); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_WaitForInfeedReady); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_WaitForOutfeedReady); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_BlockHostUntilDone); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_BlockUntilDoneOrFailed); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_SyncAndForgetFailedStreams); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_SynchronizeAllActivity); + + TFTPU_ADD_FN_IN_STRUCT(TpuStream_New); + TFTPU_ADD_FN_IN_STRUCT(TpuStream_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuStream_Stream); + TFTPU_ADD_FN_IN_STRUCT(TpuStream_Status); + TFTPU_ADD_FN_IN_STRUCT(TpuStream_IsSameSharedMemoryLocation); + TFTPU_ADD_FN_IN_STRUCT(TpuStream_TpuEnqueueOnDeviceSendRecvLocal); + + TFTPU_ADD_FN_IN_STRUCT(TpuEvent_New); + TFTPU_ADD_FN_IN_STRUCT(TpuEvent_Free); + + TFTPU_ADD_FN_IN_STRUCT(TpuTimer_New); + TFTPU_ADD_FN_IN_STRUCT(TpuTimer_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuTimer_Nanoseconds); + TFTPU_ADD_FN_IN_STRUCT(TpuTimer_Microseconds); + + TFTPU_ADD_FN_IN_STRUCT(TpuStatus_New); + TFTPU_ADD_FN_IN_STRUCT(TpuStatus_Create); + TFTPU_ADD_FN_IN_STRUCT(TpuStatus_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuStatus_Message); + TFTPU_ADD_FN_IN_STRUCT(TpuStatus_Code); + 
TFTPU_ADD_FN_IN_STRUCT(TpuStatus_Ok); + + TFTPU_ADD_FN_IN_STRUCT(TpuStreamExecutorConfig_Default); + TFTPU_ADD_FN_IN_STRUCT(TpuStreamExecutorConfig_SetOrdinal); + TFTPU_ADD_FN_IN_STRUCT(TpuStreamExecutorConfig_Free); + + TFTPU_ADD_FN_IN_STRUCT(TpuDeviceDescription_New); + TFTPU_ADD_FN_IN_STRUCT(TpuDeviceDescription_Free); + + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_CreateDeviceDescription); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_NewDeviceOptions); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_FreeDeviceOptions); + TFTPU_ADD_FN_IN_STRUCT(TpuExecutor_HostCallback); + + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_New); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_PlatformId); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_HostShapeToDeviceShape); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_TransferLiteralToDeviceAsync); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_TransferLiteralFromDevice); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_GetByteSizeRequirement); + TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_WriteSingleTupleIndexTable); + + TFTPU_ADD_FN_IN_STRUCT(TpuComputationPlacer_New); + TFTPU_ADD_FN_IN_STRUCT(TpuComputationPlacer_Free); +}; } // extern "C" diff --git a/tensorflow/stream_executor/tpu/tpu_node_context.cc b/tensorflow/stream_executor/tpu/tpu_node_context.cc index 35a9eb53bcd..356ede40fb3 100644 --- a/tensorflow/stream_executor/tpu/tpu_node_context.cc +++ b/tensorflow/stream_executor/tpu/tpu_node_context.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" #include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" @@ -32,15 +33,18 @@ StatusOr> TpuNodeContext::Create( int device_ordinal) { StatusHelper status; XLA_TpuNodeContext* node_context = - TpuNodeContext_Create(device_ordinal, status.c_status); + tpu::NodeContextApiFn()->TpuNodeContext_CreateFn(device_ordinal, + status.c_status); if (!status.status().ok()) { - TpuNodeContext_Free(node_context); + tpu::NodeContextApiFn()->TpuNodeContext_FreeFn(node_context); return status.status(); } return std::make_unique(device_ordinal, node_context); } -TpuNodeContext::~TpuNodeContext() { TpuNodeContext_Free(node_context_); } +TpuNodeContext::~TpuNodeContext() { + tpu::NodeContextApiFn()->TpuNodeContext_FreeFn(node_context_); +} /* static */ Status TpuNodeContext::Initialize(int device_ordinal) { @@ -52,14 +56,14 @@ Status TpuNodeContext::Initialize(int device_ordinal) { /* static */ Status TpuNodeContext::StopChipHeartbeats() { StatusHelper status; - TpuNodeContext_StopChipHeartbeats(status.c_status); + tpu::NodeContextApiFn()->TpuNodeContext_StopChipHeartbeatsFn(status.c_status); return status.status(); } /* static */ Status TpuNodeContext::CloseTpuHost() { StatusHelper status; - TpuNodeContext_CloseTpuHost(status.c_status); + tpu::NodeContextApiFn()->TpuNodeContext_CloseTpuHostFn(status.c_status); return status.status(); } diff --git a/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h b/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h index e5092d4842b..d47fdf37a46 100644 --- a/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h +++ b/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h @@ -15,10 +15,13 @@ limitations under the 
License. #ifndef TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_NODE_CONTEXT_C_API_H_ #define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_NODE_CONTEXT_C_API_H_ +#include "tensorflow/core/tpu/libtftpu.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" typedef struct XLA_TpuNodeContext XLA_TpuNodeContext; +extern "C" { + XLA_TpuNodeContext* TpuNodeContext_Create(int device_ordinal, SE_Status* status); void TpuNodeContext_Free(XLA_TpuNodeContext* node_context); @@ -28,4 +31,13 @@ void TpuNodeContext_Initialize(int device_ordinal, SE_Status* status); void TpuNodeContext_StopChipHeartbeats(SE_Status* status); void TpuNodeContext_CloseTpuHost(SE_Status* status); +} // extern "C" + +struct TfTpu_NodeContextApiFn { + TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_Create); + TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_StopChipHeartbeats); + TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_CloseTpuHost); +}; + #endif // TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_NODE_CONTEXT_C_API_H_ diff --git a/tensorflow/stream_executor/tpu/tpu_platform.cc b/tensorflow/stream_executor/tpu/tpu_platform.cc index c65d8a4207a..13a845829c1 100644 --- a/tensorflow/stream_executor/tpu/tpu_platform.cc +++ b/tensorflow/stream_executor/tpu/tpu_platform.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/stream_executor/tpu/tpu_platform.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/tpu/status_helper.h" #include "tensorflow/stream_executor/tpu/tpu_executor.h" @@ -30,7 +31,9 @@ using Status = ::stream_executor::port::Status; template using StatusOr = ::stream_executor::port::StatusOr; -TpuPlatform::TpuPlatform() { platform_ = TpuPlatform_New(); } +TpuPlatform::TpuPlatform() { + platform_ = tpu::ExecutorApiFn()->TpuPlatform_NewFn(); +} TpuPlatform* TpuPlatform::GetRegisteredPlatform() { return tpu_registered_platform; @@ -53,8 +56,8 @@ Status TpuPlatform::Initialize( i++; } - TpuPlatform_Initialize(platform_, options_size, options_key, options_value, - status.c_status); + tpu::ExecutorApiFn()->TpuPlatform_InitializeFn( + platform_, options_size, options_key, options_value, status.c_status); free(options_key); free(options_value); @@ -62,10 +65,16 @@ Status TpuPlatform::Initialize( return status.status(); } -TpuPlatform::~TpuPlatform() { TpuPlatform_Free(platform_); } +bool TpuPlatform::Initialized() const { + return tpu::ExecutorApiFn()->TpuPlatform_InitializedFn(platform_); +} + +TpuPlatform::~TpuPlatform() { + tpu::ExecutorApiFn()->TpuPlatform_FreeFn(platform_); +} int TpuPlatform::VisibleDeviceCount() const { - return TpuPlatform_VisibleDeviceCount(platform_); + return tpu::ExecutorApiFn()->TpuPlatform_VisibleDeviceCountFn(platform_); } StatusOr<::stream_executor::StreamExecutor*> TpuPlatform::GetExecutor( @@ -77,14 +86,16 @@ StatusOr<::stream_executor::StreamExecutor*> TpuPlatform::GetExecutor( StatusOr> TpuPlatform::GetUncachedExecutor( const ::stream_executor::StreamExecutorConfig& config) { - SE_StreamExecutorConfig* c_config = TpuStreamExecutorConfig_Default(); + SE_StreamExecutorConfig* c_config = + tpu::ExecutorApiFn()->TpuStreamExecutorConfig_DefaultFn(); - TpuStreamExecutorConfig_SetOrdinal(c_config, config.ordinal); + tpu::ExecutorApiFn()->TpuStreamExecutorConfig_SetOrdinalFn(c_config, + config.ordinal); StatusHelper status; - SE_StreamExecutor* executor = - TpuPlatform_GetExecutor(platform_, c_config, status.c_status); - 
TpuStreamExecutorConfig_Free(c_config); + SE_StreamExecutor* executor = tpu::ExecutorApiFn()->TpuPlatform_GetExecutorFn( + platform_, c_config, status.c_status); + tpu::ExecutorApiFn()->TpuStreamExecutorConfig_FreeFn(c_config); if (!status.ok()) { return status.status(); } @@ -103,27 +114,24 @@ const std::string& TpuPlatform::Name() const { } int64 TpuPlatform::TpuMemoryLimit() { - return TpuPlatform_TpuMemoryLimit(platform_); + return tpu::ExecutorApiFn()->TpuPlatform_TpuMemoryLimitFn(platform_); } bool TpuPlatform::ShouldRegisterTpuDeviceToDeviceCopy() { - return TpuPlatform_ShouldRegisterTpuDeviceToDeviceCopy(platform_); + return tpu::ExecutorApiFn() + ->TpuPlatform_ShouldRegisterTpuDeviceToDeviceCopyFn(platform_); +} + +void RegisterTpuPlatform() { + static bool tpu_platform_registered = false; + if (!tpu_platform_registered) { + tensorflow::tpu_registered_platform = new tensorflow::TpuPlatform(); + std::unique_ptr platform( + tensorflow::tpu_registered_platform); + SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform( + std::move(platform))); + tpu_platform_registered = true; + } } } // namespace tensorflow - -void RegisterTpuPlatform() { - tensorflow::tpu_registered_platform = new tensorflow::TpuPlatform(); - std::unique_ptr platform( - tensorflow::tpu_registered_platform); - SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform( - std::move(platform))); -} - -REGISTER_MODULE_INITIALIZER(tpu_platform, RegisterTpuPlatform()); - -// Note that module initialization sequencing is not supported in the -// open-source project, so this will be a no-op there. -REGISTER_MODULE_INITIALIZER_SEQUENCE(tpu_platform, multi_platform_manager); -REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, - tpu_platform); diff --git a/tensorflow/stream_executor/tpu/tpu_platform.h b/tensorflow/stream_executor/tpu/tpu_platform.h index 6fdd8d15aa4..c2673ab9288 100644 --- a/tensorflow/stream_executor/tpu/tpu_platform.h +++ b/tensorflow/stream_executor/tpu/tpu_platform.h @@ -60,9 +60,7 @@ class TpuPlatform : public ::tensorflow::tpu::TpuPlatformInterface { bool ShouldRegisterTpuDeviceToDeviceCopy() override; - bool Initialized() const override { - return TpuPlatform_Initialized(platform_); - } + bool Initialized() const override; Status Initialize( const std::map& platform_options) override; @@ -124,6 +122,8 @@ class TpuPlatform : public ::tensorflow::tpu::TpuPlatformInterface { EventMap event_map_; }; +void RegisterTpuPlatform(); + } // namespace tensorflow #endif // TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_PLATFORM_H_ diff --git a/tensorflow/stream_executor/tpu/tpu_stream.h b/tensorflow/stream_executor/tpu/tpu_stream.h index 5c71c0535f3..e1aa1164248 100644 --- a/tensorflow/stream_executor/tpu/tpu_stream.h +++ b/tensorflow/stream_executor/tpu/tpu_stream.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_STREAM_H_ #define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_STREAM_H_ +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" #include "tensorflow/stream_executor/tpu/status_helper.h" @@ -27,23 +28,27 @@ class TpuStream : public tensorflow::tpu::TpuStreamInterface { using Status = stream_executor::port::Status; explicit TpuStream(SE_Stream* stream) : stream_(stream) {} - ~TpuStream() override { TpuStream_Free(stream_); } + ~TpuStream() override { + tensorflow::tpu::ExecutorApiFn()->TpuStream_FreeFn(stream_); + } bool IsSameSharedMemoryLocation( tensorflow::tpu::TpuStreamInterface* other) override { - return TpuStream_IsSameSharedMemoryLocation( - stream_, static_cast(other)->stream_); + return tensorflow::tpu::ExecutorApiFn() + ->TpuStream_IsSameSharedMemoryLocationFn( + stream_, static_cast(other)->stream_); } Status EnqueueOnTpuDeviceSendRecvLocal( stream_executor::DeviceMemoryBase send_buffer, stream_executor::DeviceMemoryBase recv_buffer) override { StatusHelper status; - TpuStream_TpuEnqueueOnDeviceSendRecvLocal( - stream_, - TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(send_buffer), - TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(recv_buffer), - status.c_status); + tensorflow::tpu::ExecutorApiFn() + ->TpuStream_TpuEnqueueOnDeviceSendRecvLocalFn( + stream_, + TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(send_buffer), + TpuConversions::DeviceMemoryBaseToSE_DeviceMemoryBase(recv_buffer), + status.c_status); return status.status(); } @@ -54,7 +59,9 @@ class TpuStream : public tensorflow::tpu::TpuStreamInterface { class TpuEvent : public ::stream_executor::internal::EventInterface { public: explicit TpuEvent(SE_Event* event) : event_(event) {} - ~TpuEvent() override { TpuEvent_Free(event_); } + ~TpuEvent() override { + tensorflow::tpu::ExecutorApiFn()->TpuEvent_FreeFn(event_); + } private: SE_Event* event_; diff --git a/tensorflow/stream_executor/tpu/tpu_timer.h b/tensorflow/stream_executor/tpu/tpu_timer.h index 246a0b7eb32..d7f8f660b37 100644 --- a/tensorflow/stream_executor/tpu/tpu_timer.h +++ b/tensorflow/stream_executor/tpu/tpu_timer.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_TIMER_H_ #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" @@ -25,9 +26,15 @@ namespace tensorflow { class TpuTimer : public ::stream_executor::internal::TimerInterface { public: explicit TpuTimer(SE_Timer* timer) : timer_(timer) {} - ~TpuTimer() override { TpuTimer_Free(timer_); } - uint64 Microseconds() const override { return TpuTimer_Microseconds(timer_); } - uint64 Nanoseconds() const override { return TpuTimer_Nanoseconds(timer_); } + ~TpuTimer() override { + tensorflow::tpu::ExecutorApiFn()->TpuTimer_FreeFn(timer_); + } + uint64 Microseconds() const override { + return tensorflow::tpu::ExecutorApiFn()->TpuTimer_MicrosecondsFn(timer_); + } + uint64 Nanoseconds() const override { + return tensorflow::tpu::ExecutorApiFn()->TpuTimer_NanosecondsFn(timer_); + } private: SE_Timer* timer_; diff --git a/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc b/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc index 4bedc251413..934fabbf54d 100644 --- a/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc +++ b/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" @@ -29,10 +30,12 @@ namespace tensorflow { using Status = stream_executor::port::Status; TpuTransferManager::TpuTransferManager() { - manager_ = TpuTransferManager_New(); + manager_ = tpu::ExecutorApiFn()->TpuTransferManager_NewFn(); } -TpuTransferManager::~TpuTransferManager() { TpuTransferManager_Free(manager_); } +TpuTransferManager::~TpuTransferManager() { + tpu::ExecutorApiFn()->TpuTransferManager_FreeFn(manager_); +} stream_executor::Platform::Id TpuTransferManager::PlatformId() const { return TpuPlatform::kId; @@ -45,8 +48,8 @@ xla::Shape TpuTransferManager::HostShapeToDeviceShape( TpuConversions::XlaShapeToCShape(host_shape, &c_host_shape); - TpuTransferManager_HostShapeToDeviceShape(manager_, &c_host_shape, - &c_device_shape); + tpu::ExecutorApiFn()->TpuTransferManager_HostShapeToDeviceShapeFn( + manager_, &c_host_shape, &c_device_shape); xla::Shape device_shape = TpuConversions::CShapeToXlaShape(&c_device_shape); TpuConversions::CShapeCleanup(&c_host_shape); TpuConversions::CShapeCleanup(&c_device_shape); @@ -66,7 +69,7 @@ Status TpuTransferManager::TransferLiteralToDeviceAsync( TpuConversions::XLAShapedBufferToCShapedBuffer(device_buffer, &c_device_buffer); - TpuTransferManager_TransferLiteralToDeviceAsync( + tpu::ExecutorApiFn()->TpuTransferManager_TransferLiteralToDeviceAsyncFn( manager_, TpuPlatform::GetRegisteredPlatform()->stream_map()->at( stream->implementation()), @@ -112,7 +115,7 @@ void TpuTransferManager::TransferLiteralFromDevice( XLA_Literal c_literal; TpuConversions::XLALiteralToCLiteral(literal, &c_literal); - TpuTransferManager_TransferLiteralFromDevice( + tpu::ExecutorApiFn()->TpuTransferManager_TransferLiteralFromDeviceFn( manager_, TpuPlatform::GetRegisteredPlatform()->stream_map()->at( stream->implementation()), @@ -127,7 +130,8 @@ int64 TpuTransferManager::GetByteSizeRequirement( 
TpuConversions::XlaShapeToCShape(shape, &c_shape); int64 size_in_bytes = - TpuTransferManager_GetByteSizeRequirement(manager_, &c_shape); + tpu::ExecutorApiFn()->TpuTransferManager_GetByteSizeRequirementFn( + manager_, &c_shape); TpuConversions::CShapeCleanup(&c_shape); return size_in_bytes; @@ -151,7 +155,7 @@ Status TpuTransferManager::WriteSingleTupleIndexTable( region->payload()}; StatusHelper status; - TpuTransferManager_WriteSingleTupleIndexTable( + tpu::ExecutorApiFn()->TpuTransferManager_WriteSingleTupleIndexTableFn( manager_, TpuPlatform::GetRegisteredPlatform()->stream_map()->at( stream->implementation()), diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5da15b0a4d6..4a4f8837867 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2899,6 +2899,13 @@ def if_mlir(if_true, if_false = []): "//conditions:default": if_false, }) +def if_tpu(if_true, if_false = []): + """Shorthand for select()ing whether to build for TPUs.""" + return select({ + str(Label("//tensorflow:with_tpu_support")): if_true, + "//conditions:default": if_false, + }) + def tfcompile_target_cpu(): return "" From 2ff1c5a31be8a3d38e3bccd59e54cbfd5cabe5cd Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Thu, 18 Jun 2020 10:59:05 -0700 Subject: [PATCH 0511/1390] Import initialization graph in SignatureDef SavedModels as an MLIR function in TF saved model dialect. PiperOrigin-RevId: 317137903 Change-Id: I7cbded06b3deafa30d3b3e3dad98cc8f056dd4e3 --- tensorflow/compiler/mlir/tensorflow/BUILD | 4 +- .../mlir/tensorflow/ir/tf_saved_model.cc | 25 +++ .../mlir/tensorflow/ir/tf_saved_model_ops.td | 24 +++ .../tests/tf_saved_model/common_v1.py | 1 + .../tests/tf_saved_model/hash_table_v1.py | 92 +++++++++++ .../tensorflow/tests/tf_saved_model_ops.mlir | 5 + .../tests/tf_saved_model_ops_invalid.mlir | 33 ++++ .../mlir/tensorflow/translate/import_model.cc | 149 +++++++++++++----- 8 files changed, 290 insertions(+), 43 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 904ccb7e820..17ed0e36a28 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -661,7 +661,9 @@ cc_library( ":tensorflow_types", ":translate_utils", "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", @@ -673,6 +675,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler/utils:transitive_fanin", + "//tensorflow/core/platform:protobuf_internal", "//tensorflow/core/platform:types", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", @@ -682,7 +685,6 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 140a778770c..6af70158e14 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ 
b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -76,6 +76,23 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) { return success(); } +static LogicalResult Verify(SessionInitializerOp session_initializer) { + mlir::SymbolTable symbol_table( + session_initializer.getParentOfType()); + + auto init_func_op = + symbol_table.lookup(session_initializer.initializer()); + if (!init_func_op) + return session_initializer.emitOpError() + << "the initializer function does not exist"; + + if (!init_func_op.getType().getResults().empty()) + return session_initializer.emitOpError() + << "the initializer function should have no output"; + + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" @@ -220,6 +237,14 @@ static LogicalResult VerifySavedModelModule( } } } + + auto session_initializers = module.getOps(); + if (std::distance(session_initializers.begin(), session_initializers.end()) > + 1) { + return (*++session_initializers.begin()).emitError() + << "there must be no more than one session_initializer op"; + } + SymbolTable symbol_table(module); auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion()); if (!symbol_uses.hasValue()) { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index 4431a160edf..497f4d90cb9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -128,4 +128,28 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> { let verifier = [{ return Verify(*this); }]; } +def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> { + let summary = "Initializes TensorFlow session state."; + let description = [{ + Represents a session initializer function initializes TensorFlow session + state. It is used to initialize resources in the saved model before calling + any exported functions. There must be no more than one session initializer + in a saved model. + + The `initializer` represents the initialization function. The function have + no output and this function should be only called once. 
+ + This is used, for example, to initialize hash tables stored in resources and + accessed by resource name (rather than as resource handles or bound inputs + which is how `global_tensor`s are referenced) + }]; + + let arguments = (ins + FlatSymbolRefAttr:$initializer + ); + + + let verifier = [{ return Verify(*this); }]; +} + #endif // SAVED_MODEL_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py index 7171f63bb05..51ccbeb1fbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py @@ -84,6 +84,7 @@ def do_test(signature_def_map, show_debug_info=False): builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map, + main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py new file mode 100644 index 00000000000..64847434b82 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py @@ -0,0 +1,92 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/hash_table_v1 | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 + +# Verify that the tf.versions attribute exists. It is difficult to enforce +# contents, since the version numbers change over time. The conversion logic +# itself is verified in the common graphdef converter, so here just assert +# it is being invoked. +# CHECK: module +# CHECK-SAME: tf.versions +# CHECK-SAME: bad_consumers +# CHECK-SAME: min_consumer +# CHECK-SAME: producer + +# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> () +# CHECK: "tf_saved_model.global_tensor"() + +# CHECK: func {{@[a-zA-Z_0-9]+}}( +# CHECK-SAME: [[ARG0:%.*]]: tensor +# CHECK-SAME: [[ARG1:%.*]]: tensor () + // Representation for constants: (immutable) global tensor. 
// CHECK: tf_saved_model.global_tensor "tf_saved_model.global_tensor"() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index c055c6c9f56..544600cf6b8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -258,3 +258,36 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{'type' attribute for immutable 'tf_saved_model.global_tensor' should have a static shape}} "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () } + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function does not exist}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function should have no output}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + // expected-error@+1 {{there must be no more than one session_initializer op}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 820d0ce31fb..3cff4217215 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -60,6 +60,8 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" @@ -99,6 +101,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" @@ -116,6 +119,7 @@ using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; +using mlir::tf_saved_model::SessionInitializerOp; using stream_executor::port::StatusOr; namespace { @@ -2955,6 +2959,13 @@ void SortSavedModelModule(mlir::ModuleOp module) { named_global_tensor.global_tensor.getOperation()->moveBefore( &module.getBody()->front()); } + + auto initializers = module.getOps(); + if (!initializers.empty()) { + (*initializers.begin()) + .getOperation() + ->moveBefore(&module.getBody()->front()); + } } Status CreateSavedModelIR( @@ -3241,17 +3252,29 @@ class SavedModelSignatureDefImporter { absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + flib_def_(OpRegistry::Global(), graph_def().library()), + debug_info_(), exported_names_(exported_names), - module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} + module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) { + // debug_info might not be loaded with loader_lite. + if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info; + } // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function // for each signature. StatusOr ConvertSignatures(); - Status ConvertSignature(const GraphDef& graphdef, - const std::string& sig_def_key, - const SignatureDef& signature_def, - const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def); + Status ConvertSignature(const std::string& sig_def_key, + const SignatureDef& signature_def); + + // Converts the initialization graph in the SavedModel to an MLIR function. + Status ConvertInitializer(); + + // Converts a graph with feeds and fetches to an MLIR function. + StatusOr ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs); // Creates GlobalTensorOp for each variable and moves each VarHandle op to // the enclosing function's arguments. 
@@ -3273,18 +3296,62 @@ class SavedModelSignatureDefImporter { GraphImportConfig::InputArrays ParseInputArrays( const std::vector>& inputs); + const GraphDef& graph_def() const { + return bundle_.meta_graph_def.graph_def(); + } + const FunctionLibraryDefinition& flib_def() const { return flib_def_; } + const GraphDebugInfo& debug_info() const { return debug_info_; } + const SavedModelBundle& bundle_; + FunctionLibraryDefinition flib_def_; + GraphDebugInfo debug_info_; absl::Span exported_names_; mlir::OwningModuleRef module_; }; +Status SavedModelSignatureDefImporter::ConvertInitializer() { + std::vector asset_file_defs; + TF_RETURN_IF_ERROR( + internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs)); + + if (!asset_file_defs.empty()) + return errors::Unimplemented( + absl::StrCat("Assets are not supported in signaturedef importer")); + + std::string init_node_name; + TF_RETURN_IF_ERROR( + internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name)); + + if (init_node_name.empty()) return Status::OK(); + + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(init_node_name, {}, {}, {init_node_name})); + + mlir::SymbolTable symbol_table(*sub_module); + + auto init_func_op = symbol_table.lookup(init_node_name); + + init_func_op.removeAttr("tf.entry_function"); + + mlir::OpBuilder builder(module_->getBodyRegion()); + + builder.create( + module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName())); + + // Move the converted functions to top level MLIR module. + auto* block = module_->getBody(); + auto* sub_block = sub_module->getBody(); + block->getOperations().splice( + mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(), + sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator())); + + return Status::OK(); +} + StatusOr SavedModelSignatureDefImporter::ConvertSignatures() { const auto& signatures = bundle_.GetSignatures(); - const auto& graphdef = bundle_.meta_graph_def.graph_def(); - PopulateTfVersions(module_.get(), graphdef.versions()); - - FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library()); + PopulateTfVersions(module_.get(), graph_def().versions()); // debug_info might not be loaded with loader_lite. GraphDebugInfo debug_info; @@ -3307,9 +3374,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { continue; } - TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, - debug_info, flib_def)); + TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def)); } + + TF_RETURN_IF_ERROR(ConvertInitializer()); TF_RETURN_IF_ERROR(LiftVariables()); mlir::OpBuilder builder(module_->getBodyRegion()); @@ -3320,10 +3388,32 @@ SavedModelSignatureDefImporter::ConvertSignatures() { return std::move(module_); } +StatusOr SavedModelSignatureDefImporter::ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs) { + GraphImportConfig specs; + specs.prune_unused_nodes = true; + specs.inputs = ParseInputArrays(inputs); + for (auto& output : outputs) specs.outputs.push_back(output.second.name()); + specs.control_outputs = control_outputs; + + // Convert sub-graphdef to sub-graph. 
+ GraphConstructorOptions options; + options.allow_internal_ops = true; + options.add_default_attributes = true; + Graph graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph)); + + // Convert sub-graph to MLIR module.true + return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(), + flib_def(), specs, name); +} + Status SavedModelSignatureDefImporter::ConvertSignature( - const GraphDef& graphdef, const std::string& sig_def_key, - const SignatureDef& signature_def, const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def) { + const std::string& sig_def_key, const SignatureDef& signature_def) { // Create local vectors for the input and output and sort them to be // deterministic. We don't want anyone to really depend on the order, client // should lookup argument/result mapping by attribute name. @@ -3339,34 +3429,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature( return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first; }); - GraphImportConfig specs; - specs.prune_unused_nodes = true; - specs.inputs = ParseInputArrays(inputs); - for (auto& output : outputs) specs.outputs.push_back(output.second.name()); - - // Remove unused nodes and create sub-graphdef. - GraphDef sub_graph_def; - TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph( - graphdef, &sub_graph_def, - /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); - - // Set the function library definitions in the pruned graphdef. - *sub_graph_def.mutable_library() = flib_def.ToProto(); - - // Convert sub-graphdef to sub-graph. - GraphConstructorOptions options; - options.allow_internal_ops = true; - options.add_default_attributes = true; - Graph sub_graph(OpRegistry::Global()); - - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph)); - // Convert sub-graph to MLIR module. - TF_ASSIGN_OR_RETURN( - auto sub_module, - GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info, - flib_def, specs, sig_def_key)); + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(sig_def_key, inputs, outputs, {})); mlir::OpBuilder builder(sub_module->getBodyRegion()); // Find the FuncOp which corresponds to current SignatureDef. 
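As background for the importer patch above: the new `hash_table_v1.py` test exercises the case where a TF1 SavedModel records a table-initializer `main_op`, which the importer now surfaces as a `tf_saved_model.session_initializer` op. The following is an illustrative sketch, not part of the patch, of how such a SavedModel is produced; the export path and signature key are made up for the example.

```python
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
export_dir = "/tmp/hash_table_model"  # hypothetical location

with tf.Session(graph=tf.Graph()) as sess:
  keys = tf.constant([1, 2, 3], dtype=tf.int64)
  values = tf.constant([10.0, 20.0, 30.0])
  table = tf.lookup.StaticHashTable(
      tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1.0)

  x = tf.placeholder(tf.int64, shape=(), name="input")
  y = table.lookup(x)

  signature = tf.saved_model.signature_def_utils.predict_signature_def(
      inputs={"x": x}, outputs={"y": y})

  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
  builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING],
      signature_def_map={"serving_default": signature},
      # The table initializer is recorded as the MetaGraph's init op.
      main_op=tf.tables_initializer(),
      strip_default_attrs=True)
  builder.save()
```

On import, the init op recorded by `add_meta_graph_and_variables` is what `ConvertInitializer()` looks up via `internal::GetInitOp` and converts into the initializer function referenced by the new `tf_saved_model.session_initializer` op.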
From b8bb250ebb52d4f037975f58cb1eadb2eff3751c Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 18 Jun 2020 10:59:05 -0700 Subject: [PATCH 0512/1390] Rollback of rollback: [TF/XLA] Only force retracing for non-unique XLA context ID for TPUReplicatedContext Fixes https://github.com/tensorflow/tensorflow/issues/39872 PiperOrigin-RevId: 317137904 Change-Id: Id287e10a0ab2494b11427435d8f89a383eeaf392 --- .../python/eager/def_function_xla_jit_test.py | 19 +++++++++++++++++++ tensorflow/python/eager/function.py | 7 ++++--- tensorflow/python/ops/control_flow_ops.py | 5 +++++ tensorflow/python/tpu/tpu.py | 6 ++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index b63a3b434d4..78d44a81b0b 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import tensor_array_ops from tensorflow.python.platform import test @@ -385,6 +386,24 @@ class DefFunctionTest(test.TestCase): f64_input = constant_op.constant([1.1, 2.2, 3.3], dtype=dtypes.float64) self.assertAllClose([1.1, 3.3, 6.6], f(f64_input)) + def testNoExcessiveRetracing(self): + inner_retracings = 0 + + @def_function.function(experimental_compile=True) + def inner(a, b): + nonlocal inner_retracings + inner_retracings += 1 + return a * b + a + + def outer(a, b): + return inner(a, b) + + func_input = random_ops.random_normal([10, 10]) + for _ in range(2): + def_function.function(outer)(func_input, func_input) + + self.assertEqual(inner_retracings, 1) + if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index a40eaf886b3..c02318cb814 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -2981,9 +2981,10 @@ class Function(object): if not executing_eagerly: # We want to force function retracing for each different # XLAControlFlowContext, so add `xla_context_id` to the cache key. - tpu_context = _enclosing_xla_context() - if tpu_context is not None: - xla_context_id = id(tpu_context) + xla_context = _enclosing_xla_context() + if xla_context is not None and \ + xla_context.RequiresUniqueFunctionRetracing(): + xla_context_id = id(xla_context) with ops.init_scope(): # The graph, or whether we're executing eagerly, should be a part of the diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 3398308d42e..748f842a9e0 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -3682,6 +3682,11 @@ class XLAControlFlowContext(ControlFlowContext): def AddValue(self, x): return x + def RequiresUniqueFunctionRetracing(self): + """Returns whether the tf.function should be retraced if the context changes. + """ + return False + def from_control_flow_context_def(context_def, import_scope=None): """Deserializes `context_def` into the appropriate ControlFlowContext. 
diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index 28eba69b7da..ce3aaa8a058 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -639,6 +639,12 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): def GetControlPivot(self): return self._pivot + def RequiresUniqueFunctionRetracing(self): + # More context: b/158152827. TPU stack uses the TPUReplicateContext to + # create replicated variable handles and cluster TPU computations, thus we + # always retrace a tf.function when the wrapped TPUReplicateContext changes. + return True + class OutsideCompilationV2Context(control_flow_ops.ControlFlowContext): """The context for outside compilation in Tensorflow 2.0. From 17eea4753dd71b397289f793b348fe9fe751873f Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 18 Jun 2020 11:09:28 -0700 Subject: [PATCH 0513/1390] Add OSSFuzz badge to TensorFlow. Now that TF <-> OSSFuzz works again, add badge to show that we are fuzzing the code. PiperOrigin-RevId: 317140301 Change-Id: I401c1ffd3da37f44910fde3ba60fb2b5c925dfcc --- README.md | 1 + tensorflow/security/README.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 54c9470b04b..73a345706a4 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ for general questions and discussion, and please direct specific questions to The TensorFlow project strives to abide by generally accepted best practices in open-source software development: +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486) [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v1.4%20adopted-ff69b4.svg)](CODE_OF_CONDUCT.md) diff --git a/tensorflow/security/README.md b/tensorflow/security/README.md index d9fa1c77a02..34f98e640d6 100644 --- a/tensorflow/security/README.md +++ b/tensorflow/security/README.md @@ -1,5 +1,7 @@ # TensorFlow Security Advisories +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow) + We regularly publish security advisories about using TensorFlow. *Note*: In conjunction with these security advisories, we strongly encourage From a31d5da02607a2c5eb01d7c977b92001f842cc89 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Thu, 18 Jun 2020 11:30:06 -0700 Subject: [PATCH 0514/1390] Wrap save/restore logic in tf.function when in eager mode. This allows parallel saving and restoring when using multiple devices. 
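Before the diff, a condensed sketch of the control flow this change introduces in `functional_saver.py`. It is a simplification, not the real implementation: `single_device_savers` stands in for the `_SingleDeviceSaver` map (any object with a `save(prefix)` method works here), and the shard-merge step is omitted. The point it illustrates is that, when executing eagerly with more than one device, the per-device save ops are issued from inside a `tf.function`, so the runtime can dispatch the `SaveV2` ops for different hosts in parallel instead of running them one at a time from Python.

```python
import tensorflow as tf

def multi_device_save(single_device_savers, file_prefix):
  """Sketch of the patch's structure; not the actual MultiDeviceSaver code."""

  def save_fn():
    shard_prefixes = []
    num_shards = len(single_device_savers)
    for shard, (device, saver) in enumerate(sorted(single_device_savers.items())):
      with tf.device(device):
        # Each single-device saver writes its own shard.
        shard_prefixes.append(saver.save("%s_temp/part-%d-of-%d" %
                                         (file_prefix, shard, num_shards)))
    return shard_prefixes  # the real code merges these into one checkpoint

  if tf.executing_eagerly() and len(single_device_savers) > 1:
    # Re-traced on every call so options such as experimental_io_device are
    # picked up, and so the per-device SaveV2 ops can run concurrently.
    return tf.function(save_fn)()
  return save_fn()
```

The companion edits to `auto_control_deps.py` and `function_optimizer.cc` mark `SaveV2`/`RestoreV2` as order-insensitive stateful ops, so automatic control dependencies inside that `tf.function` do not serialize the shards again.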
PiperOrigin-RevId: 317144560 Change-Id: Iebc230589a5e2712da03c5db3f45e4fd7eeb5ff9 --- .../grappler/optimizers/function_optimizer.cc | 8 +- .../parallel_device/parallel_device_test.py | 4 + .../python/framework/auto_control_deps.py | 2 +- tensorflow/python/training/saving/BUILD | 1 + .../training/saving/functional_saver.py | 111 ++++++++++++------ .../training/saving/functional_saver_test.py | 17 ++- 6 files changed, 101 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index a66e645e04b..0e156aaa84c 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -837,7 +837,6 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { "ParameterizedTruncatedNormal", "TruncatedNormal", "RandomShuffle", "Multinomial", "RandomGamma", "RandomGammaGrad", "RandomPoisson", "RandomPoissonV2", - // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) // ReadVariableOp marked as stateful because it consumes DT_RESOURCE, // but it can't generate any observable side-effect. @@ -851,7 +850,12 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { // the same device_ordinal on the same host. "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch"}); + "EnqueueTPUEmbeddingRaggedTensorBatch", + + // SaveV2 and RestoreV2 should be allowed to operate in parallel on + // multiple hosts. + "SaveV2", "RestoreV2"}); + // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) return exemption->contains(op); } diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index 8fc3dcb5816..1429c522aba 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -172,6 +172,8 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): config.set_synchronous_execution(previous) def test_checkpointing(self): + self.skipTest( + "Disable saving until SaveableObject's methods are traceable.") prefix = os.path.join(self.get_temp_dir(), "ckpt") with self.device.scope(): different_values = self.device.pack( @@ -263,6 +265,8 @@ class LayerTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[1], final_kernels[1].backing_device) def test_training_loop(self): + self.skipTest( + "Disable saving until SaveableObject's methods are traceable.") for _ in range(5): layer = _Dense(5) checkpoint = tracking.Checkpoint(layer=layer) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 51dcb248b11..4b47735e0bf 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -100,7 +100,7 @@ _ORDER_INSENSITIVE_STATEFUL_OPS = [ "CudnnRNNV2", "CudnnRNNV3", "CudnnRNNBackpropV2", "CudnnRNNBackpropV3", "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch" + "EnqueueTPUEmbeddingRaggedTensorBatch", "RestoreV2", "SaveV2" ] # LINT.ThenChange(//tensorflow/core/grappler/optimizers/function_optimizer.cc) diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD 
index 670a4c35c6f..12940840309 100644 --- a/tensorflow/python/training/saving/BUILD +++ b/tensorflow/python/training/saving/BUILD @@ -43,6 +43,7 @@ cuda_py_test( ":checkpoint_options", ":functional_saver", ":saveable_hook", + "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", ], ) diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py index c4334e096df..3a9b565470d 100644 --- a/tensorflow/python/training/saving/functional_saver.py +++ b/tensorflow/python/training/saving/functional_saver.py @@ -21,6 +21,7 @@ from __future__ import print_function import uuid from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -161,7 +162,8 @@ class MultiDeviceSaver(object): self._after_restore_callbacks.append(saveable.after_restore) if is_saveable: - saveables_by_device.setdefault(saveable.device, []).append(saveable) + host_device = saveable_object_util.set_cpu0(saveable.device) + saveables_by_device.setdefault(host_device, []).append(saveable) self._single_device_savers = { device: _SingleDeviceSaver(saveables) @@ -247,33 +249,50 @@ class MultiDeviceSaver(object): tmp_checkpoint_prefix = string_ops.string_join( [file_prefix, sharded_suffix]) - num_shards = len(self._single_device_savers) - sharded_saves = [] - sharded_prefixes = [] - num_shards_tensor = constant_op.constant(num_shards, name="num_shards") - last_device = None - for shard, (device, saver) in enumerate( - sorted(self._single_device_savers.items())): - last_device = device - with ops.device(saveable_object_util.set_cpu0(device)): - shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, - num_shards_tensor) - sharded_prefixes.append(shard_prefix) - with ops.device(device): - # _SingleDeviceSaver will use the CPU device when necessary, but initial - # read operations should be placed on the SaveableObject's device. - sharded_saves.append(saver.save(shard_prefix, options)) + def save_fn(): + num_shards = len(self._single_device_savers) + sharded_saves = [] + sharded_prefixes = [] + num_shards_tensor = constant_op.constant(num_shards, name="num_shards") + last_device = None + for shard, (device, saver) in enumerate( + sorted(self._single_device_savers.items())): + last_device = device + with ops.device(saveable_object_util.set_cpu0(device)): + shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, + num_shards_tensor) + sharded_prefixes.append(shard_prefix) + with ops.device(device): + # _SingleDeviceSaver will use the CPU device when necessary, but + # initial read operations should be placed on the SaveableObject's + # device. + sharded_saves.append(saver.save(shard_prefix, options)) - with ops.control_dependencies(sharded_saves): - # Merge on the io_device if specified, otherwise co-locates the merge op - # with the last device used. - merge_device = (options.experimental_io_device or - saveable_object_util.set_cpu0(last_device)) - with ops.device(merge_device): - # V2 format write path consists of a metadata merge step. Once merged, - # attempts to delete the temporary directory, "_temp". - return gen_io_ops.merge_v2_checkpoints( - sharded_prefixes, file_prefix, delete_old_dirs=True) + with ops.control_dependencies(sharded_saves): + # Merge on the io_device if specified, otherwise co-locates the merge op + # with the last device used. 
+ merge_device = ( + options.experimental_io_device or + saveable_object_util.set_cpu0(last_device)) + with ops.device(merge_device): + # V2 format write path consists of a metadata merge step. Once + # merged, attempts to delete the temporary directory, + # "_temp". + return gen_io_ops.merge_v2_checkpoints( + sharded_prefixes, file_prefix, delete_old_dirs=True) + + # Since this will causes a function re-trace on each save, limit this to the + # cases where it is needed: eager and when there are multiple tasks/single + # device savers. Note that the retrace is needed to ensure we pickup the + # latest values of options like experimental_io_device. + if context.executing_eagerly() and len(self._single_device_savers) > 1: + # Explicitly place the identity op on the first device. + @def_function.function(experimental_compile=False) + def tf_function_save(): + save_fn() + tf_function_save() + else: + return save_fn() def restore(self, file_prefix, options=None): """Restore the saveable objects from a checkpoint with `file_prefix`. @@ -287,12 +306,38 @@ class MultiDeviceSaver(object): A dictionary mapping from SaveableObject names to restore operations. """ options = options or checkpoint_options.CheckpointOptions() - restore_ops = {} - # Sort by device name to avoid propagating non-deterministic dictionary - # ordering in some Python versions. - for device, saver in sorted(self._single_device_savers.items()): - with ops.device(device): - restore_ops.update(saver.restore(file_prefix, options)) + + def restore_fn(): + restore_ops = {} + # Sort by device name to avoid propagating non-deterministic dictionary + # ordering in some Python versions. + for device, saver in sorted(self._single_device_savers.items()): + with ops.device(device): + restore_ops.update(saver.restore(file_prefix, options)) + + return restore_ops + + # Since this will causes a function re-trace on each save, limit this to the + # cases where it is needed: eager and when there are multiple tasks/single + # device savers. Note that the retrace is needed to ensure we pickup the + # latest values of options like experimental_io_device. + if context.executing_eagerly() and len(self._single_device_savers) > 1: + first_device, _ = list(self._single_device_savers.items())[0] + @def_function.function(experimental_compile=False) + def tf_function_restore(): + restore_ops = restore_fn() + restore_tensors = {} + # tf.functions must return tensors, thus we use control dependencies so + # that we can return a tensor which depends on the given op. 
+ with ops.device(saveable_object_util.set_cpu0(first_device)): + for name, op in restore_ops.items(): + with ops.control_dependencies([op]): + restore_tensors[name] = array_ops.identity(file_prefix) + return restore_tensors + + restore_ops = tf_function_restore() + else: + restore_ops = restore_fn() for callback in self._after_restore_callbacks: callback() diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py index 7db32ff72d7..8f3eef4fb9c 100644 --- a/tensorflow/python/training/saving/functional_saver_test.py +++ b/tensorflow/python/training/saving/functional_saver_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os from tensorflow.python.eager import context +from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.eager import wrap_function from tensorflow.python.framework import config @@ -29,6 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import gfile +from tensorflow.python.training import server_lib from tensorflow.python.training.saving import checkpoint_options from tensorflow.python.training.saving import functional_saver from tensorflow.python.training.saving import saveable_hook @@ -126,13 +128,16 @@ class SaverTest(test.TestCase): second_saver.restore(save_path) self.assertEqual(2., self.evaluate(v2)) - @test_util.run_in_graph_and_eager_modes - def test_checkpoint_is_sharded_by_device(self): - with ops.device("cpu:0"): + def test_checkpoint_is_sharded_by_task(self): + servers = [server_lib.Server.create_local_server() for _ in range(3)] + cluster_spec = server_lib.ClusterSpec({ + "worker": [s.target[len("grpc://"):] for s in servers]}) + remote.connect_to_cluster(cluster_spec) + with ops.device("/job:worker/task:0/cpu:0"): v0 = resource_variable_ops.ResourceVariable(0.) - with ops.device("cpu:1"): + with ops.device("/job:worker/task:1/cpu:0"): v1 = resource_variable_ops.ResourceVariable(1.) - with ops.device("cpu:2"): + with ops.device("/job:worker/task:2/cpu:0"): v2 = resource_variable_ops.ResourceVariable(2.) self.evaluate([v0.initializer, v1.initializer, v2.initializer]) @@ -167,7 +172,7 @@ class SaverTest(test.TestCase): list(saveable_object_util.saveable_objects_for_op(v2, "v2"))) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate(saver.save(constant_op.constant(prefix), self.local_options)) - self.assertEqual(4, len(gfile.Glob(prefix + "*"))) + self.assertEqual(2, len(gfile.Glob(prefix + "*"))) self.evaluate(v0.assign(-1.)) self.evaluate(v1.assign(-1.)) self.evaluate(v2.assign(-1.)) From 4fab49e55013f1265fdd42e53522744ae01351ad Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 18 Jun 2020 11:39:19 -0700 Subject: [PATCH 0515/1390] Add suggestion to use experimental_io_device when save/load can't find directory. 
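For reference, the option the new error text points users to is the existing `experimental_io_device` field on the save and load options. A minimal usage sketch follows; the module and paths are hypothetical.

```python
import tensorflow as tf

module = tf.Module()
module.v = tf.Variable(1.0)

# Route SavedModel file I/O through the local host, e.g. when the computation
# runs on a remote worker that cannot see the target filesystem.
save_options = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
tf.saved_model.save(module, "/tmp/module_with_options", options=save_options)

load_options = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")
restored = tf.saved_model.load("/tmp/module_with_options", options=load_options)
```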
PiperOrigin-RevId: 317146508 Change-Id: Ia74ac5baa286c90959c7d55f23187d3db46c3c4b --- tensorflow/python/saved_model/load.py | 13 +++++++++++-- tensorflow/python/saved_model/save.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py index 74b030a3797..fb2d01cbee2 100644 --- a/tensorflow/python/saved_model/load.py +++ b/tensorflow/python/saved_model/load.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops @@ -614,8 +615,16 @@ def load_internal(export_dir, tags=None, options=None, loader_cls=Loader): ckpt_options = checkpoint_options.CheckpointOptions( experimental_io_device=options.experimental_io_device) with ops.init_scope(): - loader = loader_cls(object_graph_proto, saved_model_proto, export_dir, - ckpt_options) + try: + loader = loader_cls(object_graph_proto, saved_model_proto, export_dir, + ckpt_options) + except errors.NotFoundError as err: + raise FileNotFoundError( + str(err) + "\n If trying to load on a different device from the " + "computational device, consider using setting the " + "`experimental_io_device` option on tf.saved_model.LoadOptions " + "to the io_device such as '/job:localhost'." + ) root = loader.get(0) if isinstance(loader, Loader): root.graph_debug_info = loader.adjust_debug_info_func_names(debug_info) diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index e22b0129dda..5844c80995f 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -33,6 +33,7 @@ from tensorflow.python.eager import function as defun from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import error_interpolation +from tensorflow.python.framework import errors from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -966,7 +967,16 @@ def save(obj, export_dir, signatures=None, options=None): # SavedModel. Users rely on checking saved_model_dir/saved_model.pb as an # indication that the SavedModel is completely written. if context.executing_eagerly(): - context.async_wait() # Ensure save operations have completed. + try: + context.async_wait() # Ensure save operations have completed. + except errors.NotFoundError as err: + raise FileNotFoundError( + str(err) + "\n If trying to save on a different device from the " + "computational device, consider using setting the " + "`experimental_io_device` option on tf.saved_model.SaveOptions " + "to the io_device such as '/job:localhost'." + ) + path = os.path.join( compat.as_str(export_dir), compat.as_str(constants.SAVED_MODEL_FILENAME_PB)) From bd0a10139a8fa11eb49aa759eca1bb52e662ba34 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 11:50:49 -0700 Subject: [PATCH 0516/1390] Update ops-related pbtxt files. 
PiperOrigin-RevId: 317148838 Change-Id: Icecbfedaea80cf2ae6a1424872df62126b7106f1 --- .../BandedTriangularSolve.pbtxt | 42 +++++++ ...tatelessParameterizedTruncatedNormal.pbtxt | 65 +++++++++++ tensorflow/core/ops/ops.pbtxt | 107 ++++++++++++++++++ 3 files changed, 214 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BandedTriangularSolve.pbtxt create mode 100644 tensorflow/core/ops/compat/ops_history_v2/StatelessParameterizedTruncatedNormal.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v2/BandedTriangularSolve.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BandedTriangularSolve.pbtxt new file mode 100644 index 00000000000..5cf85a62392 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/BandedTriangularSolve.pbtxt @@ -0,0 +1,42 @@ +op { + name: "BandedTriangularSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "lower" + type: "bool" + default_value { + b: true + } + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + type: DT_HALF + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/StatelessParameterizedTruncatedNormal.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/StatelessParameterizedTruncatedNormal.pbtxt new file mode 100644 index 00000000000..598125677b1 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/StatelessParameterizedTruncatedNormal.pbtxt @@ -0,0 +1,65 @@ +op { + name: "StatelessParameterizedTruncatedNormal" + input_arg { + name: "shape" + type_attr: "S" + } + input_arg { + name: "seed" + type_attr: "Tseed" + } + input_arg { + name: "means" + type_attr: "dtype" + } + input_arg { + name: "stddevs" + type_attr: "dtype" + } + input_arg { + name: "minvals" + type_attr: "dtype" + } + input_arg { + name: "maxvals" + type_attr: "dtype" + } + output_arg { + name: "output" + type_attr: "dtype" + } + attr { + name: "S" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tseed" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "dtype" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index d99f8b8a479..1f1cf7444fb 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -3070,6 +3070,48 @@ op { } } } +op { + name: "BandedTriangularSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "lower" + type: "bool" + default_value { + b: true + } + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + type: DT_HALF + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} op { name: "Barrier" output_arg { @@ -48849,6 +48891,71 @@ op { } } } +op { + name: "StatelessParameterizedTruncatedNormal" + input_arg { + name: "shape" + type_attr: "S" + } + input_arg { + name: "seed" + type_attr: "Tseed" + } + input_arg { + name: "means" + type_attr: "dtype" + } + input_arg { + 
name: "stddevs" + type_attr: "dtype" + } + input_arg { + name: "minvals" + type_attr: "dtype" + } + input_arg { + name: "maxvals" + type_attr: "dtype" + } + output_arg { + name: "output" + type_attr: "dtype" + } + attr { + name: "S" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tseed" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "dtype" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "StatelessRandomBinomial" input_arg { From 0957b4306325a2a1c0bded767dd0199f118c123e Mon Sep 17 00:00:00 2001 From: Robert David Date: Thu, 18 Jun 2020 11:57:49 -0700 Subject: [PATCH 0517/1390] LSTM: Use int for 'count' values, fix usage of some appropriate intN_t for exact-size values. PiperOrigin-RevId: 317150337 Change-Id: I5cd8bf8b8231f16cb2d130a8209c45a857b30d21 --- tensorflow/lite/kernels/lstm.cc | 13 +++++++------ tensorflow/lite/kernels/lstm_eval.cc | 14 +++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc index 0e0c1b9c0f0..aa6a112a022 100644 --- a/tensorflow/lite/kernels/lstm.cc +++ b/tensorflow/lite/kernels/lstm.cc @@ -78,14 +78,14 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16( auto* proj_params = static_cast( output_tensor->quantization.params); if (cell_clip > 0.0) { - integer_lstm_param->quantized_cell_clip = static_cast(std::min( + integer_lstm_param->quantized_cell_clip = static_cast(std::min( std::max(cell_clip / cell_state_params->scale->data[0], -32768.0f), 32767.0f)); } else { integer_lstm_param->quantized_cell_clip = 0; } if (proj_clip > 0.0) { - integer_lstm_param->quantized_proj_clip = static_cast(std::min( + integer_lstm_param->quantized_proj_clip = static_cast(std::min( std::max(proj_clip / proj_params->scale->data[0], -128.0f), 127.0f)); } else { integer_lstm_param->quantized_proj_clip = 0; @@ -703,14 +703,15 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8( output_tensor->quantization.params); TF_LITE_ENSURE_EQ(context, cell_state_params->scale->data[0], 1.0 / 32768); if (cell_clip > 0.0 && cell_clip < 1.0) { - integer_lstm_param->quantized_cell_clip = - static_cast(cell_clip / cell_state_params->scale->data[0]); + integer_lstm_param->quantized_cell_clip = static_cast(std::min( + std::max(cell_clip / cell_state_params->scale->data[0], -32768.0f), + 32767.0f)); } else { integer_lstm_param->quantized_cell_clip = 0; } if (proj_clip > 0.0) { - integer_lstm_param->quantized_proj_clip = - proj_clip / proj_params->scale->data[0]; + integer_lstm_param->quantized_proj_clip = static_cast(std::min( + std::max(proj_clip / proj_params->scale->data[0], -128.0f), 127.0f)); } else { integer_lstm_param->quantized_proj_clip = 0; } diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 2e7f300f9a9..f45d46762bf 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -1044,8 +1044,8 @@ inline void LstmStepInteger( const int32_t* recurrent_to_output_effective_bias, const int32_t* input_to_input_effective_bias, const int32_t* recurrent_to_input_effective_bias, - const int32_t* projection_effective_bias, int32 n_batch, int32 n_cell, - int32 n_input, int32 n_output, int8_t* output_state_ptr, + const int32_t* projection_effective_bias, int n_batch, int n_cell, + int n_input, int n_output, int8_t* 
output_state_ptr, int32_t output_state_zp, int16_t* cell_ptr, int8_t* output_ptr, int16_t* scratch_0_ptr, int16_t* scratch_1_ptr, int16_t* scratch_2_ptr, int16_t* scratch_3_ptr, int8_t* scratch_4_ptr, int32_t* scratch_5_ptr, @@ -1362,9 +1362,9 @@ void LstmStepInteger( const int32_t* cell_gate_bias_ptr, const int32_t* output_bias_ptr, const int32_t* proj_bias_ptr, const TfLiteLSTMParams* params, const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b, - const int32_t* intermediate_zp, int32 quantized_cell_clip, - int32 quantized_proj_clip, int32 n_batch, int32 n_cell, int32 n_input, - int32 n_output, int32 output_batch_leading_dim, int8_t* output_state_ptr, + const int32_t* intermediate_zp, int16_t quantized_cell_clip, + int8_t quantized_proj_clip, int n_batch, int n_cell, int n_input, + int n_output, int output_batch_leading_dim, int8_t* output_state_ptr, int32_t output_state_zp, int16_t* cell_ptr, int8_t* output_ptr, int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, int16_t* scratch4, int16_t* scratch5, int16_t* scratch6, @@ -2129,8 +2129,8 @@ TfLiteStatus EvalInteger8x8_8( int8_t* output_state_ptr = GetTensorData(output_state); int8_t* output_ptr = nullptr; - const int32 input_zp = input->params.zero_point; - const int32 output_state_zp = output_state->params.zero_point; + const int32_t input_zp = input->params.zero_point; + const int32_t output_state_zp = output_state->params.zero_point; // Get params for time/batch/sequence. const int output_batch_leading_dim = From 0cd2551d8c0ce6f1d8e95f465fa8f3d47054eb32 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Thu, 18 Jun 2020 12:00:17 -0700 Subject: [PATCH 0518/1390] Rewrite tf numpy's ndarray as a composite tensor. Expand composites when taking gradients. PiperOrigin-RevId: 317150904 Change-Id: I2fcb0eadd9797340e468bcf82afd17da0d0369e6 --- tensorflow/python/eager/backprop.py | 12 ++- tensorflow/python/eager/backprop_test.py | 69 ++++++++++++++ tensorflow/python/eager/function.py | 4 +- tensorflow/python/ops/numpy_ops/BUILD | 10 ++ tensorflow/python/ops/numpy_ops/np_arrays.py | 41 +++++++- .../python/ops/numpy_ops/np_arrays_test.py | 18 ++++ .../python/ops/numpy_ops/np_interop_test.py | 94 +++++++++++++++++++ 7 files changed, 241 insertions(+), 7 deletions(-) create mode 100644 tensorflow/python/ops/numpy_ops/np_interop_test.py diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index dc7bb7c4b11..5c2deb9c0f2 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -14,6 +14,9 @@ # ============================================================================== """Code for backpropagation using the tape utilities.""" +# TODO(b/159343581): Properly support CompositeTensor in all functions in this +# file. 
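To make the `expand_composites` switch in the changes below concrete, here is a small sketch, not from the patch, of what expanding composites means when flattening a structure. With this change, `GradientTape.gradient` flattens its targets, sources, and output gradients the same way, so a `CompositeTensor` source is decomposed into its component tensors for backprop and `pack_sequence_as(..., expand_composites=True)` reassembles the per-component gradients into the composite structure, such as the numpy-mode `ndarray` this commit targets.

```python
import tensorflow as tf

st = tf.sparse.SparseTensor(indices=[[0, 0]], values=[3.0], dense_shape=[2, 2])

# Without expansion the composite is treated as a single opaque leaf.
print(len(tf.nest.flatten(st)))                           # 1
# With expansion it is decomposed into its component tensors
# (indices, values and dense_shape for a SparseTensor).
print(len(tf.nest.flatten(st, expand_composites=True)))   # 3
```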
+ from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -1021,7 +1024,7 @@ class GradientTape(object): "derivatives.", 1) flat_targets = [] - for t in nest.flatten(target): + for t in nest.flatten(target, expand_composites=True): if not backprop_util.IsTrainable(t): logging.vlog( logging.WARN, "The dtype of the target tensor must be " @@ -1032,7 +1035,7 @@ class GradientTape(object): t = ops.convert_to_tensor(t) flat_targets.append(t) - flat_sources = nest.flatten(sources) + flat_sources = nest.flatten(sources, expand_composites=True) flat_sources_raw = flat_sources flat_sources = [_handle_or_self(x) for x in flat_sources] for t in flat_sources_raw: @@ -1048,7 +1051,8 @@ class GradientTape(object): if output_gradients is not None: output_gradients = [None if x is None else ops.convert_to_tensor(x) - for x in nest.flatten(output_gradients)] + for x in nest.flatten( + output_gradients, expand_composites=True)] flat_grad = imperative_grad.imperative_grad( self._tape, @@ -1063,7 +1067,7 @@ class GradientTape(object): self._watched_variables = self._tape.watched_variables() self._tape = None - grad = nest.pack_sequence_as(sources, flat_grad) + grad = nest.pack_sequence_as(sources, flat_grad, expand_composites=True) return grad def jacobian(self, diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index a0f98fc0a44..abdac526ce4 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import tape as tape_lib from tensorflow.python.eager import test +from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -36,6 +37,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util +from tensorflow.python.framework import type_spec from tensorflow.python.framework.memory_checker import MemoryChecker from tensorflow.python.layers.pooling import max_pooling3d from tensorflow.python.ops import array_ops @@ -52,6 +54,44 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variables from tensorflow.python.training import training +from tensorflow.python.util import nest + + +# TODO(nareshmodi): This is copied from composite_tensor_test.py. Extract it out +# to a common library to avoid duplication. 
+class CTSpec(type_spec.TypeSpec): + """A generic CompositeTensor TypeSpec, used for constructing tests.""" + + def __init__(self, component_specs): + self.component_specs = component_specs + + value_type = property(lambda self: CT) + _component_specs = property(lambda self: self.component_specs) + + def _serialize(self): + return (self.component_specs,) + + def _to_components(self, value): + return value.components + + def _from_components(self, tensor_list): + return CT(tensor_list) + + +class CT(composite_tensor.CompositeTensor): + """A generic CompositeTensor, used for constructing tests.""" + _type_spec_class = CTSpec + + def __init__(self, components): + if isinstance(components, list): + components = tuple(components) + self.components = components + + @property + def _type_spec(self): + component_specs = nest.map_structure(type_spec.type_spec_from_value, + self.components) + return self._type_spec_class(component_specs) class BackpropTest(test.TestCase, parameterized.TestCase): @@ -1581,6 +1621,35 @@ class BackpropTest(test.TestCase, parameterized.TestCase): memory_checker.report() memory_checker.assert_no_leak_if_all_possibly_except_one() + def testCompositeTensorAsSource(self): + t = CT([constant_op.constant(3.), constant_op.constant(2.)]) + with backprop.GradientTape() as gt: + gt.watch(t) + y = CT([t.components[0] * 2, t.components[1] * 3]) + + grad = gt.gradient(y, t) + expected_grad = CT([constant_op.constant(2.), constant_op.constant(3.)]) + + flat_grads = nest.flatten(grad, expand_composites=True) + flat_expected_grads = nest.flatten(expected_grad, expand_composites=True) + + self.assertAllClose(flat_grads, flat_expected_grads) + + def testCompositeTensorAsOutputGradients(self): + t = CT([constant_op.constant(3.), constant_op.constant(2.)]) + with backprop.GradientTape() as gt: + gt.watch(t) + y = CT([t.components[0] * 2, t.components[1] * 3]) + + output_gradients = CT([constant_op.constant(5.), constant_op.constant(10.)]) + grad = gt.gradient(y, t, output_gradients=output_gradients) + expected_grad = CT([constant_op.constant(10.), constant_op.constant(30.)]) + + flat_grads = nest.flatten(grad, expand_composites=True) + flat_expected_grads = nest.flatten(expected_grad, expand_composites=True) + + self.assertAllClose(flat_grads, flat_expected_grads) + class JacobianTest(test.TestCase): diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index c02318cb814..ca1e60c1b7b 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -2626,7 +2626,9 @@ def _is_ndarray(value): # For legacy reasons we do not automatically promote Numpy strings. or isinstance(value, np.str_) # NumPy dtypes have __array__ as unbound methods. - or isinstance(value, type)) + or isinstance(value, type) + # CompositeTensors should be flattened instead. 
+      or isinstance(value, composite_tensor.CompositeTensor)) def _convert_numpy_inputs(inputs): diff --git a/tensorflow/python/ops/numpy_ops/BUILD b/tensorflow/python/ops/numpy_ops/BUILD index 5879bc9f062..3f18a7b3e01 100644 --- a/tensorflow/python/ops/numpy_ops/BUILD +++ b/tensorflow/python/ops/numpy_ops/BUILD @@ -115,3 +115,13 @@ cuda_py_test( "@absl_py//absl/testing:parameterized", ], ) + +cuda_py_test( + name = "np_interop_test", + srcs = ["np_interop_test.py"], + deps = [ + ":numpy", + "//tensorflow/python:platform", + "@absl_py//absl/testing:parameterized", + ], +) diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py index e2f73100909..8bec8a469a2 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays.py @@ -24,10 +24,13 @@ import numbers import numpy as np import six +from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec +from tensorflow.python.framework import type_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.numpy_ops import np_dtypes @@ -175,7 +178,38 @@ def convert_to_tensor(value, dtype=None, dtype_hint=None): return ops.convert_to_tensor(value, dtype=dtype, dtype_hint=dtype_hint) -class ndarray(object): # pylint: disable=invalid-name +class NdarraySpec(type_spec.BatchableTypeSpec): + """Type specification for a `tf.experimental.numpy.ndarray`.""" + + value_type = property(lambda self: ndarray) + + def __init__(self, data_spec): + if not isinstance(data_spec, tensor_spec.TensorSpec): + raise ValueError('NdarraySpec.__init__ was expecting a tf.TypeSpec, ' + 'but got a {} instead.'.format(type(data_spec))) + self._data_spec = data_spec + + @property + def _component_specs(self): + return self._data_spec + + def _to_components(self, value): + return value.data + + def _from_components(self, data): + return tensor_to_ndarray(data) + + def _serialize(self): + return (self._data_spec,) + + def _batch(self, batch_size): + return NdarraySpec(self._data_spec.batch(batch_size)) + + def _unbatch(self): + return NdarraySpec(self._data_spec.unbatch()) + + +class ndarray(composite_tensor.CompositeTensor): # pylint: disable=invalid-name """Equivalent of numpy.ndarray backed by TensorFlow tensors. This does not support all features of NumPy ndarrays e.g. 
strides and @@ -236,7 +270,10 @@ class ndarray(object): # pylint: disable=invalid-name if dtype and dtype != buffer.dtype: buffer = array_ops.bitcast(buffer, dtype) self._data = buffer - self.base = None + + @property + def _type_spec(self): + return NdarraySpec(type_spec.type_spec_from_value(self._data)) @property def data(self): diff --git a/tensorflow/python/ops/numpy_ops/np_arrays_test.py b/tensorflow/python/ops/numpy_ops/np_arrays_test.py index feced98438d..412addc0ad7 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays_test.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays_test.py @@ -29,6 +29,7 @@ from tensorflow.python.ops.numpy_ops import np_arrays # Required for operator overloads from tensorflow.python.ops.numpy_ops import np_math_ops # pylint: disable=unused-import from tensorflow.python.platform import test +from tensorflow.python.util import nest t2a = np_arrays.tensor_to_ndarray @@ -182,6 +183,23 @@ class ArrayTest(test.TestCase): with self.assertRaisesWithPredicateMatch(TypeError, r'unhashable type'): hash(a) + def testFromToCompositeTensor(self): + tensors = [t2a(ops.convert_to_tensor(0.1)), t2a(ops.convert_to_tensor(0.2))] + + flattened = nest.flatten(tensors, expand_composites=True) + # Each ndarray contains only one tensor, so the flattened output should be + # just 2 tensors in a list. + self.assertLen(flattened, 2) + self.assertIsInstance(flattened[0], ops.Tensor) + self.assertIsInstance(flattened[1], ops.Tensor) + + repacked = nest.pack_sequence_as(tensors, flattened, expand_composites=True) + self.assertLen(repacked, 2) + self.assertIsInstance(repacked[0], np_arrays.ndarray) + self.assertIsInstance(repacked[1], np_arrays.ndarray) + + self.assertAllClose(tensors, repacked) + if __name__ == '__main__': # TODO(wangpeng): Test in graph mode as well. diff --git a/tensorflow/python/ops/numpy_ops/np_interop_test.py b/tensorflow/python/ops/numpy_ops/np_interop_test.py new file mode 100644 index 00000000000..052949dff9d --- /dev/null +++ b/tensorflow/python/ops/numpy_ops/np_interop_test.py @@ -0,0 +1,94 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for interop between TF ops, numpy_ops, and numpy methods.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.python.eager import backprop +from tensorflow.python.eager import def_function +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops.numpy_ops import np_array_ops +from tensorflow.python.ops.numpy_ops import np_arrays +from tensorflow.python.platform import test + + +class InteropTest(test.TestCase): + + def testGradientTapeInterop(self): + with backprop.GradientTape() as t: + x = np_array_ops.asarray(3.0) + y = np_array_ops.asarray(2.0) + + t.watch([x, y]) + + xx = 2 * x + yy = 3 * y + + dx, dy = t.gradient([xx, yy], [x, y]) + + # TODO(nareshmodi): Gradient tape returns tensors. Is it possible to rewrap? + self.assertAllClose(dx, 2.0) + self.assertAllClose(dy, 3.0) + + def testFunctionInterop(self): + x = np_array_ops.asarray(3.0) + y = np_array_ops.asarray(2.0) + + add = lambda x, y: x + y + add_fn = def_function.function(add) + + raw_result = add(x, y) + fn_result = add_fn(x, y) + + self.assertIsInstance(raw_result, np_arrays.ndarray) + self.assertIsInstance(fn_result, np_arrays.ndarray) + self.assertAllClose(raw_result, fn_result) + + def testCondInterop(self): + x = np_array_ops.asarray(3.0) + + def fn(x): + x_plus_1 = control_flow_ops.cond(x > 0, lambda: x+1, lambda: x+2) + x_plus_2 = control_flow_ops.cond(x < 0, lambda: x+1, lambda: x+2) + + return x_plus_1, x_plus_2 + + raw_x_plus_1, raw_x_plus_2 = fn(x) + fn_x_plus_1, fn_x_plus_2 = def_function.function(fn)(x) + + self.assertAllClose(raw_x_plus_1, x + 1) + self.assertAllClose(raw_x_plus_2, x + 2) + + self.assertAllClose(fn_x_plus_1, x + 1) + self.assertAllClose(fn_x_plus_2, x + 2) + + def testWhileInterop(self): + def fn(): + x = np_array_ops.asarray(0) + c = lambda x: x < 10000 + b = lambda x: [x + 1] + return control_flow_ops.while_loop_v2(c, b, [x], parallel_iterations=20) + + self.assertEqual(10000, fn()[0]) + self.assertEqual(10000, def_function.function(fn)()[0]) + + +if __name__ == '__main__': + ops.enable_eager_execution() + test.main() From 629e6077d0c4ccc342a15496cd91becedfc02bc3 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 18 Jun 2020 12:25:56 -0700 Subject: [PATCH 0519/1390] [tf.data service] Add test that options are applied to the distributed dataset PiperOrigin-RevId: 317156222 Change-Id: Ie1956a3ef474371e381618c4808fee7a8ef8ee08 --- .../kernel_tests/data_service_ops_test.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index 2356a866d6e..796ab328980 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -23,6 +23,7 @@ from absl.testing import parameterized from tensorflow.python.data.experimental.ops import data_service_ops from tensorflow.python.data.experimental.ops import distribute_options +from tensorflow.python.data.experimental.ops import testing from tensorflow.python.data.experimental.service import server_lib from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops @@ -30,6 +31,7 @@ from tensorflow.python.eager import def_function from 
tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import tensor_array_ops from tensorflow.python.platform import test @@ -317,6 +319,34 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): results.append(elem.numpy()) self.assertCountEqual(num_repetitions * list(range(num_elements)), results) + @combinations.generate(test_base.eager_only_combinations()) + def testApplyDeterminismOption(self): + elements = list(range(10)) + master_address = self.create_cluster(1) + + def dataset_fn(delay_ms): + + def interleave_fn(x): + ds = dataset_ops.Dataset.from_tensors(x) + if math_ops.equal(x, 0): + ds = ds.apply(testing.sleep(delay_ms * 1000)) + else: + ds = ds.apply(testing.sleep(0)) + return ds + + ds = dataset_ops.Dataset.from_tensor_slices(elements) + ds = ds.interleave(interleave_fn, cycle_length=10, num_parallel_calls=10) + opts = dataset_ops.Options() + opts.experimental_deterministic = False + ds = ds.with_options(opts) + ds = _make_distributed_dataset(ds, master_address) + return ds + + self.checkDeterminism( + dataset_fn=dataset_fn, + expect_determinism=False, + expected_elements=elements) + def run_stateful(self, external_state_policy): num_elements = 10 ds = dataset_ops.Dataset.range(num_elements).map( From d8bfc935fdc77ddc138e415dd8bb47f9817c4d5e Mon Sep 17 00:00:00 2001 From: Reed Date: Thu, 18 Jun 2020 12:42:19 -0700 Subject: [PATCH 0520/1390] Add MKL support to auto_mixed_precision. This extends the auto mixed precision grappler pass to support converting nodes to bfloat16 on MKL-supported CPUs. Co-authored-by: Niranjan Hasabnis --- tensorflow/core/grappler/optimizers/BUILD | 10 +- .../optimizers/auto_mixed_precision.cc | 229 +++--- .../optimizers/auto_mixed_precision.h | 20 +- .../optimizers/auto_mixed_precision_lists.h | 257 +++++-- .../optimizers/auto_mixed_precision_test.cc | 234 ++++++- .../grappler/optimizers/meta_optimizer.cc | 11 +- .../core/protobuf/rewriter_config.proto | 8 +- .../grappler/auto_mixed_precision_test.py | 661 ++++++++++-------- 8 files changed, 951 insertions(+), 479 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 7432e2d54ea..f9b4c6e6d81 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -1,5 +1,5 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cc_test_mkl", "tf_kernel_library") -load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_cuda_cc_test") # Platform specific build config load( @@ -7,6 +7,11 @@ load( "if_static", ) +load( + "//third_party/mkl:build_defs.bzl", + "mkl_deps", +) + package( licenses = ["notice"], # Apache 2.0 ) @@ -611,6 +616,7 @@ cc_library( "auto_mixed_precision_lists.h", ], visibility = ["//visibility:public"], + copts = tf_copts(), deps = [ ":custom_graph_optimizer_registry", ":graph_optimizer", @@ -627,7 +633,7 @@ cc_library( "//tensorflow/core/grappler/costs:virtual_placer", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", - ], + ] + mkl_deps(), ) tf_cuda_cc_test( diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index fa6ca3144a5..2e0b56fd3e2 100644 --- 
a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h" #include +#include #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" @@ -52,6 +53,7 @@ const std::pair kMinGPUArch = {0, 0}; const char kSuffix[] = "AutoMixedPrecision"; const char kCastToFp16[] = "CastToFp16"; +const char kCastToBf16[] = "CastToBf16"; const char kCastToFp32[] = "CastToFp32"; // Instances of this class represent unique type attribute identifiers within a @@ -840,22 +842,6 @@ DataTypeSet AllowedDataTypes(const OpDef& op_def, const TypeAttrId& t_attr_id) { return AllowedDataTypes(*attr_def); } -NodeDef BuildCastNode(const MutableGraphView::OutputPort& src, bool to_fp16, - const string& device) { - const char* cast_string = to_fp16 ? kCastToFp16 : kCastToFp32; - string name = strings::StrCat(src.node->name(), "-", src.port_id, "-", - cast_string, "-", kSuffix); - NodeDef node; - node.set_name(name); - node.set_op("Cast"); - node.set_device(device); - node.add_input(strings::StrCat(src.node->name(), ":", src.port_id)); - (*node.mutable_attr())["SrcT"].set_type(to_fp16 ? DT_FLOAT : DT_HALF); - (*node.mutable_attr())["DstT"].set_type(to_fp16 ? DT_HALF : DT_FLOAT); - (*node.mutable_attr())["Truncate"].set_b(false); - return node; -} - Status ValidateLists(const gtl::FlatSet& white_list, const gtl::FlatSet& black_list, const gtl::FlatSet& gray_list, @@ -941,7 +927,8 @@ class AutoMixedPrecisionImpl { public: AutoMixedPrecisionImpl(Cluster* cluster, const std::unordered_set& nodes_to_preserve, - GraphDef* graph, string id) + GraphDef* graph, string id, + AutoMixedPrecisionMode mode) : virtual_placer_(cluster->GetDevices()), nodes_to_preserve_(nodes_to_preserve), graph_(graph), @@ -949,23 +936,35 @@ class AutoMixedPrecisionImpl { id_(id), graph_view_(graph), cuda_version_(GetCudaVersion(*cluster)), - cudnn_version_(GetCudnnVersion(*cluster)) {} + cudnn_version_(GetCudnnVersion(*cluster)), + mode_(mode), + target_dtype_(mode_ == AutoMixedPrecisionMode::CUDA ? 
DT_HALF + : DT_BFLOAT16) {} Status Optimize(); private: typedef absl::flat_hash_set NodeTypeIdSet; + std::unique_ptr get_mixed_precision_lists() const { + switch (mode_) { + case AutoMixedPrecisionMode::CUDA: + return std::make_unique(cuda_version_, + cudnn_version_); + case AutoMixedPrecisionMode::MKL: + return std::make_unique(); + } + } Status PrintDebugLogs(bool preop, size_t timestamp); void LogSkippedNode(const NodeDef& node) const; bool MustPreserve(const NodeDef& node) const; - bool IsOnGPU(const NodeDef& node) const; + bool IsOnDevice(const NodeDef& node, const string& device_type) const; bool IsOnSuitableGPUArch(const NodeDef& node) const; bool ShouldProcess(const NodeDef& node) const; - bool NodeHasFP16KernelForTypeAttr(const NodeDef& node, TypeAttrId taid) const; + bool NodeHasF16KernelForTypeAttr(const NodeDef& node, TypeAttrId taid) const; bool NodeImplicitlyReadsNonResourceVariable(const NodeDef& node) const; void ConvertBatchNormOpsToV2(); - bool SupportsFloat16(const NodeTypeId& node_type) const; + bool SupportsF16(const NodeTypeId& node_type) const; const NodeTypeId* GetTensorListFloat32NodeTypeId(const NodeDef& node) const; bool IsSourceOrSinkOp(const string& op) const; void FindFloat32TensorListOpClustersAndBlacklistUnsafe( @@ -990,6 +989,8 @@ class AutoMixedPrecisionImpl { absl::flat_hash_set* white_set) const; void MakeCastsWhiteIfAllOutputsWhite( absl::flat_hash_set* white_set) const; + NodeDef BuildCastNode(const MutableGraphView::OutputPort& src, bool to_f16, + const string& device) const; Status ChangeTypeAttrsAndAddCasts(const absl::flat_hash_set& white_set); VirtualPlacer virtual_placer_; @@ -1003,21 +1004,44 @@ class AutoMixedPrecisionImpl { NodeTypeAttrMap node_type_map_; GraphTypeTopologyView graph_type_view_; bool force_all_fp16_; - gtl::FlatSet fp16_whitelist_; - gtl::FlatSet fp16_blacklist_; - gtl::FlatSet fp16_graylist_; - gtl::FlatSet fp16_clearlist_; + AutoMixedPrecisionMode mode_; + gtl::FlatSet f16_whitelist_; + gtl::FlatSet f16_blacklist_; + gtl::FlatSet f16_graylist_; + gtl::FlatSet f16_clearlist_; absl::flat_hash_set should_process_nodes_; + DataType target_dtype_; // Either DT_HALF or DT_BFLOAT16 }; -bool AutoMixedPrecisionImpl::NodeHasFP16KernelForTypeAttr( +NodeDef AutoMixedPrecisionImpl::BuildCastNode( + const MutableGraphView::OutputPort& src, bool to_f16, + const string& device) const { + DataType src_type = to_f16 ? DT_FLOAT : target_dtype_; + DataType dst_type = to_f16 ? target_dtype_ : DT_FLOAT; + const char* cast_string = + !to_f16 ? kCastToFp32 + : target_dtype_ == DT_HALF ? 
kCastToFp16 : kCastToBf16; + string name = strings::StrCat(src.node->name(), "-", src.port_id, "-", + cast_string, "-", kSuffix); + NodeDef node; + node.set_name(name); + node.set_op("Cast"); + node.set_device(device); + node.add_input(strings::StrCat(src.node->name(), ":", src.port_id)); + (*node.mutable_attr())["SrcT"].set_type(src_type); + (*node.mutable_attr())["DstT"].set_type(dst_type); + (*node.mutable_attr())["Truncate"].set_b(false); + return node; +} + +bool AutoMixedPrecisionImpl::NodeHasF16KernelForTypeAttr( const NodeDef& node, TypeAttrId taid) const { NodeDef node_copy(node); if (node.device().empty()) { string device_name = virtual_placer_.get_canonical_device_name(node); node_copy.set_device(device_name); } - if (!SetDataType(&node_copy, taid, DataType::DT_HALF)) { + if (!SetDataType(&node_copy, taid, target_dtype_)) { return false; } return IsKernelRegisteredForNode(node_copy).ok(); @@ -1053,21 +1077,22 @@ Status AutoMixedPrecisionImpl::PrintDebugLogs(bool preop, size_t timestamp) { fname = io::JoinPath(prepend_path, strings::StrCat("paintbuckets", suffix, ".txt")); f.open(fname.c_str(), std::fstream::out); + std::unique_ptr mp_lists = + get_mixed_precision_lists(); f << "WhiteList:\n"; - for (const auto& x : - AutoMixedPrecisionLists::WhiteList(cuda_version_, cudnn_version_)) { + for (const auto& x : mp_lists->WhiteList()) { f << x << "\n"; } f << "\nBlackList:\n"; - for (const auto& x : AutoMixedPrecisionLists::BlackList()) { + for (const auto& x : mp_lists->BlackList()) { f << x << "\n"; } f << "\nGrayList:\n"; - for (const auto& x : AutoMixedPrecisionLists::GrayList()) { + for (const auto& x : mp_lists->GrayList()) { f << x << "\n"; } f << "\nClearList:\n"; - for (const auto& x : AutoMixedPrecisionLists::ClearList()) { + for (const auto& x : mp_lists->ClearList()) { f << x << "\n"; } f.close(); @@ -1088,7 +1113,8 @@ bool AutoMixedPrecisionImpl::MustPreserve(const NodeDef& node) const { return nodes_to_preserve_.count(node.name()); } -bool AutoMixedPrecisionImpl::IsOnGPU(const NodeDef& node) const { +bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node, + const string& device_type) const { string device_name; if (node.device().empty()) { device_name = virtual_placer_.get_canonical_device_name(node); @@ -1099,7 +1125,7 @@ bool AutoMixedPrecisionImpl::IsOnGPU(const NodeDef& node) const { string not_used; if (DeviceNameUtils::SplitDeviceName(device_name, ¬_used, &device) && absl::StrContains(absl::AsciiStrToLower(device), - absl::AsciiStrToLower(DEVICE_GPU))) { + absl::AsciiStrToLower(device_type))) { return true; } return false; @@ -1164,15 +1190,14 @@ bool IsTensorListWriterOp(const string& op) { return tensor_list_writer_ops.count(op); } -bool AutoMixedPrecisionImpl::SupportsFloat16( - const NodeTypeId& node_type) const { +bool AutoMixedPrecisionImpl::SupportsF16(const NodeTypeId& node_type) const { const OpDef* op_def; Status status = OpRegistry::Global()->LookUpOpDef(node_type.node->op(), &op_def); if (!status.ok()) return false; return AllowedDataTypes(*op_def, node_type.type_attr) - .Contains(DataType::DT_HALF) && - NodeHasFP16KernelForTypeAttr(*node_type.node, node_type.type_attr); + .Contains(target_dtype_) && + NodeHasF16KernelForTypeAttr(*node_type.node, node_type.type_attr); } // TODO(mconley): Make this change the node's name (to aid debugging). 
Need to @@ -1219,22 +1244,40 @@ Status AutoMixedPrecisionImpl::Optimize() { "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "", &optimization_level)); optimization_level = absl::AsciiStrToUpper(optimization_level); force_all_fp16_ = optimization_level == "UNSAFE_FORCE_ALL"; + if (force_all_fp16_ && mode_ == AutoMixedPrecisionMode::MKL) { + // Many ops do not support bfloat16 on the CPU so we disallow forcing to + // bfloat16. + return errors::InvalidArgument( + "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL cannot be set to " + "UNSAFE_FORCE_ALL when MKL is used"); + } - fp16_whitelist_ = - AutoMixedPrecisionLists::WhiteList(cuda_version_, cudnn_version_); - fp16_blacklist_ = AutoMixedPrecisionLists::BlackList(); - fp16_graylist_ = AutoMixedPrecisionLists::GrayList(); - fp16_clearlist_ = AutoMixedPrecisionLists::ClearList(); - TF_RETURN_IF_ERROR(ValidateLists(fp16_whitelist_, fp16_blacklist_, - fp16_graylist_, fp16_clearlist_)); + std::unique_ptr mp_lists = + get_mixed_precision_lists(); + f16_whitelist_ = mp_lists->WhiteList(); + f16_blacklist_ = mp_lists->BlackList(); + f16_graylist_ = mp_lists->GrayList(); + f16_clearlist_ = mp_lists->ClearList(); + TF_RETURN_IF_ERROR(ValidateLists(f16_whitelist_, f16_blacklist_, + f16_graylist_, f16_clearlist_)); size_t timestamp = Env::Default()->NowMicros() / 1000; TF_RETURN_IF_ERROR(PrintDebugLogs(/* preop = */ true, timestamp)); VLOG(2) << "Identifying nodes that should be processed"; for (const NodeDef& node : graph_->node()) { - if (!MustPreserve(node) && IsOnGPU(node) && - (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node))) { + bool should_process; + switch (mode_) { + case AutoMixedPrecisionMode::CUDA: + should_process = + !MustPreserve(node) && IsOnDevice(node, DEVICE_GPU) && + (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node)); + break; + case AutoMixedPrecisionMode::MKL: + should_process = !MustPreserve(node) && IsOnDevice(node, DEVICE_CPU); + break; + } + if (should_process) { should_process_nodes_.insert(&node); } else { LogSkippedNode(node); @@ -1260,29 +1303,29 @@ Status AutoMixedPrecisionImpl::Optimize() { for (const auto& cluster : tensor_list_clusters) { VLOG(1) << "Found safe Tensor List cluster of size " << cluster.size(); for (const NodeDef* node : cluster) { - VLOG(2) << "Cluster member: " << node->op() << " node " << node->name(); + VLOG(2) << " Cluster member: " << node->op() << " node " << node->name(); } FindTensorListImplicitFloat32Edges(cluster, &ephemeral_edges); } TF_RETURN_IF_ERROR(graph_type_view_.AddEphemeralEdges(ephemeral_edges)); - // The goal here is to change performance-critical ops to fp16, and to do so - // with the minimal number of casts, subject to the constraint that the + // The goal here is to change performance-critical ops to fp16 or bf16, and to + // do so with the minimal number of casts, subject to the constraint that the // model's convergence is not affected. This is achieved by first identifying - // which nodes should be changed to fp16 and then inserting casts at the - // boundaries between fp16/non-fp16 nodes. + // which nodes should be changed to f16 and then inserting casts at the + // boundaries between f16/non-f16 nodes. - // The algorithm for deciding which nodes to change to fp16 is as follows: + // The algorithm for deciding which nodes to change to f16 is as follows: // 1) Add all performance-critical ops (aka "whitelist" ops) to the white_set. 
// This is done under the assumption that whitelist ops are always - // numerically-safe in fp16 and that they are the most important ops for + // numerically-safe in f16 and that they are the most important ops for // improving performance. // 2) Add nodes to the black_set iff they are numerically-dangerous (aka // "blacklist" ops) or they are on a forward path from a blacklist node to // a black/gray node (including the node at the end of the path) through // non-numerically-dangerous ops (aka "greylist" and "clearlist" ops). // This is done to prevent numerically-dangerous ops and their downstream - // effects from being changed to fp16, which would risk breaking the + // effects from being changed to f16, which would risk breaking the // numerical accuracy of the model. // 3) For all remaining nodes that are not considered dangerous (greylist // and clearlist ops), find those that are between (i.e., both upstream @@ -1480,7 +1523,7 @@ void AutoMixedPrecisionImpl::AddWhitelistOps( const NodeTypeId& root = *graph_type_view_.GetNode(root_idx); if (!ShouldProcess(*root.node)) continue; bool force_white = force_all_fp16_ && CanForceFP16(*root.node); - if (fp16_whitelist_.count(root.node->op()) || force_white) { + if (f16_whitelist_.count(root.node->op()) || force_white) { bool inserted = white_set->insert(root_idx).second; if (VLOG_IS_ON(2) && inserted) { VLOG(2) << "Painting type " << root.type_attr.DebugString() @@ -1504,8 +1547,8 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray( absl::flat_hash_set upstream_of_black_or_gray_set; for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) { const NodeTypeId& root = *graph_type_view_.GetNode(root_idx); - if (!(fp16_blacklist_.count(root.node->op()) || - fp16_graylist_.count(root.node->op()))) { + if (!(f16_blacklist_.count(root.node->op()) || + f16_graylist_.count(root.node->op()))) { continue; } DfsTypeTraversal(graph_type_view_, {&root}, @@ -1514,7 +1557,7 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray( const NodeTypeId& item = *graph_type_view_.GetNode(idx); return idx == root_idx || (!upstream_of_black_or_gray_set.count(idx) && - fp16_clearlist_.count(item.node->op())); + f16_clearlist_.count(item.node->op())); }), DfsTypeCallbacks::PreOrder([&](int idx) { upstream_of_black_or_gray_set.insert(idx); @@ -1524,7 +1567,7 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray( // Propagate black forward through nodes in upstream_of_black_or_gray_set. 
for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) { const NodeTypeId& root = *graph_type_view_.GetNode(root_idx); - if (black_set->count(root_idx) || !fp16_blacklist_.count(root.node->op())) { + if (black_set->count(root_idx) || !f16_blacklist_.count(root.node->op())) { continue; } DfsTypeTraversal( @@ -1552,7 +1595,7 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite( absl::flat_hash_set downstream_of_white_set; for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) { const NodeTypeId& root = *graph_type_view_.GetNode(root_idx); - if (!ShouldProcess(*root.node) || !fp16_whitelist_.count(root.node->op())) { + if (!ShouldProcess(*root.node) || !f16_whitelist_.count(root.node->op())) { continue; } DfsTypeTraversal( @@ -1561,14 +1604,14 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite( const NodeTypeId& item = *graph_type_view_.GetNode(idx); return idx == root_idx || (!downstream_of_white_set.count(idx) && - !fp16_whitelist_.count(item.node->op()) && + !f16_whitelist_.count(item.node->op()) && !black_set.count(idx) && ShouldProcess(*item.node) && // TODO(benbarsdell): Consider allowing propagation through // ops that are already float16 in order to reduce the number // of casts. - IsFloat32(item) && SupportsFloat16(item) && - (fp16_clearlist_.count(item.node->op()) || - fp16_graylist_.count(item.node->op()))); + IsFloat32(item) && SupportsF16(item) && + (f16_clearlist_.count(item.node->op()) || + f16_graylist_.count(item.node->op()))); }), DfsTypeCallbacks::PreOrder( [&](int idx) { downstream_of_white_set.insert(idx); })); @@ -1579,7 +1622,7 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite( for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) { const NodeTypeId& root = *graph_type_view_.GetNode(root_idx); if (!ShouldProcess(*root.node) || upstream_of_white_set.count(root_idx) || - !fp16_whitelist_.count(root.node->op())) { + !f16_whitelist_.count(root.node->op())) { continue; } DfsTypeTraversal( @@ -1620,8 +1663,8 @@ void AutoMixedPrecisionImpl::PropagateWhiteThroughClear( return idx == root_idx || (!white_set->count(idx) && !black_set.count(idx) && ShouldProcess(*item.node) && IsFloat32(item) && - SupportsFloat16(item) && - (fp16_clearlist_.count(item.node->op())) && + SupportsF16(item) && + (f16_clearlist_.count(item.node->op())) && // We don't propagate (backwards) through nodes that read // Variables because it can break the behavior of TensorBoard // visualization and/or (in the case of Enter nodes) the model @@ -1806,13 +1849,13 @@ void AutoMixedPrecisionImpl::MakeCastsWhiteIfAllOutputsWhite( } } -// Changes all white-painted type attributes to DT_HALF, and inserts Cast nodes -// at node outputs for all edges that connect white-painted <-> -// non-white-painted type attributes. +// Changes all white-painted type attributes to DT_HALF or DT_BFLOAT16, and +// inserts Cast nodes at node outputs for all edges that connect +// white-painted <-> non-white-painted type attributes. 
Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts( const absl::flat_hash_set& white_set) { int num_nodes_changed = 0; - int num_nonvar_casts_to_fp16 = 0; + int num_nonvar_casts_to_f16 = 0; int num_nodes_preop = graph_->node_size(); for (int node_idx = 0; node_idx < num_nodes_preop; ++node_idx) { NodeDef* node = graph_->mutable_node(node_idx); @@ -1829,8 +1872,9 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts( bool src_is_white = white_set.count(node_type_idx); if (src_is_white) { VLOG(1) << "Changing type " << type_attr.DebugString() << " of " - << node->op() << " node " << node->name() << " to DT_HALF"; - if (!SetDataType(node, type_attr, DT_HALF)) { + << node->op() << " node " << node->name() << " to " + << DataTypeString(target_dtype_); + if (!SetDataType(node, type_attr, target_dtype_)) { return errors::Internal("Failed to set type attribute"); } ++num_nodes_changed; @@ -1855,16 +1899,16 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts( bool dst_is_white = white_set.count(dst_type_idx); if (src_is_white != dst_is_white) { if (!added_cast_node) { - bool to_fp16 = dst_is_white; + bool to_f16 = dst_is_white; VLOG(1) << "Inserting cast to " - << (to_fp16 ? "DT_HALF" : "DT_FLOAT") << " at " - << src.node->op() << " " << src.node->name() << ":" - << src.port_id; + << (to_f16 ? DataTypeString(target_dtype_) : "DT_FLOAT") + << " at " << src.node->op() << " " << src.node->name() + << ":" << src.port_id; added_cast_node = graph_view_.AddNode( - BuildCastNode(src, to_fp16, src.node->device())); - if (to_fp16 && !IsConstant(*node) && !IsVariable(*node) && + BuildCastNode(src, to_f16, src.node->device())); + if (to_f16 && !IsConstant(*node) && !IsVariable(*node) && !NodeImplicitlyReadsNonResourceVariable(*node)) { - ++num_nonvar_casts_to_fp16; + ++num_nonvar_casts_to_f16; } } TF_RETURN_IF_ERROR(graph_view_.UpdateRegularFaninByPort( @@ -1874,9 +1918,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts( } } } + // Use Python type names (e.g. float16) instead of C++ type names (e.g. half) + // since many Python users will see this message. + const char* type_str = target_dtype_ == DT_HALF ? "float16" : "bfloat16"; LOG(INFO) << "Converted " << num_nodes_changed << "/" << num_nodes_preop - << " nodes to float16 precision using " << num_nonvar_casts_to_fp16 - << " cast(s) to float16 (excluding Const and Variable casts)"; + << " nodes to " << type_str << " precision using " + << num_nonvar_casts_to_f16 << " cast(s) to " << type_str + << " (excluding Const and Variable casts)"; return Status::OK(); } @@ -1902,12 +1950,23 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, return errors::InvalidArgument("cluster == nullptr"); } +#if !defined(INTEL_MKL) || !defined(ENABLE_INTEL_MKL_BFLOAT16) + if (mode_ == AutoMixedPrecisionMode::MKL) { + return errors::Unimplemented( + "The auto_mixed_precision_mkl optimizer cannot be used since " + "this build of TensorFlow is not compiled with MKL support for bfloat16. " + "For information on MKL builds, see: " + "https://software.intel.com/en-us/articles/intel-optimization-for-" + "tensorflow-installation-guide"); + } +#endif + // Start by copying input graph to output. *output = item.graph; int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster) : GetNumGPUs(*cluster, kMinGPUArch); - if (num_gpus < 1) { + if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) { // AutoMixedPrecision is currently only tuned for GPU. 
LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name() << " graph optimizer"; @@ -1916,7 +1975,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, // Optimize the output graph in-place. AutoMixedPrecisionImpl optimizer(cluster, item.NodesToPreserve(), output, - item.id); + item.id, mode_); if (item.id == "tf_graph") { LOG(INFO) << "Running " << name() << " graph optimizer"; } else { diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h index 163d1f6923f..c41ba7d2821 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h @@ -22,16 +22,25 @@ limitations under the License. namespace tensorflow { namespace grappler { -// Convert data types to float16 where appropriate to improve performance on -// GPUs. +enum class AutoMixedPrecisionMode { CUDA, MKL }; + +// Convert data types to float16 or bfloat16 where appropriate to improve +// performance on GPUs or CPUs. class AutoMixedPrecision : public GraphOptimizer { public: + // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If MKL, + // converts nodes to bfloat16 on CPUs in order to take advantage of MKL + // performance improvements with bfloat16. explicit AutoMixedPrecision( - RewriterConfig::Toggle opt_level = RewriterConfig::ON) {} + AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA) + : mode_(mode) {} ~AutoMixedPrecision() override {} - string name() const override { return "auto_mixed_precision"; }; + string name() const override { + return mode_ == AutoMixedPrecisionMode::CUDA ? "auto_mixed_precision_cuda" + : "auto_mixed_precision_mkl"; + }; bool UsesFunctionLibrary() const override { return false; } @@ -40,6 +49,9 @@ class AutoMixedPrecision : public GraphOptimizer { void Feedback(Cluster* cluster, const GrapplerItem& item, const GraphDef& optimize_output, double result) override; + + private: + const AutoMixedPrecisionMode mode_; }; } // end namespace grappler diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index d3d13e2edc0..c6016548117 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -23,10 +23,44 @@ limitations under the License. namespace tensorflow { namespace grappler { +// Represents the four lists of ops: the white list, gray list, black list, and +// clear list. These lists determine which ops are converted to fp16/bf16 +// (referred to as 'f16' for short) and which ops stay as fp32. class AutoMixedPrecisionLists { - private: - static void UpdateList(gtl::FlatSet* list, const string& to_add, - const string& to_remove) { + public: + + virtual ~AutoMixedPrecisionLists() {} + + // Returns the set of ops that are considered numerically-safe (for execution + // in f16), performance-critical, and can run in f16. These ops are always + // converted to f16. + virtual gtl::FlatSet WhiteList() = 0; + // Returns the set of ops that can run in f16 and are considered numerically- + // safe (for execution in f16), but which may be made unsafe by an upstream + // blacklist op. + virtual gtl::FlatSet GrayList() = 0; + // Returns the set of ops that are considered numerically-dangerous (i.e., + // unsafe for execution in f16) and whose effects may also be observed in + // downstream nodes (e.g. 
for f16, in Exp -> Add, the Add is unsafe due to + // the Exp). + virtual gtl::FlatSet BlackList() = 0; + // Returns the set of ops that do not have numerically-significant effects + // (i.e., they are always considered safe for execution in f16 precision), and + // can run in f16. + virtual gtl::FlatSet ClearList() = 0; + + protected: + // Adds or removes ops from list if certain environmental variables are set. + static void UpdateList(const string& list_name, gtl::FlatSet* list) { + CHECK(list_name == "WHITELIST" || list_name == "GRAYLIST" || // Crash OK. + list_name == "BLACKLIST" || list_name == "CLEARLIST"); + string add_env_var = + "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_ADD"; + string remove_env_var = + "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_REMOVE"; + string to_add, to_remove; + TF_CHECK_OK(ReadStringFromEnvVar(add_env_var, "", &to_add)); + TF_CHECK_OK(ReadStringFromEnvVar(remove_env_var, "", &to_remove)); for (const auto& x : str_util::Split(to_add, ",")) { list->insert(x); } @@ -35,6 +69,35 @@ class AutoMixedPrecisionLists { } } + // Subclasses should include these on the ClearList. + static void AddTensorListOps(gtl::FlatSet* list) { + // Note: if a data structure op (such as TensorListPopBack) is added here, + // IsTensorListReaderOp or IsTensorListWriterOp may need to be modified + constexpr char* tensor_list_ops[] = { + "TensorListConcat", + "TensorListConcatLists", + "TensorListConcatV2", + "TensorListGather", + "TensorListGetItem", + "TensorListPopBack", + "TensorListPushBack", + "TensorListPushBackBatch", + "TensorListFromTensor", + "TensorListScatter", + "TensorListScatterV2", + "TensorListScatterIntoExistingList", + "TensorListSetItem", + "TensorListSplit", + "TensorListStack" + }; + for (auto op : tensor_list_ops) { + list->insert(op); + } + } +}; + +class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { + private: static bool IsPseudoFastMath() { string optimization_level; TF_CHECK_OK( @@ -45,16 +108,10 @@ class AutoMixedPrecisionLists { } public: - // Returns the set of ops that are considered numerically-safe (for execution - // in fp16) and performance-critical. These ops are always converted to fp16. - static gtl::FlatSet WhiteList(int cuda_version, int cudnn_version) { - string to_add, to_remove; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_ADD", "", &to_add)); - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_REMOVE", "", - &to_remove)); + AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version) + : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {} + gtl::FlatSet WhiteList() override { auto list = gtl::FlatSet{ "BlockLSTM", "BlockLSTMV2", @@ -81,12 +138,12 @@ class AutoMixedPrecisionLists { // "DepthwiseConv2dNativeBackpropInput", "MatMul", }; - if (cuda_version >= 9010) { + if (cuda_version_ >= 9010) { // Fp16 BatchMatMul is slow before CUDA 9.1. list.insert("BatchMatMul"); list.insert("BatchMatMulV2"); } - if (cudnn_version >= 7602) { + if (cudnn_version_ >= 7602) { // Fp16 3D conv is slow before CUDNN 7.6.2. 
list.insert("Conv3D"); list.insert("Conv3DBackpropFilter"); @@ -94,22 +151,14 @@ class AutoMixedPrecisionLists { list.insert("Conv3DBackpropInput"); list.insert("Conv3DBackpropInputV2"); } - UpdateList(&list, to_add, to_remove); + UpdateList("WHITELIST", &list); return list; } - // Returns the set of ops that are considered numerically-safe (for execution - // in fp16), but which may be made unsafe by an upstream blacklist op. - static gtl::FlatSet GrayList() { + gtl::FlatSet GrayList() override { if (IsPseudoFastMath()) { return gtl::FlatSet{}; } - string to_add, to_remove; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_ADD", "", &to_add)); - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_REMOVE", "", - &to_remove)); auto list = gtl::FlatSet{ "Add", @@ -156,23 +205,14 @@ class AutoMixedPrecisionLists { "Tanh", "TanhGrad", }; - UpdateList(&list, to_add, to_remove); + UpdateList("GRAYLIST", &list); return list; } - // Returns the set of ops that are considered numerically-dangerous (i.e., - // unsafe for execution in fp16) and whose effects may also be observed in - // downstream nodes (e.g., in Exp -> Add, the Add is unsafe due to the Exp). - static gtl::FlatSet BlackList() { + gtl::FlatSet BlackList() override { if (IsPseudoFastMath()) { return gtl::FlatSet{}; } - string to_add, to_remove; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_ADD", "", &to_add)); - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_REMOVE", "", - &to_remove)); auto list = gtl::FlatSet{ "Exp", @@ -185,22 +225,14 @@ class AutoMixedPrecisionLists { "SparseSoftmaxCrossEntropyWithLogits", "Sum", }; - UpdateList(&list, to_add, to_remove); + UpdateList("BLACKLIST", &list); return list; } - // Returns the set of ops that do not have numerically-significant effects - // (i.e., they are always considered safe for execution in fp16 precision). - static gtl::FlatSet ClearList() { + gtl::FlatSet ClearList() override { if (IsPseudoFastMath()) { return gtl::FlatSet{}; } - string to_add, to_remove; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_ADD", "", &to_add)); - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_REMOVE", "", - &to_remove)); auto list = gtl::FlatSet{ "Abs", @@ -291,21 +323,6 @@ class AutoMixedPrecisionLists { "StridedSlice", "StridedSliceGrad", "Switch", - "TensorListConcat", - "TensorListConcatLists", - "TensorListConcatV2", - "TensorListGather", - "TensorListGetItem", - "TensorListPopBack", - "TensorListPushBack", - "TensorListPushBackBatch", - "TensorListFromTensor", - "TensorListScatter", - "TensorListScatterV2", - "TensorListScatterIntoExistingList", - "TensorListSetItem", - "TensorListSplit", - "TensorListStack", "Tile", "TopK", "TopKV2", @@ -313,7 +330,125 @@ class AutoMixedPrecisionLists { "Where", "ZerosLike", }; - UpdateList(&list, to_add, to_remove); + AddTensorListOps(&list); + UpdateList("CLEARLIST", &list); + return list; + } + + private: + int cuda_version_; + int cudnn_version_; +}; + +class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { + private: + + public: + AutoMixedPrecisionListsMkl() {} + + // Only ops which are supported by MKL in bfloat16 should be added to the + // white list, gray list, or clear list. 
+ gtl::FlatSet WhiteList() override { + auto list = gtl::FlatSet{ + "Conv2D", + "Conv2DBackpropFilter", + "Conv2DBackpropInput", + "Conv3D", + "Conv3DBackpropFilterV2", + "Conv3DBackpropInputV2", + "DepthwiseConv2dNative", + "DepthwiseConv2dNativeBackpropFilter", + "DepthwiseConv2dNativeBackpropInput", + "MatMul", + "BatchMatMul", + "BatchMatMulV2" + }; + + UpdateList("WHITELIST", &list); + return list; + } + + gtl::FlatSet GrayList() override { + auto list = gtl::FlatSet{ + "Add", + "AddN", + "AddV2", + "AvgPool", + "AvgPool3D", + "AvgPool3DGrad", + "AvgPoolGrad", + "BiasAdd", + "BiasAddGrad", + "BiasAddV1", + "FusedBatchNormV2", + "FusedBatchNormGradV2", + "FusedBatchNormV3", + "FusedBatchNormGradV3", + "LeakyRelu", + "LeakyReluGrad", + "Mul", + "Sub", + }; + UpdateList("GRAYLIST", &list); + return list; + } + + gtl::FlatSet BlackList() override { + auto list = gtl::FlatSet{ + "Exp", + "Expm1", + "L2Loss", + "Mean", + "Pow", + "SaveV2", + "Softmax", + "SoftmaxCrossEntropyWithLogits", + "SparseSoftmaxCrossEntropyWithLogits", + "Sum", + }; + UpdateList("BLACKLIST", &list); + return list; + } + + gtl::FlatSet ClearList() override { + auto list = gtl::FlatSet{ + "Concat", + "ConcatV2", + "Enter", + "EnsureShape", + "Equal", + "Exit", + "ExpandDims", + "Identity", + "MaxPool", + "MaxPool3D", + "MaxPool3DGrad", + "MaxPoolGrad", + "MaxPoolV2", + "Maximum", + "Merge", + "NextIteration", + "PreventGradient", + "Relu", + "Relu6", + "Relu6Grad", + "ReluGrad", + "Reshape", + "Select", + "SelectV2", + "Shape", + "ShapeN", + "Slice", + "Split", + "SplitV", + "Squeeze", + "StopGradient", + "Switch", + "Transpose", + "ZerosLike", + }; + AddTensorListOps(&list); + UpdateList("CLEARLIST", &list); return list; } }; diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 951279d37cd..248d8dd4266 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -13,12 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// Currently, this test only passes when TensorFlow passes with CUDA, because -// otherwise the optimizer will not turn clearlist nodes to float16. When -// looking at clearlist nodes, this optimizer checks if the nodes have a float16 -// GPU OpKernel, but without CUDA there are no GPU OpKernels at all. 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - #include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h" #include @@ -70,6 +64,31 @@ Tensor GenerateRandomTensorInRange(const TensorShape& shape, double minval, return tensor; } +void VerifyGraphsEquivalent(const GraphDef& original_graph, + const GraphDef& optimized_graph, + const string& func) { + EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func; + GraphView optimized_view(&optimized_graph); + for (int i = 0; i < original_graph.node_size(); ++i) { + const NodeDef& original = original_graph.node(i); + const NodeDef& optimized = *optimized_view.GetNode(original.name()); + EXPECT_EQ(original.name(), optimized.name()) << func; + EXPECT_EQ(original.op(), optimized.op()) << func; + EXPECT_EQ(original.input_size(), optimized.input_size()) << func; + if (original.input_size() == optimized.input_size()) { + for (int j = 0; j < original.input_size(); ++j) { + EXPECT_EQ(original.input(j), optimized.input(j)) << func; + } + } + } +} + +// Currently, this test suite only passes when TensorFlow passes with CUDA, +// because otherwise the optimizer will not turn clearlist nodes to float16. +// When looking at clearlist nodes, this optimizer checks if the nodes have a +// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all. +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + const std::pair kMinGPUArch = {7, 0}; class AutoMixedPrecisionTest : public GrapplerTest { @@ -184,25 +203,6 @@ class AutoMixedPrecisionTest : public GrapplerTest { bool gpu_available_; }; -void VerifyGraphsEquivalent(const GraphDef& original_graph, - const GraphDef& optimized_graph, - const string& func) { - EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func; - GraphView optimized_view(&optimized_graph); - for (int i = 0; i < original_graph.node_size(); ++i) { - const NodeDef& original = original_graph.node(i); - const NodeDef& optimized = *optimized_view.GetNode(original.name()); - EXPECT_EQ(original.name(), optimized.name()) << func; - EXPECT_EQ(original.op(), optimized.op()) << func; - EXPECT_EQ(original.input_size(), optimized.input_size()) << func; - if (original.input_size() == optimized.input_size()) { - for (int j = 0; j < original.input_size(); ++j) { - EXPECT_EQ(original.input(j), optimized.input(j)) << func; - } - } - } -} - TEST_F(AutoMixedPrecisionTest, NoOp) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.234f, {32}); @@ -1164,8 +1164,188 @@ TEST_F(AutoMixedPrecisionTest, TanhOp) { }); } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#if INTEL_MKL +#ifdef ENABLE_INTEL_MKL_BFLOAT16 + +class AutoMixedPrecisionMklTest : public GrapplerTest { + protected: + void SetUp() override { + virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); + TF_CHECK_OK(virtual_cluster_->Provision()); + } + void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } + + std::unique_ptr virtual_cluster_; +}; + +TEST_F(AutoMixedPrecisionMklTest, AlreadyBf16) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32}); + Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_BFLOAT16); + Output wht1 = ops::MatMul(s.WithOpName("wht1"), cst1, cst1); + Output clr1 = ops::Relu(s.WithOpName("clr1"), wht1); + Output cst2 = ops::Cast(s.WithOpName("cst2"), clr1, DT_FLOAT); + Output clr2 = ops::Relu(s.WithOpName("clr2"), cst2); + Output fetch = ops::Identity(s.WithOpName("fetch"), 
clr2); + + GrapplerItem item; + item.fetch = {"fetch"}; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + auto tensors_expected = EvaluateNodes(item.graph, item.fetch); + + AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL}; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); + VLOG(1) << output.DebugString(); + + VerifyGraphsEquivalent(item.graph, output, __FUNCTION__); + GraphView output_view(&output); + EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("cst1")->attr().at("DstT").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("cst2")->attr().at("SrcT").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("cst2")->attr().at("DstT").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT); + + auto tensors = EvaluateNodes(output, item.fetch); + EXPECT_EQ(tensors.size(), tensors_expected.size()); + EXPECT_EQ(tensors.size(), item.fetch.size()); + for (int i = 0; i < item.fetch.size(); ++i) { + test::ExpectTensorNear(tensors_expected[i], tensors[i], 1e-6); + } +} + +TEST_F(AutoMixedPrecisionMklTest, Simple) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); + Output blk1 = ops::Exp(s.WithOpName("blk1"), input); + Output clr1 = ops::Relu(s.WithOpName("clr1"), blk1); + Output gry1 = ops::Sqrt(s.WithOpName("gry1"), clr1); + Output clr2 = ops::Relu(s.WithOpName("clr2"), gry1); + Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr2, clr2); + Output clr3 = ops::Relu(s.WithOpName("clr3"), wht1); + Output blk2 = ops::Log(s.WithOpName("blk2"), clr3); + Output clr4 = ops::Relu(s.WithOpName("clr4"), blk2); + Output blk3 = ops::SparseMatMul(s.WithOpName("blk3"), clr4, clr4); + Output clr5 = ops::Relu(s.WithOpName("clr5"), blk3); + Output fetch = ops::Identity(s.WithOpName("fetch"), clr5); + + GrapplerItem item; + item.fetch = {"fetch"}; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + auto tensors_expected = EvaluateNodes(item.graph, item.fetch); + + AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL}; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); + + VLOG(1) << output.DebugString(); + + GraphView output_view(&output); + EXPECT_EQ(output.node_size(), item.graph.node_size() + 2); + EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("blk2")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Ta").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Tb").type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("clr5")->attr().at("T").type(), DT_FLOAT); + + auto tensors = EvaluateNodes(output, item.fetch); + EXPECT_EQ(tensors.size(), 
tensors_expected.size()); + EXPECT_EQ(tensors.size(), item.fetch.size()); + for (int i = 0; i < item.fetch.size(); ++i) { + test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4); + } +} + +TEST_F(AutoMixedPrecisionMklTest, TensorListSetGet) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + tensorflow::Input shape = {32, 32}; + auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); + Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); + Output idx1 = ops::Const(s.WithOpName("idx1"), 1); + Output idx2 = ops::Const(s.WithOpName("idx2"), 2); + Output idx3 = ops::Const(s.WithOpName("idx3"), 3); + auto tl1w1 = + ops::TensorListSetItem(s.WithOpName("tl1w1"), tl1.handle, idx1, input); + Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input); + auto tl1w2 = + ops::TensorListSetItem(s.WithOpName("tl1w2"), tl1.handle, idx2, wht1); + // Ensure that TensorListResize doesn't cause any problems. + Output tl1rs = + ops::TensorListResize(s.WithOpName("tl1rs"), tl1w2.output_handle, 6); + Output tl1r1 = ops::TensorListGetItem(s.WithOpName("tl1r1"), tl1rs, idx2, + shape, DT_FLOAT) + .item; + Output gry1 = ops::Mul(s.WithOpName("gry1"), tl1r1, tl1r1); + Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1); + auto tl1w3 = + ops::TensorListSetItem(s.WithOpName("tl1w3"), tl1.handle, idx3, wht2); + Output tl1r2 = + ops::TensorListGetItem(s.WithOpName("tl1r2"), tl1w3.output_handle, idx3, + shape, DT_FLOAT) + .item; + auto tl2 = ops::TensorListReserve(s.WithOpName("tl2"), shape, 8, DT_FLOAT); + auto tl2w1 = + ops::TensorListSetItem(s.WithOpName("tl2w1"), tl2.handle, idx1, input); + Output tl2r1 = + ops::TensorListGetItem(s.WithOpName("tl2r1"), tl2w1.output_handle, idx1, + shape, DT_FLOAT) + .item; + Output fetch1 = ops::Identity(s.WithOpName("fetch1"), tl1r2); + Output fetch2 = ops::Identity(s.WithOpName("fetch2"), tl2r1); + + GrapplerItem item; + item.fetch = {"fetch1", "fetch2"}; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + auto tensors_expected = EvaluateNodes(item.graph, item.fetch); + + AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL}; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); + + VLOG(1) << output.DebugString(); + + GraphView output_view(&output); + EXPECT_EQ(output.node_size(), item.graph.node_size() + 2); + const char* type_key = "element_dtype"; + EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(), + DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("tl1w1")->attr().at(type_key).type(), + DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("tl1w2")->attr().at(type_key).type(), + DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("tl1r1")->attr().at(type_key).type(), + DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("tl1w3")->attr().at(type_key).type(), + DT_BFLOAT16); + EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_FLOAT); + EXPECT_EQ(output_view.GetNode("tl2r1")->attr().at(type_key).type(), DT_FLOAT); + + auto tensors = EvaluateNodes(output, item.fetch); + EXPECT_EQ(tensors.size(), tensors_expected.size()); + EXPECT_EQ(tensors.size(), item.fetch.size()); + for (int i = 0; i < item.fetch.size(); ++i) { + test::ExpectClose(tensors_expected[i], 
tensors[i], -1, 1e-2); + } +} + +#endif // ENABLE_INTEL_MKL_BFLOAT16 +#endif // INTEL_MKL + } // namespace } // namespace grappler } // namespace tensorflow - -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index cd0d44e8e12..2f1c869965d 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -188,7 +188,9 @@ std::unique_ptr MetaOptimizer::MakeNewOptimizer( MK_OPT("remap", new Remapper(cfg_.remapping())); MK_OPT("layout", new GenericLayoutOptimizer()); MK_OPT("auto_mixed_precision", - new AutoMixedPrecision(cfg_.auto_mixed_precision())); + new AutoMixedPrecision(AutoMixedPrecisionMode::CUDA)); + MK_OPT("auto_mixed_precision_mkl", + new AutoMixedPrecision(AutoMixedPrecisionMode::MKL)); MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL)); MK_OPT("common_subgraph_elimination", new CommonSubgraphElimination(cfg_.common_subgraph_elimination())); @@ -249,7 +251,11 @@ Status MetaOptimizer::InitializeOptimizers( } if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision())) { optimizers->push_back( - MakeUnique(cfg_.auto_mixed_precision())); + MakeUnique(AutoMixedPrecisionMode::CUDA)); + } + if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision_mkl())) { + optimizers->push_back( + MakeUnique(AutoMixedPrecisionMode::MKL)); } if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) { optimizers->push_back(MakeUnique()); @@ -835,6 +841,7 @@ bool MetaOptimizerEnabled(const ConfigProto& cfg) { rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON || rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON || AutoMixedPrecisionEnabled(rewrite_cfg.auto_mixed_precision()) || + AutoMixedPrecisionEnabled(rewrite_cfg.auto_mixed_precision_mkl()) || !rewrite_cfg.optimizers().empty() || !rewrite_cfg.custom_optimizers().empty(); } diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto index 38c3ad7ae57..9520db92742 100644 --- a/tensorflow/core/protobuf/rewriter_config.proto +++ b/tensorflow/core/protobuf/rewriter_config.proto @@ -85,11 +85,15 @@ message RewriterConfig { // Enable the swap of kernel implementations based on the device placement // (default is ON). Toggle implementation_selector = 22; - // Optimize data types (default is OFF). - // e.g., This will try to use float16 on GPU which is faster. + // Optimize data types for CUDA (default is OFF). + // This will try to use float16 on GPU which is faster. // Note that this can change the numerical stability of the graph and may // require the use of loss scaling to maintain model convergence. Toggle auto_mixed_precision = 23; + // Optimize data types for MKL (default is OFF). + // This will try to use bfloat16 on CPUs, which is faster. + // Note that this can change the numerical stability of the graph. + Toggle auto_mixed_precision_mkl = 25; // Disable the entire meta optimizer (off by default). 
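For illustration, a minimal Python sketch (not part of this patch) of how a client would turn on the `auto_mixed_precision_mkl` toggle introduced above, mirroring what `_get_config()` does in the test changes below. bfloat16 keeps only 7 explicit mantissa bits (versus 23 for float32), which is why the field comment warns about numerical stability.

  from tensorflow.core.protobuf import config_pb2
  from tensorflow.core.protobuf import rewriter_config_pb2

  # Request the MKL bfloat16 auto mixed precision graph rewrite (field added above).
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      auto_mixed_precision_mkl=rewriter_config_pb2.RewriterConfig.ON)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(rewrite_options=rewrite_options))
  # `config` can then be passed to tf.compat.v1.Session(config=config).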
bool disable_meta_optimizer = 19; diff --git a/tensorflow/python/grappler/auto_mixed_precision_test.py b/tensorflow/python/grappler/auto_mixed_precision_test.py index 494f6fc78fc..0de12b9eca8 100644 --- a/tensorflow/python/grappler/auto_mixed_precision_test.py +++ b/tensorflow/python/grappler/auto_mixed_precision_test.py @@ -19,8 +19,8 @@ from __future__ import division from __future__ import print_function import os -import unittest +from absl.testing import parameterized import numpy as np from tensorflow.core.framework import types_pb2 @@ -209,7 +209,7 @@ def _make_node_with_color(color, input_tensor, name=None): if color == 'c': # Clear node return nn.relu(input_tensor, name=name) if color == 'b': # Black node - return math_ops.sqrt(math_ops.pow(input_tensor, 2.), name=name) + return math_ops.pow(math_ops.pow(input_tensor, 2.), 0.5, name=name) raise ValueError('Invalid node color: ' + str(color)) @@ -231,18 +231,21 @@ def _build_simple_loop_graph(inp_colors, body_colors, out_colors): return a -def _get_config(auto_mixed_precision=True): +def _get_config(auto_mixed_precision_mode): """Returns a ConfigProto with auto mixed precision enabled if appropriate.""" - if auto_mixed_precision: - rewrite_config = rewriter_config_pb2.RewriterConfig( - auto_mixed_precision=rewriter_config_pb2.RewriterConfig.ON, - # do not remove duplicated nodes - arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF) + rewrite_config = rewriter_config_pb2.RewriterConfig( + # do not remove duplicated nodes + arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF, + # do not turn Conv2D and other nodes into _FusedConv2D + remapping=rewriter_config_pb2.RewriterConfig.OFF, + ) + if auto_mixed_precision_mode == 'cuda': + rewrite_config.auto_mixed_precision = rewriter_config_pb2.RewriterConfig.ON + elif auto_mixed_precision_mode == 'mkl': + rewrite_config.auto_mixed_precision_mkl = ( + rewriter_config_pb2.RewriterConfig.ON) else: - rewrite_config = rewriter_config_pb2.RewriterConfig( - auto_mixed_precision=rewriter_config_pb2.RewriterConfig.OFF, - # do not remove duplicated nodes - arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF) + assert auto_mixed_precision_mode is None rewrite_config.min_graph_nodes = -1 graph_options = config_pb2.GraphOptions( rewrite_options=rewrite_config, build_cost_model=1) @@ -255,19 +258,33 @@ def _is_cast_to_fp16(node_name): return node_name.endswith('-CastToFp16-AutoMixedPrecision') +def _is_cast_to_bf16(node_name): + return node_name.endswith('-CastToBf16-AutoMixedPrecision') + + def _is_cast_to_fp32(node_name): return node_name.endswith('-CastToFp32-AutoMixedPrecision') -def _count_casts(nodes): +def _count_casts(mode, nodes): + """Counts the number of casts to f16 and fp32.""" num_to_fp16 = 0 + num_to_bf16 = 0 num_to_fp32 = 0 for node in nodes: if _is_cast_to_fp16(node.name): num_to_fp16 += 1 + if _is_cast_to_bf16(node.name): + num_to_bf16 += 1 elif _is_cast_to_fp32(node.name): num_to_fp32 += 1 - return num_to_fp16, num_to_fp32 + if mode == 'cuda': + assert num_to_bf16 == 0 + return num_to_fp16, num_to_fp32 + else: + assert mode == 'mkl' + assert num_to_fp16 == 0 + return num_to_bf16, num_to_fp32 def _build_node_map(nodes): @@ -303,7 +320,7 @@ def _example_noninlined_funcdef(features): return features * math_ops.sigmoid(features) -class AutoMixedPrecisionTest(test.TestCase): +class AutoMixedPrecisionTest(test.TestCase, parameterized.TestCase): """Tests the Grappler auto mixed precision optimizer.""" IGNORE_PERF_VAR = 
'TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE' @@ -311,8 +328,8 @@ class AutoMixedPrecisionTest(test.TestCase): def setUp(self): super(AutoMixedPrecisionTest, self).setUp() - # Enable the tests to be run on pre-Volta GPUs by telling the grappler pass - # to ignore performance and always transform the graph. + # Enable the CUDA tests to be run on pre-Volta GPUs by telling the grappler + # pass to ignore performance and always transform the graph. self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR) os.environ[self.IGNORE_PERF_VAR] = '1' @@ -323,24 +340,33 @@ class AutoMixedPrecisionTest(test.TestCase): del os.environ[self.IGNORE_PERF_VAR] super(AutoMixedPrecisionTest, self).tearDown() - def _assert_output_fp16(self, node_map, node_name, output_port=0): - self.assertEqual(node_map[node_name].output_info[output_port].dtype, - types_pb2.DT_HALF) + def _lower_precision_dtype(self, mode): + return dtypes.float16 if mode == 'cuda' else dtypes.bfloat16 - def _run(self, fetches): + def _assert_output_f16(self, mode, node_map, node_name, output_port=0): + self.assertEqual(node_map[node_name].output_info[output_port].dtype, + self._lower_precision_dtype(mode).as_datatype_enum) + + def _run(self, mode, fetches): """Runs the graph and returns the evaluation of the fetches.""" - with session.Session(config=_get_config(False)) as sess: + with session.Session(config=_get_config(None)) as sess: sess.run(variables.global_variables_initializer()) output_val_ref = self.evaluate(fetches) - with session.Session(config=_get_config()) as sess: + with session.Session(config=_get_config(mode)) as sess: sess.run(variables.global_variables_initializer()) metadata = config_pb2.RunMetadata() output_val = sess.run(fetches, run_metadata=metadata) return output_val_ref, output_val, metadata.cost_graph - def _run_simple_loop_test(self, inp, body, out): + def _maybe_skip(self, mode): + if mode == 'cuda' and not test.is_gpu_available(cuda_only=True): + self.skipTest('No GPU is available') + if mode == 'mkl' and not test_util.IsMklEnabled(): + self.skipTest('MKL is not enabled') + + def _run_simple_loop_test(self, mode, inp, body, out): """Runs a test of a simple loop. The loop has different node colors in different sections of the graph. The @@ -359,398 +385,441 @@ class AutoMixedPrecisionTest(test.TestCase): out: A string of letters indicating the colors and expected dtypes of the output nodes. 
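    For example, `_run_simple_loop_test(mode, 'W', 'C', 'C')` (as used by
    `test_propagation_through_simple_loop_1` below) builds a loop with one node
    per section and, because every letter is upper case, expects each node to be
    converted to the lower-precision type for the given mode (float16 for
    'cuda', bfloat16 for 'mkl'); lower-case letters mark nodes expected to stay
    float32.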
""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - expected_types = [] - for section in [inp, body, out]: - section_expected_types = [] - for color in section: - if color.isupper(): - expected_type = types_pb2.DT_HALF - else: - expected_type = types_pb2.DT_FLOAT - section_expected_types.append(expected_type) - expected_types.append(section_expected_types) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + expected_types = [] + for section in [inp, body, out]: + section_expected_types = [] + for color in section: + if color.isupper(): + expected_type = self._lower_precision_dtype(mode).as_datatype_enum + else: + expected_type = types_pb2.DT_FLOAT + section_expected_types.append(expected_type) + expected_types.append(section_expected_types) - a = _build_simple_loop_graph(inp, body, out) - output_val_ref, output_val, cost_graph = self._run(a) - node_map = _build_node_map(cost_graph.node) + a = _build_simple_loop_graph(inp, body, out) + output_val_ref, output_val, cost_graph = self._run(mode, a) + node_map = _build_node_map(cost_graph.node) - section_names = ['input', 'while/body', 'output'] - all_types_correct = True - for section_name, expected_types in zip(section_names, expected_types): - for i, expected_type in enumerate(expected_types): - node_name = section_name + '_%i' % i - output_port = 0 - optimized_type = node_map[node_name].output_info[output_port].dtype - if optimized_type != expected_type: - print('Expected node %s to have type %s but got type %s' % - (node_name, expected_type, optimized_type)) - all_types_correct = False - self.assertTrue(all_types_correct) + section_names = ['input', 'while/body', 'output'] + all_types_correct = True + for section_name, expected_types in zip(section_names, expected_types): + for i, expected_type in enumerate(expected_types): + node_name = section_name + '_%i' % i + output_port = 0 + optimized_type = node_map[node_name].output_info[output_port].dtype + if optimized_type != expected_type: + print('Expected node %s to have type %s but got type %s' % + (node_name, expected_type, optimized_type)) + all_types_correct = False + self.assertTrue(all_types_correct) + if mode == 'mkl': + self.assertAllClose(output_val_ref, output_val, atol=2e-2, rtol=2e-2) + else: self.assertAllClose(output_val_ref, output_val, atol=2e-3, rtol=1e-3) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_conv_bn(self): + def test_conv_bn(self, mode): """Test graph with convolution followed by batch norm.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 1]) - x = _conv_bn(x) - output = _conv_bn(x) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 1]) + x = _conv_bn(x) + output = _conv_bn(x) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) - num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) + num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node) - self._assert_output_fp16(node_map, 'Conv2D') - self._assert_output_fp16(node_map, 'FusedBatchNormV3') - self._assert_output_fp16(node_map, 'Conv2D_1') - self.assertEqual(num_to_fp16, - 3) # Before Conv2D:0, Conv2D:1, Conv2D_1:1 - self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0 + self._assert_output_f16(mode, node_map, 
'Conv2D') + self._assert_output_f16(mode, node_map, 'FusedBatchNormV3') + self._assert_output_f16(mode, node_map, 'Conv2D_1') + self.assertEqual(num_to_f16, 3) # Before Conv2D:0, Conv2D:1, Conv2D_1:1 + self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0 + if mode == 'mkl': + tol = 1e-2 + elif test.is_built_with_rocm(): # Bump up the tolerance for the ROCm platform # The default tolerance (1e-3) results in a tiny fraction (<1%) of # miscompares on ROCm platform, and hence the tolerance bump - tol = 2e-3 if test.is_built_with_rocm else 1e-3 - self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + tol = 2e-3 + else: + tol = 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) - # TODO: enable these tests when cuDNN is upgraded to >= 7.6.2. Same with the - # test_conv3d() below. - @unittest.skip('Test case should be skipped when cuDNN < 7.6.2') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_conv3d_bn(self): + def test_conv3d_bn(self, mode): """Test graph with convolution followed by batch norm.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 8, 1]) - x = _conv3d_bn(x) - output = _conv3d_bn(x) + self._maybe_skip(mode) + if mode == 'cuda': + # TODO: enable these tests when cuDNN is upgraded to >= 7.6.2. + self.skipTest('Test case should be skipped when cuDNN < 7.6.2') + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 8, 1]) + x = _conv3d_bn(x) + output = _conv3d_bn(x) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) - num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) + num_to_fp16, num_to_fp32 = _count_casts(mode, cost_graph.node) - self._assert_output_fp16(node_map, 'Conv3D') - self._assert_output_fp16(node_map, 'FusedBatchNormV3') - self._assert_output_fp16(node_map, 'Conv3D_1') - self.assertEqual(num_to_fp16, 3) # Before Conv3D:0, Conv3D:1, Conv3D_1:1 - self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0 - self.assertAllClose(output_val_ref, output_val, atol=1e-2, rtol=1e-2) + self._assert_output_f16(mode, node_map, 'Conv3D') + self._assert_output_f16(mode, node_map, 'FusedBatchNormV3') + self._assert_output_f16(mode, node_map, 'Conv3D_1') + self.assertEqual(num_to_fp16, 3) # Before Conv3D:0, Conv3D:1, Conv3D_1:1 + self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0 + self.assertAllClose(output_val_ref, output_val, atol=1e-2, rtol=1e-2) - @unittest.skip('Test case should be skipped when cuDNN < 7.6.2') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_conv3d(self): + def test_conv3d(self, mode): """Test grad ops with convolution3d graph.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 8, 1]) - f = _weight([3, 3, 3, 1, 6]) - y = _conv3d(x, f) - y = array_ops.identity(y) - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(y, [x, f]) - output = (y, g) + self._maybe_skip(mode) + if mode == 'cuda': + # TODO: enable these tests when cuDNN is upgraded to >= 7.6.2. 
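      # Note that with @parameterized.parameters(['cuda', 'mkl']) each mode is
      # generated as its own test case, so this skip only drops the 'cuda'
      # variant; the 'mkl' variant produced from the same method still runs.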
+ self.skipTest('Test case should be skipped when cuDNN < 7.6.2') + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 8, 1]) + f = _weight([3, 3, 3, 1, 6]) + y = _conv3d(x, f) + y = array_ops.identity(y) + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(y, [x, f]) + output = (y, g) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'Conv3D') - self._assert_output_fp16(node_map, - 'gradients/Conv3D_grad/Conv3DBackpropInputV2') - self._assert_output_fp16(node_map, - 'gradients/Conv3D_grad/Conv3DBackpropFilterV2') + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) + self._assert_output_f16(mode, node_map, 'Conv3D') + self._assert_output_f16(mode, node_map, + 'gradients/Conv3D_grad/Conv3DBackpropInputV2') + self._assert_output_f16(mode, node_map, + 'gradients/Conv3D_grad/Conv3DBackpropFilterV2') - output_val_ref, output_val, cost_graph = self._run(output) - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + output_val_ref, output_val, cost_graph = self._run(mode, output) + tol = 5e-2 if mode == 'mkl' else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + # TODO(reedwm): Fix and enable this test with MKL. Currently this crashes with + # MKL + @parameterized.parameters(['cuda']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_conv_bn_dropout(self): + def test_conv_bn_dropout(self, mode): """Test dropout precision of convolution batch norm graph.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 1]) - y = _conv_bn(x) - y = nn.dropout(y, rate=0.5) - y = math_ops.add(y, 1, name='addition') - y = _conv_bn(y) - y = array_ops.identity(y) - optimizer = gradient_descent.GradientDescentOptimizer( - learning_rate=0.01) - g = optimizer.compute_gradients(y, [x]) - output = (y, g) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 1]) + y = _conv_bn(x) + y = nn.dropout(y, rate=0.5) + y = math_ops.add(y, 1, name='addition') + y = _conv_bn(y) + y = array_ops.identity(y) + optimizer = gradient_descent.GradientDescentOptimizer( + learning_rate=0.01) + g = optimizer.compute_gradients(y, [x]) + output = (y, g) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'Conv2D') - self._assert_output_fp16(node_map, 'FusedBatchNormV3') - # We do not assert dropout's dtype because we do not want to rely on the - # node names of dropout's internal implementation. - self._assert_output_fp16(node_map, 'addition') - self._assert_output_fp16(node_map, 'Conv2D_1') + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) + self._assert_output_f16(mode, node_map, 'Conv2D') + self._assert_output_f16(mode, node_map, 'FusedBatchNormV3') + # We do not assert dropout's dtype because we do not want to rely on the + # node names of dropout's internal implementation. 
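    # Instead, the ops immediately surrounding the dropout block are asserted:
    # the FusedBatchNormV3 above and the explicitly named 'addition' op below,
    # which still pins down the dtype on both sides of dropout.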
+ self._assert_output_f16(mode, node_map, 'addition') + self._assert_output_f16(mode, node_map, 'Conv2D_1') - output_val_ref, output_val, cost_graph = self._run(output) - # Bump up the tolerance for the ROCm platform - # The default tolerance (1e-3) results in a tiny fraction (<1%) of - # miscompares on ROCm platform, and hence the tolerance bump - tol = 2e-3 if test.is_built_with_rocm else 1e-3 - self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + output_val_ref, output_val, cost_graph = self._run(mode, output) + # Bump up the tolerance for the ROCm platform + # The default tolerance (1e-3) results in a tiny fraction (<1%) of + # miscompares on ROCm platform, and hence the tolerance bump + tol = 2e-3 if test.is_built_with_rocm else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + # TODO(reedwm): Fix and enable this test with MKL. Currently this crashes with + # MKL + @parameterized.parameters(['cuda']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_conv_pool(self): + def test_conv_pool(self, mode): """Test graph with convolution followed by pooling.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 1]) - output = _conv_pool(x) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 1]) + output = _conv_pool(x) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) - num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) + num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node) - self._assert_output_fp16(node_map, 'Conv2D') - self._assert_output_fp16(node_map, 'Relu') - self._assert_output_fp16(node_map, 'MaxPool') - self._assert_output_fp16(node_map, 'Conv2D_1') - self.assertEqual(num_to_fp16, 4) - self.assertEqual(num_to_fp32, 1) - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'Conv2D') + self._assert_output_f16(mode, node_map, 'Relu') + self._assert_output_f16(mode, node_map, 'MaxPool') + self._assert_output_f16(mode, node_map, 'Conv2D_1') + self.assertEqual(num_to_f16, 4) + self.assertEqual(num_to_fp32, 1) + tol = 5e-3 if mode == 'mkl' else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_simple_loop(self): + def test_simple_loop(self, mode): """Test graph with while loop.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([8, 8]) - y = _simple_loop(x, _matmul_act)[1] - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(y, [x]) - output = (y, g) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([8, 8]) + y = _simple_loop(x, _matmul_act)[1] + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(y, [x]) + output = (y, g) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'while/MatMul') - self._assert_output_fp16(node_map, 
'while/Relu') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'while/MatMul') + self._assert_output_f16(mode, node_map, 'while/Relu') + tol = 1e-2 if mode == 'mkl' else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_loop_with_vars_intertwined(self): + def test_loop_with_vars_intertwined(self, mode): """Test graph with intertwined while loops.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([8, 8]) - _, _, k, l = _loop_vars_intertwined( - array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act) - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(k, [x]) - output = (k, l, g) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([8, 8]) + _, _, k, l = _loop_vars_intertwined( + array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act) + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(k, [x]) + output = (k, l, g) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'while/MatMul') - self._assert_output_fp16(node_map, 'while/Relu') - self._assert_output_fp16(node_map, 'while/MatMul_1') - self._assert_output_fp16(node_map, 'while/Relu_1') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'while/MatMul') + self._assert_output_f16(mode, node_map, 'while/Relu') + self._assert_output_f16(mode, node_map, 'while/MatMul_1') + self._assert_output_f16(mode, node_map, 'while/Relu_1') + tol = 5e-3 if mode == 'mkl' else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + @parameterized.parameters(['cuda']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_multi_paths(self): + def test_multi_paths(self, mode): """Test graph with multiple paths.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([2, 8, 8, 3]) - x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3) - y1 = _conv_pool(x1) - y2 = _conv_pool(x2) - y3 = _conv_pool(x3) - y = array_ops.concat([y1, y2, y3], axis=3) - y = array_ops.identity(y) - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(y, [x]) - output = (y, g) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([2, 8, 8, 3]) + x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3) + y1 = _conv_pool(x1) + y2 = _conv_pool(x2) + y3 = _conv_pool(x3) + y = array_ops.concat([y1, y2, y3], axis=3) + y = array_ops.identity(y) + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(y, [x]) + output = (y, g) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'split') - for suffix in [''] + ['_%i' % i for i in range(1, 6)]: - 
self._assert_output_fp16(node_map, 'Conv2D' + suffix) - self._assert_output_fp16(node_map, 'Relu' + suffix) - self._assert_output_fp16(node_map, 'MaxPool' + suffix) - self._assert_output_fp16(node_map, 'concat') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'split') + for suffix in [''] + ['_%i' % i for i in range(1, 6)]: + self._assert_output_f16(mode, node_map, 'Conv2D' + suffix) + self._assert_output_f16(mode, node_map, 'Relu' + suffix) + self._assert_output_f16(mode, node_map, 'MaxPool' + suffix) + self._assert_output_f16(mode, node_map, 'concat') + self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_multi_paths_2(self): + def test_multi_paths_2(self, mode): """Test graph with multiple paths.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([8, 8]) - y1 = _matmul_act(x) - y2 = _matmul_act(x) - y = y1 + y2 + x - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(y, [x]) - output = (g, y) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([8, 8]) + y1 = _matmul_act(x) + y2 = _matmul_act(x) + y = y1 + y2 + x + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(y, [x]) + output = (g, y) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'MatMul') - self._assert_output_fp16(node_map, 'Relu') - self._assert_output_fp16(node_map, 'MatMul_1') - self._assert_output_fp16(node_map, 'Relu_1') + self._assert_output_f16(mode, node_map, 'MatMul') + self._assert_output_f16(mode, node_map, 'Relu') + self._assert_output_f16(mode, node_map, 'MatMul_1') + self._assert_output_f16(mode, node_map, 'Relu_1') + if mode == 'mkl': + tol = 2e-2 + elif test.is_built_with_rocm(): # Bump up the tolerance for the ROCm platform # The default tolerance (1e-3) results in a tiny fraction (<1%) of # miscompares on ROCm platform, and hence the tolerance bump - tol = 2e-3 if test.is_built_with_rocm else 1e-3 - self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + tol = 2e-3 + else: + tol = 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + @parameterized.parameters(['cuda']) # MKL doesn't support bf16 Sigmoid @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_recurrent_lstm(self): + def test_recurrent_lstm(self, mode): """Test graph with recurrent lstm.""" - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - init_c = _input([8, 4]) - init_h = _input([8, 4]) - _, _, h, _ = _recurrent_lstm(init_c, init_h) - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(h, [init_c, init_h]) - output = (h, g) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + init_c = _input([8, 4]) + init_h = _input([8, 4]) + _, _, h, _ = _recurrent_lstm(init_c, init_h) + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(h, [init_c, init_h]) + output = (h, g) - output_val_ref, output_val, cost_graph = 
self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'while/concat') - self._assert_output_fp16(node_map, 'while/MatMul') - self._assert_output_fp16(node_map, 'while/split') - self._assert_output_fp16(node_map, 'while/Sigmoid') - self._assert_output_fp16(node_map, 'while/Sigmoid_1') - self._assert_output_fp16(node_map, 'while/Sigmoid_2') - self._assert_output_fp16(node_map, 'while/Tanh') - self._assert_output_fp16(node_map, 'while/Tanh_1') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'while/concat') + self._assert_output_f16(mode, node_map, 'while/MatMul') + self._assert_output_f16(mode, node_map, 'while/split') + self._assert_output_f16(mode, node_map, 'while/Sigmoid') + self._assert_output_f16(mode, node_map, 'while/Sigmoid_1') + self._assert_output_f16(mode, node_map, 'while/Sigmoid_2') + self._assert_output_f16(mode, node_map, 'while/Tanh') + self._assert_output_f16(mode, node_map, 'while/Tanh_1') + self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('v1 loop test') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_1(self): - self._run_simple_loop_test('W', 'C', 'C') + def test_propagation_through_simple_loop_1(self, mode): + self._run_simple_loop_test(mode, 'W', 'C', 'C') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('v1 loop test') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_2(self): - self._run_simple_loop_test('C', 'C', 'W') + def test_propagation_through_simple_loop_2(self, mode): + self._run_simple_loop_test(mode, 'C', 'C', 'W') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('v1 loop test') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_3(self): - self._run_simple_loop_test('W', 'G', 'W') + def test_propagation_through_simple_loop_3(self, mode): + self._run_simple_loop_test(mode, 'W', 'G', 'W') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('v1 loop test') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_4(self): - self._run_simple_loop_test('W', 'gbg', 'W') + def test_propagation_through_simple_loop_4(self, mode): + self._run_simple_loop_test(mode, 'W', 'gbg', 'W') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_5(self): - self._run_simple_loop_test('b', 'gWC', 'c') + def test_propagation_through_simple_loop_5(self, mode): + self._run_simple_loop_test(mode, 'b', 'gWC', 'c') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_6(self): - self._run_simple_loop_test('b', 'CWCG', 'C') + def test_propagation_through_simple_loop_6(self, mode): + self._run_simple_loop_test(mode, 'b', 'CWCG', 'C') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_7(self): - self._run_simple_loop_test('C', 'GWCG', 'C') + 
def test_propagation_through_simple_loop_7(self, mode): + self._run_simple_loop_test(mode, 'C', 'GWCG', 'C') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_v1_only('b/138749235') @test_util.disable_xla('This test does not pass with XLA') - def test_propagation_through_simple_loop_8(self): - self._run_simple_loop_test('C', 'CgbgWC', 'g') + def test_propagation_through_simple_loop_8(self, mode): + self._run_simple_loop_test(mode, 'C', 'CgbgWC', 'g') + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_noninlined_funcdef(self): + def test_noninlined_funcdef(self, mode): """Test graph with non-inlined function subgraph. This requires the grappler pass to handle an OpDef that only appears in the graph's function registry instead of the global op registry. """ - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(0) - x = _input([8, 8]) - y = _matmul_act(x) - y = _example_noninlined_funcdef(y) - optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) - g = optimizer.compute_gradients(y, [x]) - output = (g, y) + self._maybe_skip(mode) + random_seed.set_random_seed(0) + x = _input([8, 8]) + y = _matmul_act(x) + y = _example_noninlined_funcdef(y) + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01) + g = optimizer.compute_gradients(y, [x]) + output = (g, y) - output_val_ref, output_val, cost_graph = self._run(output) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, output) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'MatMul') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'MatMul') + tol = 1e-2 if mode == 'mkl' else 1e-3 + self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) + @parameterized.parameters(['cuda', 'mkl']) @test_util.run_deprecated_v1 @test_util.disable_xla('This test does not pass with XLA') - def test_ingraph_train_loop(self): + def test_ingraph_train_loop(self, mode): """Tests a graph containing a while loop around a training update. This requires the grappler pass to take special care with its handling of Enter ops that appear in front of reads from non-resource variables. See the use of NodeImplicitlyReadsVariable in auto_mixed_precision.cc. """ + self._maybe_skip(mode) if tf2.enabled(): # This test tests non-resource variables, which are only used in TF1. 
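      # With TF2 behavior enabled, variables are resource variables by default,
      # so the pattern described in the docstring above, a while loop reading a
      # non-resource variable through an Enter op, does not arise here.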
self.skipTest('TensorFlow 1 required') - if test.is_gpu_available(cuda_only=True): - random_seed.set_random_seed(1234) - np.random.seed(1234) - num_iter, bs, nchan, nclass = 100, 64, 32, 100 + random_seed.set_random_seed(1234) + np.random.seed(1234) + num_iter, bs, nchan, nclass = 100, 64, 32, 100 - data = np.random.normal(size=(bs * num_iter, nchan)).astype(np.float32) - labels = np.random.randint(nclass, size=(bs * num_iter,)) - ds = dataset_ops.Dataset.from_tensor_slices((data, labels)) - ds = ds.batch(bs).prefetch(3) - it = ds.make_one_shot_iterator() + data = np.random.normal(size=(bs * num_iter, nchan)).astype(np.float32) + labels = np.random.randint(nclass, size=(bs * num_iter,)) + ds = dataset_ops.Dataset.from_tensor_slices((data, labels)) + ds = ds.batch(bs).prefetch(3) + it = ds.make_one_shot_iterator() - def body(_, i): - i += 1 - x, yt = it.get_next() - dense = layers.Dense(nclass) - y = dense(x) - loss = losses.sparse_softmax_cross_entropy(yt, y) - opt = adam.AdamOptimizer() - train_op = opt.minimize(loss, var_list=dense.trainable_weights) - with ops.control_dependencies([train_op]): - loss = array_ops.identity(loss) - return loss, i + def body(_, i): + i += 1 + x, yt = it.get_next() + dense = layers.Dense(nclass) + y = dense(x) + loss = losses.sparse_softmax_cross_entropy(yt, y) + opt = adam.AdamOptimizer() + train_op = opt.minimize(loss, var_list=dense.trainable_weights) + with ops.control_dependencies([train_op]): + loss = array_ops.identity(loss) + return loss, i - begin, end = constant_op.constant(0), constant_op.constant(num_iter) - loss, _ = control_flow_ops.while_loop( - lambda loss, i: math_ops.less(i, end), body, [0.0, begin]) + begin, end = constant_op.constant(0), constant_op.constant(num_iter) + loss, _ = control_flow_ops.while_loop( + lambda loss, i: math_ops.less(i, end), body, [0.0, begin]) - output_val_ref, output_val, cost_graph = self._run(loss) - node_map = _build_node_map(cost_graph.node) + output_val_ref, output_val, cost_graph = self._run(mode, loss) + node_map = _build_node_map(cost_graph.node) - self._assert_output_fp16(node_map, 'while/dense/MatMul') - self._assert_output_fp16( - node_map, 'while/gradients/while/dense/MatMul_grad/MatMul_1') - self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) + self._assert_output_f16(mode, node_map, 'while/dense/MatMul') + self._assert_output_f16( + mode, node_map, 'while/gradients/while/dense/MatMul_grad/MatMul_1') + self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3) # TODO(benbarsdell): Add tests for list ops (TensorList*) that pass through # graph source/sink nodes, similar to the TensorListThroughFunction C++ test. From ebe063eb74ab1ee80bfb0d4447fb7c842e0ca27b Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Thu, 18 Jun 2020 12:38:59 -0700 Subject: [PATCH 0521/1390] Create an empty test for python_op_gen. PiperOrigin-RevId: 317159004 Change-Id: Iabc4810f9c51c257d62dc3e7bd20d96131939d5d --- tensorflow/python/BUILD | 15 ++++++- tensorflow/python/framework/python_op_gen.cc | 16 ++++--- tensorflow/python/framework/python_op_gen.h | 14 ++++++- .../python/framework/python_op_gen_test.cc | 42 +++++++++++++++++++ 4 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 tensorflow/python/framework/python_op_gen_test.cc diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a4e72bf2460..de9cf9a24c7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3,7 +3,7 @@ # ":platform" - Low-level and platform-specific Python code. 
load("//tensorflow:tensorflow.bzl", "py_strict_library") -load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "if_not_windows", "if_xla_available", "py_test", "py_tests", "tf_cc_shared_object", "tf_cuda_library", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "if_not_windows", "if_xla_available", "py_test", "py_tests", "tf_cc_shared_object", "tf_cc_test", "tf_cuda_library", "tf_gen_op_wrapper_py") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_monitoring_python_deps") @@ -1236,6 +1236,19 @@ cc_library( alwayslink = 1, ) +tf_cc_test( + name = "python_op_gen_test", + srcs = ["framework/python_op_gen_test.cc"], + deps = [ + ":python_op_gen", + "//tensorflow/core:framework", + "//tensorflow/core:op_gen_lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + py_library( name = "framework_for_generated_wrappers", srcs_version = "PY2AND3", diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index ca0c5d9ef1a..0f84c6a063d 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -981,9 +981,9 @@ void GenEagerPythonOp::AddRawOpExport(const string& parameters) { function_name_, "))\n"); } -string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs, - const std::vector& hidden_ops, - const string& source_file_name = "") { +string GetPythonOpsImpl(const OpList& ops, const ApiDefMap& api_defs, + const std::vector& hidden_ops, + const string& source_file_name = "") { string result; // Header // TODO(josh11b): Mention the library for which wrappers are being generated. @@ -1069,11 +1069,17 @@ from tensorflow.python.util.tf_export import tf_export } // namespace +string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs, + const std::vector& hidden_ops, + const string& source_file_name) { + return GetPythonOpsImpl(ops, api_defs, hidden_ops, source_file_name); +} + void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs, const std::vector& hidden_ops, const string& source_file_name) { printf("%s", - GetPythonOps(ops, api_defs, hidden_ops, source_file_name).c_str()); + GetPythonOpsImpl(ops, api_defs, hidden_ops, source_file_name).c_str()); } string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) { @@ -1081,7 +1087,7 @@ string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) { ops.ParseFromArray(op_list_buf, op_list_len); ApiDefMap api_def_map(ops); - return GetPythonOps(ops, api_def_map, {}); + return GetPythonOpsImpl(ops, api_def_map, {}); } } // namespace tensorflow diff --git a/tensorflow/python/framework/python_op_gen.h b/tensorflow/python/framework/python_op_gen.h index 22fcc452fbb..f1cd6e49013 100644 --- a/tensorflow/python/framework/python_op_gen.h +++ b/tensorflow/python/framework/python_op_gen.h @@ -23,8 +23,20 @@ limitations under the License. namespace tensorflow { +// Returns a string containing the generated Python code for the given Ops. +// ops is a protobuff, typically generated using OpRegistry::Global()->Export. +// api_defs is typically constructed directly from ops. // hidden_ops should be a list of Op names that should get a leading _ -// in the output. Prints the output to stdout. +// in the output. +// source_file_name is optional and contains the name of the original C++ source +// file where the ops' REGISTER_OP() calls reside. 
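// As a rough illustration (the module and function names below follow the usual
// generated-wrapper conventions and are not defined in this header), the returned
// text is what the gen_*_ops Python modules contain, e.g.:
//   from tensorflow.python.ops import gen_math_ops
//   gen_math_ops.mat_mul(a, b)   # wrapper function emitted for the MatMul op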
+string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs, + const std::vector& hidden_ops, + const string& source_file_name); + +// Prints the output of GetPrintOps to stdout. +// hidden_ops should be a list of Op names that should get a leading _ +// in the output. // Optional fourth argument is the name of the original C++ source file // where the ops' REGISTER_OP() calls reside. void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs, diff --git a/tensorflow/python/framework/python_op_gen_test.cc b/tensorflow/python/framework/python_op_gen_test.cc new file mode 100644 index 00000000000..5185086fdd3 --- /dev/null +++ b/tensorflow/python/framework/python_op_gen_test.cc @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/python/framework/python_op_gen.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_gen_lib.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(PythonOpGen, Basic) { + OpList ops; + OpRegistry::Global()->Export(false, &ops); + + ApiDefMap api_def_map(ops); + + string code = GetPythonOps(ops, api_def_map, {}, ""); + + EXPECT_TRUE(absl::StrContains(code, "def case")); + + // TODO(mdan): Add tests to verify type annotations are correctly added. +} + +// TODO(mdan): Include more tests with synhtetic ops and api defs. + +} // namespace +} // namespace tensorflow From b0418130b440fa63b588390f37d0ab21b1c4731c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 12:40:15 -0700 Subject: [PATCH 0522/1390] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 317159284 Change-Id: I438a0e11d3e678fd3d5bc153a5b0ff353e0fe86a --- tensorflow/go/op/wrappers.go | 794 +++++++++++++++++------------------ 1 file changed, 397 insertions(+), 397 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 188bb3b78bb..ace8e58fdcd 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -158,65 +158,6 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t return op.Output(0) } -// Applies sparse addition to `input` using individual values or slices -// -// from `updates` according to indices `indices`. The updates are non-aliasing: -// `input` is only modified in-place if no other operations will use it. -// Otherwise, a copy of `input` is made. This operation has a gradient with -// respect to both `input` and `updates`. -// -// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. -// -// `indices` must be integer tensor, containing indices into `input`. -// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`. 
-// -// The innermost dimension of `indices` (with length `K`) corresponds to -// indices into elements (if `K = P`) or `(P-K)`-dimensional slices -// (if `K < P`) along the `K`th dimension of `input`. -// -// `updates` is `Tensor` of rank `Q-1+P-K` with shape: -// -// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$ -// -// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8 -// elements. In Python, that addition would look like this: -// -// input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8]) -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// output = tf.scatter_nd_non_aliasing_add(input, indices, updates) -// with tf.Session() as sess: -// print(sess.run(output)) -// -// The resulting value `output` would look like this: -// -// [1, 13, 3, 14, 14, 6, 7, 20] -// -// See `tf.scatter_nd` for more details about how to make updates to slices. -// -// Arguments: -// input: A Tensor. -// indices: A Tensor. Must be one of the following types: `int32`, `int64`. -// A tensor of indices into `input`. -// updates: A Tensor. Must have the same type as ref. A tensor of updated values -// to add to `input`. -// -// Returns A `Tensor` with the same shape as `input`, containing values of `input` -// updated with `updates`. -func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ScatterNdNonAliasingAdd", - Input: []tf.Input{ - input, indices, updates, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Subtracts sparse `updates` from an existing tensor according to `indices`. // // This operation creates a new tensor by subtracting sparse `updates` from the @@ -13210,115 +13151,6 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op return op.Output(0) } -// A placeholder op for a value that will be fed into the computation. -// -// Arguments: -// dtype: The type of elements in the tensor. -// shape: The shape of the tensor. -// -// Returns A tensor that will be provided using the infeed mechanism. -func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype, "shape": shape} - opspec := tf.OpSpec{ - Type: "InfeedDequeue", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Encodes a `RaggedTensor` into a `variant` Tensor. -// -// -// Encodes the given `RaggedTensor` and returns a `variant` Tensor. If -// `batched_input` is True, then input `RaggedTensor` is unbatched along the -// zero-th dimension, each component `RaggedTensor` is encoded into a scalar -// `variant` Tensor, and these are stacked to return a 1-D `variant` Tensor. -// If `batched_input` is False, then the input `RaggedTensor` is encoded as is and -// a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first -// creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the -// splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor -// is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the -// corresponding decoding logic. -// -// -// Arguments: -// rt_nested_splits: A list of one or more Tensors representing the splits of the input -// `RaggedTensor`. -// rt_dense_values: A Tensor representing the values of the input `RaggedTensor`. 
-// batched_input: A `bool` denoting whether the input is a batched `RaggedTensor`. -// -// Returns A `variant` Tensor that containing encoded `RaggedTensor`. -func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output, batched_input bool) (encoded_ragged tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"batched_input": batched_input} - opspec := tf.OpSpec{ - Type: "RaggedTensorToVariant", - Input: []tf.Input{ - tf.OutputList(rt_nested_splits), rt_dense_values, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor. -type ResizeNearestNeighborAttr func(optionalAttr) - -// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Resize `images` to `size` using nearest neighbor interpolation. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeNearestNeighbor", - Input: []tf.Input{ - images, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Runs multiple additive regression ensemble predictors on input instances and // // computes the logits. It is designed to be used during prediction. @@ -15836,6 +15668,65 @@ func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { return op.Output(0) } +// Applies sparse addition to `input` using individual values or slices +// +// from `updates` according to indices `indices`. The updates are non-aliasing: +// `input` is only modified in-place if no other operations will use it. +// Otherwise, a copy of `input` is made. This operation has a gradient with +// respect to both `input` and `updates`. +// +// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// +// `indices` must be integer tensor, containing indices into `input`. +// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`. +// +// The innermost dimension of `indices` (with length `K`) corresponds to +// indices into elements (if `K = P`) or `(P-K)`-dimensional slices +// (if `K < P`) along the `K`th dimension of `input`. 
+// +// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// +// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$ +// +// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8 +// elements. In Python, that addition would look like this: +// +// input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8]) +// indices = tf.constant([[4], [3], [1], [7]]) +// updates = tf.constant([9, 10, 11, 12]) +// output = tf.scatter_nd_non_aliasing_add(input, indices, updates) +// with tf.Session() as sess: +// print(sess.run(output)) +// +// The resulting value `output` would look like this: +// +// [1, 13, 3, 14, 14, 6, 7, 20] +// +// See `tf.scatter_nd` for more details about how to make updates to slices. +// +// Arguments: +// input: A Tensor. +// indices: A Tensor. Must be one of the following types: `int32`, `int64`. +// A tensor of indices into `input`. +// updates: A Tensor. Must have the same type as ref. A tensor of updated values +// to add to `input`. +// +// Returns A `Tensor` with the same shape as `input`, containing values of `input` +// updated with `updates`. +func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ScatterNdNonAliasingAdd", + Input: []tf.Input{ + input, indices, updates, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2. type MutableHashTableOfTensorsV2Attr func(optionalAttr) @@ -37168,235 +37059,6 @@ func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) { return op.Output(0) } -// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample. -type ParseSequenceExampleAttr func(optionalAttr) - -// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["Ncontext_sparse"] = value - } -} - -// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["Ncontext_dense"] = value - } -} - -// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["Nfeature_list_sparse"] = value - } -} - -// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["Nfeature_list_dense"] = value - } -} - -// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value. -// -// value: A list of Ncontext_sparse types; the data types of data in -// each context Feature given in context_sparse_keys. -// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), -// DT_INT64 (Int64List), and DT_STRING (BytesList). 
-// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["context_sparse_types"] = value - } -} - -// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["feature_list_dense_types"] = value - } -} - -// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value. -// -// value: A list of Ncontext_dense shapes; the shapes of data in -// each context Feature given in context_dense_keys. -// The number of elements in the Feature corresponding to context_dense_key[j] -// must always equal context_dense_shapes[j].NumEntries(). -// The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["context_dense_shapes"] = value - } -} - -// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value. -// -// value: A list of Nfeature_list_sparse types; the data types -// of data in each FeatureList given in feature_list_sparse_keys. -// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), -// DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["feature_list_sparse_types"] = value - } -} - -// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value. -// -// value: A list of Nfeature_list_dense shapes; the shapes of -// data in each FeatureList given in feature_list_dense_keys. -// The shape of each Feature in the FeatureList corresponding to -// feature_list_dense_key[j] must always equal -// feature_list_dense_shapes[j].NumEntries(). -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { - return func(m optionalAttr) { - m["feature_list_dense_shapes"] = value - } -} - -// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors. -// -// Arguments: -// serialized: A vector containing binary serialized SequenceExample protos. -// debug_name: A vector containing the names of the serialized protos. -// May contain, for example, table key (descriptive) name for the -// corresponding serialized proto. This is purely useful for debugging -// purposes, and the presence of values here has no effect on the output. -// May also be an empty vector if no name is available. -// context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty). -// context_dense_defaults[j] provides default values -// when the SequenceExample's context map lacks context_dense_key[j]. -// If an empty Tensor is provided for context_dense_defaults[j], -// then the Feature context_dense_keys[j] is required. -// The input type is inferred from context_dense_defaults[j], even when it's -// empty. 
If context_dense_defaults[j] is not empty, its shape must match -// context_dense_shapes[j]. -// feature_list_dense_missing_assumed_empty: A vector listing the -// FeatureList keys which may be missing from the SequenceExamples. If the -// associated FeatureList is missing, it is treated as empty. By default, -// any FeatureList not listed in this vector must exist in the SequenceExamples. -// context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars). -// The keys expected in the Examples' features associated with context_sparse -// values. -// context_dense_keys: A list of Ncontext_dense string Tensors (scalars). -// The keys expected in the SequenceExamples' context features associated with -// dense values. -// feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors -// (scalars). The keys expected in the FeatureLists associated with sparse -// values. -// feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars). -// The keys expected in the SequenceExamples' feature_lists associated -// with lists of dense values. -func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ParseSequenceExample", - Input: []tf.Input{ - serialized, debug_name, tf.OutputList(context_dense_defaults), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil { - 
scope.UpdateErr("ParseSequenceExample", err) - return - } - if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil { - scope.UpdateErr("ParseSequenceExample", err) - return - } - return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths -} - -// Returns true if queue is closed. -// -// This operation returns true if the queue is closed and false if the queue -// is open. -// -// Arguments: -// handle: The handle to a queue. -func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "QueueIsClosedV2", - Input: []tf.Input{ - handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Inverse 3D fast Fourier transform. // // Computes the inverse 3-dimensional discrete Fourier transform over the @@ -38804,6 +38466,27 @@ func StatefulTruncatedNormal(scope *Scope, resource tf.Output, algorithm tf.Outp return op.Output(0) } +// Returns true if queue is closed. +// +// This operation returns true if the queue is closed and false if the queue +// is open. +// +// Arguments: +// handle: The handle to a queue. +func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "QueueIsClosedV2", + Input: []tf.Input{ + handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Checks whether a quantile stream has been initialized. // // An Op that checks if quantile stream resource is initialized. @@ -38826,6 +38509,214 @@ func IsBoostedTreesQuantileStreamResourceInitialized(scope *Scope, quantile_stre return op.Output(0) } +// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample. +type ParseSequenceExampleAttr func(optionalAttr) + +// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["Ncontext_sparse"] = value + } +} + +// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["Ncontext_dense"] = value + } +} + +// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["Nfeature_list_sparse"] = value + } +} + +// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value. 
+// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["Nfeature_list_dense"] = value + } +} + +// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value. +// +// value: A list of Ncontext_sparse types; the data types of data in +// each context Feature given in context_sparse_keys. +// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), +// DT_INT64 (Int64List), and DT_STRING (BytesList). +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["context_sparse_types"] = value + } +} + +// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["feature_list_dense_types"] = value + } +} + +// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value. +// +// value: A list of Ncontext_dense shapes; the shapes of data in +// each context Feature given in context_dense_keys. +// The number of elements in the Feature corresponding to context_dense_key[j] +// must always equal context_dense_shapes[j].NumEntries(). +// The shape of context_dense_values[j] will match context_dense_shapes[j]. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["context_dense_shapes"] = value + } +} + +// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value. +// +// value: A list of Nfeature_list_sparse types; the data types +// of data in each FeatureList given in feature_list_sparse_keys. +// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), +// DT_INT64 (Int64List), and DT_STRING (BytesList). +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["feature_list_sparse_types"] = value + } +} + +// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value. +// +// value: A list of Nfeature_list_dense shapes; the shapes of +// data in each FeatureList given in feature_list_dense_keys. +// The shape of each Feature in the FeatureList corresponding to +// feature_list_dense_key[j] must always equal +// feature_list_dense_shapes[j].NumEntries(). +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { + return func(m optionalAttr) { + m["feature_list_dense_shapes"] = value + } +} + +// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors. +// +// Arguments: +// serialized: A vector containing binary serialized SequenceExample protos. +// debug_name: A vector containing the names of the serialized protos. +// May contain, for example, table key (descriptive) name for the +// corresponding serialized proto. 
This is purely useful for debugging +// purposes, and the presence of values here has no effect on the output. +// May also be an empty vector if no name is available. +// context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty). +// context_dense_defaults[j] provides default values +// when the SequenceExample's context map lacks context_dense_key[j]. +// If an empty Tensor is provided for context_dense_defaults[j], +// then the Feature context_dense_keys[j] is required. +// The input type is inferred from context_dense_defaults[j], even when it's +// empty. If context_dense_defaults[j] is not empty, its shape must match +// context_dense_shapes[j]. +// feature_list_dense_missing_assumed_empty: A vector listing the +// FeatureList keys which may be missing from the SequenceExamples. If the +// associated FeatureList is missing, it is treated as empty. By default, +// any FeatureList not listed in this vector must exist in the SequenceExamples. +// context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars). +// The keys expected in the Examples' features associated with context_sparse +// values. +// context_dense_keys: A list of Ncontext_dense string Tensors (scalars). +// The keys expected in the SequenceExamples' context features associated with +// dense values. +// feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors +// (scalars). The keys expected in the FeatureLists associated with sparse +// values. +// feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars). +// The keys expected in the SequenceExamples' feature_lists associated +// with lists of dense values. +func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ParseSequenceExample", + Input: []tf.Input{ + serialized, debug_name, tf.OutputList(context_dense_defaults), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if context_dense_values, idx, err = 
makeOutputList(op, idx, "context_dense_values"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil { + scope.UpdateErr("ParseSequenceExample", err) + return + } + return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths +} + // Fast Fourier transform. // // Computes the 1-dimensional discrete Fourier transform over the inner-most @@ -41742,6 +41633,115 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b return scope.AddOperation(opspec) } +// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor. +type ResizeNearestNeighborAttr func(optionalAttr) + +// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Resize `images` to `size` using nearest neighbor interpolation. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. +func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResizeNearestNeighbor", + Input: []tf.Input{ + images, size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// A placeholder op for a value that will be fed into the computation. +// +// Arguments: +// dtype: The type of elements in the tensor. +// shape: The shape of the tensor. +// +// Returns A tensor that will be provided using the infeed mechanism. 
+func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype, "shape": shape} + opspec := tf.OpSpec{ + Type: "InfeedDequeue", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Encodes a `RaggedTensor` into a `variant` Tensor. +// +// +// Encodes the given `RaggedTensor` and returns a `variant` Tensor. If +// `batched_input` is True, then input `RaggedTensor` is unbatched along the +// zero-th dimension, each component `RaggedTensor` is encoded into a scalar +// `variant` Tensor, and these are stacked to return a 1-D `variant` Tensor. +// If `batched_input` is False, then the input `RaggedTensor` is encoded as is and +// a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first +// creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the +// splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor +// is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the +// corresponding decoding logic. +// +// +// Arguments: +// rt_nested_splits: A list of one or more Tensors representing the splits of the input +// `RaggedTensor`. +// rt_dense_values: A Tensor representing the values of the input `RaggedTensor`. +// batched_input: A `bool` denoting whether the input is a batched `RaggedTensor`. +// +// Returns A `variant` Tensor that containing encoded `RaggedTensor`. +func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output, batched_input bool) (encoded_ragged tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"batched_input": batched_input} + opspec := tf.OpSpec{ + Type: "RaggedTensorToVariant", + Input: []tf.Input{ + tf.OutputList(rt_nested_splits), rt_dense_values, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum. type ResourceApplyKerasMomentumAttr func(optionalAttr) From 07d5aa230954276d9de201217b18e86815328ab8 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Thu, 18 Jun 2020 12:58:11 -0700 Subject: [PATCH 0523/1390] Use Requantize after per-channel Conv only if tensor min/max is different from RELU min/max beyond a limit. Also rectify RELU1 bound to (-1, 1) PiperOrigin-RevId: 317162909 Change-Id: Ice90226436cccf49507bd17877222da755d22644 --- .../hexagon/builders/conv_2d_builder.cc | 11 ++- .../hexagon/builders/tests/conv_test.cc | 84 +++++++++++++++++++ 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/hexagon/builders/conv_2d_builder.cc b/tensorflow/lite/delegates/hexagon/builders/conv_2d_builder.cc index 97db6bf8fd0..a366522e35c 100644 --- a/tensorflow/lite/delegates/hexagon/builders/conv_2d_builder.cc +++ b/tensorflow/lite/delegates/hexagon/builders/conv_2d_builder.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include +#include #include #include "tensorflow/lite/c/builtin_op_data.h" @@ -197,7 +198,7 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, conv_output_min = 0; conv_output_max = 6; } else if (activation == kTfLiteActRelu1) { - conv_output_min = 0; + conv_output_min = -1; conv_output_max = 1; } else if (activation == kTfLiteActRelu) { conv_output_min = 0; @@ -351,8 +352,12 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, output_max_tensor = AddOutput(sizeof(float), 4, kScalarShape); } - // Requantize if activation was not None. - if (activation != kTfLiteActNone) { + // Requantize if activation was not None & the TFLite tensor's min/max is + // different (diff > 1e-2) from the RELU bounds. + const float min_bound_diff = std::abs(conv_output_min - output_min); + const float max_bound_diff = std::abs(conv_output_max - output_max); + if (activation != kTfLiteActNone && + (min_bound_diff > 0.01 || max_bound_diff > 0.01)) { auto* requantized_min_const = graph_builder_->AddConstNodeWithData( kScalarShape, reinterpret_cast(&output_min), sizeof(output_min)); auto* requantized_max_const = graph_builder_->AddConstNodeWithData( diff --git a/tensorflow/lite/delegates/hexagon/builders/tests/conv_test.cc b/tensorflow/lite/delegates/hexagon/builders/tests/conv_test.cc index 13fd768fded..eed1bf29aae 100644 --- a/tensorflow/lite/delegates/hexagon/builders/tests/conv_test.cc +++ b/tensorflow/lite/delegates/hexagon/builders/tests/conv_test.cc @@ -207,6 +207,43 @@ TEST(QuantizedConvolutionOpModel, SimpleConvTestReLU6Activation) { 1e-5))); } +// Same as above, but the output min/max matches the RELU bounds. +// Therefore, a Requantize node will not get added after Supernode. +TEST(QuantizedConvolutionOpModel, + SimpleConvTestReLU6Activation_NoRequantizeRequired) { + QuantizedConvolutionOpModel m( + BuiltinOperator_CONV_2D, {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_UINT8, {}, 0, 6}, + Padding_VALID, /**dilation_factor**/ 1, + /**stride**/ 2, ActivationFunctionType_RELU6); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + { + 6, 2, 5, // first batch, left + 6, 2, 5, // first batch, right + 6, 4, 3, // second batch, left + 6, 4, 3, // second batch, right + }, + 2e-2))); +} + TEST(QuantizedConvolutionOpModel, SimplePerTensor_Int8) { QuantizedConvolutionOpModel m( BuiltinOperator_CONV_2D, @@ -512,6 +549,53 @@ TEST(QuantizedConvolutionOpModel, DepthwiseConvSimplePerTensor_Int8) { ElementsAreArray(ArrayFloatNear({43, 48, 40, 52, 3, -4, 4, 4}, 0.6f))); } +TEST(QuantizedConvolutionOpModel, DepthwiseConvSimplePerTensor_Int8_RELU1) { + QuantizedConvolutionOpModel m( + BuiltinOperator_DEPTHWISE_CONV_2D, + {TensorType_INT8, {1, 2, 3, 1}, -63.5, 64, 0.5, -1}, + {TensorType_INT8, + // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel] + {1, 2, 2, 4}, + 0, + 0, + 0, + 0, + /*per_channel_quantization=*/true, + /*per_channel_quantization_scales=*/{0.1, 2, 3, 0.4}, + /*per_channel_quantization_offsets=*/{0, 0, 0, 0}, + /*channel_index=*/3}, + {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID, + /**dilation_factor**/ 1, + 
/**stride**/ 1, ActivationFunctionType_RELU_N1_TO_1); + m.SetInt8Input({ + // [1 * 2 * 3 * 1] as [batch, y, x, input_channel] + 3, // batch = 0, y = 0, x = 0 + 1, // batch = 0, y = 0, x = 1 + -2, // batch = 0, y = 0, x = 2 + 4, // batch = 0, y = 1, x = 0 + 2, // batch = 0, y = 1, x = 1 + -4, // batch = 0, y = 1, x = 2 + }); + m.SetPerChannelQuantizedFilter({ + // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel] + // depth multiplier = 2 + 1, 2, 3, 4, // y = 0, x = 0 + 3, 4, 5, 6, // y = 0, x = 1 + 7, 8, 5, 6, // y = 1, x = 0 + 3, 4, 1, 2, // y = 1, x = 1 + }); + m.SetPerChannelQuantizedBias({3, -2, 4, 6}); + + // Reference output. + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, 1e-2))); +} + TEST(QuantizedConvolutionOpModel, DepthwiseConvSimplePerAxis_Int8) { QuantizedConvolutionOpModel m( BuiltinOperator_DEPTHWISE_CONV_2D, From 41e7392f58391c18aa872c638ff2e1ac72326bcd Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 18 Jun 2020 13:09:04 -0700 Subject: [PATCH 0524/1390] - Eliminate all uses of passes that mark function visibility since the visibility is now set correctly when importing. - Update tf_saved_model dialect verification to verify that exported functions are marked public. - Eliminate function_visibility.mlir test. This test fails after the tf_saved_model verification changes since its run tf.entry_function based visibility on a tf_saved_model MLIR module. Also, these passes will be removed. - Fix TPURewritePass to mark the appropriate visibility on the serialized MLIR attached to tf._TPUCompileMlir op. PiperOrigin-RevId: 317165278 Change-Id: I8e8f6de4b56e89c303815edc3b34bcf0a4e82d2d --- .../compiler/mlir/lite/tf_tfl_passes.cc | 3 - tensorflow/compiler/mlir/tensorflow/BUILD | 1 - .../mlir/tensorflow/ir/tf_saved_model.cc | 14 +- .../tensorflow/tests/function_visibility.mlir | 47 ----- .../tf_saved_model_delete_unused_funcs.mlir | 96 ---------- .../tf_saved_model_freeze_global_tensors.mlir | 2 +- .../tensorflow/tests/tf_saved_model_ops.mlir | 2 +- .../tests/tf_saved_model_ops_invalid.mlir | 34 +++- ...timize_global_tensors_interprocedural.mlir | 22 +-- .../transforms/mark_function_visibility.cc | 165 ------------------ .../mlir/tensorflow/transforms/passes.h | 22 --- .../tensorflow/transforms/tpu_rewrite_pass.cc | 3 + .../tensorflow/utils/compile_mlir_util.cc | 3 - tensorflow/compiler/tf2xla/mlir_tf2xla.cc | 5 - 14 files changed, 59 insertions(+), 360 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir delete mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 589515d6246..008098f62ba 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -212,9 +212,6 @@ void CreateTFLStandardPipeline(OpPassManager& pm, // Saved model pass to mark global tensors immutable. pm.addPass(mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); - // Used to mark non-exported functions in saved model private. - pm.addPass(mlir::tf_saved_model:: - CreateMarkFunctionVisibilityUsingSavedModelLinkagePass()); // Op fusion pass. 
pm.addPass(mlir::TFL::CreatePrepareCompositeFunctionsPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 17ed0e36a28..54e57512c32 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -491,7 +491,6 @@ cc_library( "transforms/graph_pruning.cc", "transforms/launch_to_device_attribute.cc", "transforms/layout_optimization.cc", - "transforms/mark_function_visibility.cc", "transforms/materialize_mlir_passthrough_op.cc", "transforms/optimize.cc", "transforms/optimize_global_tensors.cc", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 6af70158e14..d59532fef65 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -229,8 +229,20 @@ static LogicalResult VerifySavedModelModule( } } for (auto func : module.getOps()) { + const bool is_exported = IsExported(func); + + if (is_exported && func.getVisibility() != FuncOp::Visibility::Public) { + return func.emitError() + << "exported function @" << func.getName() << " should be public"; + } + + if (!is_exported && func.getVisibility() == FuncOp::Visibility::Public) { + return func.emitError() << "non-exported function @" << func.getName() + << " should be private"; + } + if (HasAnyTfSavedModelArgAttr(func)) { - if (!IsExported(func)) { + if (!is_exported) { return func.emitError() << "can only apply 'tf_saved_model' argument attributes " "to exported functions"; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir b/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir deleted file mode 100644 index 55af3cffde3..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir +++ /dev/null @@ -1,47 +0,0 @@ -// RUN: tf-opt -tf-saved-model-mark-func-visibility -split-input-file %s | FileCheck --check-prefix=SAVEDMODEL %s -// RUN: tf-opt -tf-mark-func-visibility -split-input-file -verify-diagnostics %s | FileCheck %s - - -module attributes {tf_saved_model.semantics} { - // SAVEDMODEL: func @func_exported_1() attributes {tf_saved_model.exported_names = ["func_exported_1"]} - func @func_exported_1() attributes {tf_saved_model.exported_names = ["func_exported_1"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // SAVEDMODEL: func @func_exported_2() attributes {tf_saved_model.exported_names = ["func_exported_2"]} - func @func_exported_2() attributes {tf_saved_model.exported_names = ["func_exported_2"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // SAVEDMODEL: func @func_not_exported() attributes {sym_visibility = "private"} - func @func_not_exported() { - return - } - -} - -// ----- - -module { - // CHECK: func @func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}} - func @func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}} { - return %arg0 : tensor<1xi32> - } - - // CHECK: func @func_without_entry_spec(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> attributes {sym_visibility = "private"} - func @func_without_entry_spec(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { - %0 = "tf.AddV2"(%arg0, %arg1) {T = i32, device = ""} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - 
return %0 : tensor<*xi32> - } -} - -// ----- - -module { - // expected-error @+1 {{can't overwrite the visibility of function private_func_with_entry_spec with private visibility}} - func @private_func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}, sym_visibility = "private"} { - return %arg0 : tensor<1xi32> - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir deleted file mode 100644 index 6f2c47a935f..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir +++ /dev/null @@ -1,96 +0,0 @@ -// RUN: tf-opt -tf-saved-model-mark-func-visibility -symbol-dce -split-input-file %s | FileCheck %s - -module attributes {tf_saved_model.semantics} { - - // Test case: Unused function should be deleted. - - // CHECK-NOT: func @unused - func @unused() { - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Root calls child. Child should not be deleted. - - // CHECK: func @root - func @root() attributes {tf_saved_model.exported_names = ["root"]} { - "tf.some_call"() { callee = @child } : () -> () - return - } - - // CHECK: func @child - func @child() { - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Don't crash if attribute that doesn't reference a func. - - "tf.some_opaque_global_variable"() { sym_name = "some_global" } : () -> () - - func @root2() attributes {tf_saved_model.exported_names = ["root2"]} { - "tf.do_something_with_a_global"() { global = @some_global } : () -> () - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Delete recursively dead cycle. - - // CHECK-NOT: func @recursively_dead0 - func @recursively_dead0() { - "tf.some_call"() { callee = @recursively_dead1 } : () -> () - return - } - // CHECK-NOT: func @recursively_dead1 - func @recursively_dead1() { - "tf.some_call"() { callee = @recursively_dead0 } : () -> () - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Root calls child with a deeply nested symbol reference. - // Child should not be deleted. - - // CHECK: func @root - func @root() attributes {tf_saved_model.exported_names = ["root"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // CHECK: func @child - func @child() { - return - } - -} - -// ----- - -// Test case: If the module doesn't have tf_saved_model semantics, then this -// pass shouldn't do anything. 
-module { - // CHECK: func @not_dead() - func @not_dead() { - return - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 38627b41b68..6c32a3bc4d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -64,7 +64,7 @@ module attributes {tf_saved_model.semantics} { return } - func @f_callee(%arg0: tensor>>) { + func @f_callee(%arg0: tensor>>) attributes {sym_visibility = "private"} { return } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir index aa1f996da07..05e7638645f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir @@ -40,7 +40,7 @@ module attributes {tf_saved_model.semantics} { return %arg0 : tensor } - func @f() { + func @f() attributes {sym_visibility = "private"} { return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index 544600cf6b8..f04e1a60b36 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -3,7 +3,7 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{unknown tf_saved_model dialect arg attribute 'tf_saved_model.not_a_real_arg_attr'}} - func @f(%arg0: tensor {tf_saved_model.not_a_real_arg_attr = 1 : i32}) { + func @f(%arg0: tensor {tf_saved_model.not_a_real_arg_attr = 1 : i32}) attributes {sym_visibility = "private"} { return } @@ -233,7 +233,7 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () // expected-error@+1 {{can only apply 'tf_saved_model' argument attributes to exported functions}} func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) - -> (tensor {tf_saved_model.index_path = []}) { + -> (tensor {tf_saved_model.index_path = []}) attributes {sym_visibility = "private"} { %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -273,7 +273,7 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{the initializer function should have no output}} "tf_saved_model.session_initializer"() { initializer = @init } : () -> () - func @init() -> tensor<1xf32> { + func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} { %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> return %0 : tensor<1xf32> } @@ -286,8 +286,34 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.session_initializer"() { initializer = @init } : () -> () // expected-error@+1 {{there must be no more than one session_initializer op}} "tf_saved_model.session_initializer"() { initializer = @init } : () -> () - func @init() -> tensor<1xf32> { + func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} { %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> return %0 : tensor<1xf32> } } + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{exported function @f should be public}} + func @f( + %arg0: 
tensor {tf.resource_name = "resource"} + ) attributes { sym_visibility = "private", tf_saved_model.exported_names = ["foo.some_func"] } { + return + } + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{non-exported function @f should be private}} + func @f( + %arg0: tensor {tf.resource_name = "resource"} + ) { + return + } + +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir index 91e8c9c4b66..14a0006cd3b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir @@ -20,12 +20,12 @@ module attributes {tf_saved_model.semantics} { return %val : tensor } - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor return %val : tensor } @@ -59,7 +59,7 @@ module attributes {tf_saved_model.semantics} { return %val : tensor } - func @f_common(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_common(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor return %val : tensor } @@ -85,7 +85,7 @@ module attributes {tf_saved_model.semantics} { return %val_2 : tensor } - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %cst_1 = constant dense<2.0> : tensor return %cst_1 : tensor } @@ -112,13 +112,13 @@ module attributes {tf_saved_model.semantics} { } // CHECK: func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor @@ -146,13 +146,13 @@ module attributes {tf_saved_model.semantics} { } // CHECK: func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func 
@f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor @@ -179,13 +179,13 @@ module attributes {tf_saved_model.semantics} { // CHECK: func @f(%arg0: tensor<*x!tf.resource>) -> tensor - func @f(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @g} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func @g(%arg0: tensor<*x!tf.resource>) -> tensor - func @g(%arg0: tensor<*x!tf.resource>) -> tensor { + func @g(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } @@ -212,7 +212,7 @@ module attributes {tf_saved_model.semantics} { // CHECK: func @f(%arg0: tensor<*x!tf.resource>) -> tensor - func @f(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignAddVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc deleted file mode 100644 index 31a80a4ecdb..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "llvm/ADT/STLExtras.h" -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" - -#define DEBUG_TYPE "tf-shape-inference" - -namespace mlir { - -namespace { - -LogicalResult MarkFunctionVisibility( - ModuleOp module, llvm::function_ref IsExternalVisible) { - LogicalResult result = success(); - - for (auto func : module.getOps()) { - FuncOp::Visibility old_visibility = func.getVisibility(); - - FuncOp::Visibility visibility = IsExternalVisible(func) - ? FuncOp::Visibility::Public - : FuncOp::Visibility::Private; - - auto get_visibility_name = [](FuncOp::Visibility v) { - return v == FuncOp::Visibility::Public - ? "public" - : v == FuncOp::Visibility::Private ? 
"private" : "nested"; - }; - - if (old_visibility != SymbolTable::Visibility::Public && - old_visibility != visibility) { - result = func.emitError() - << "can't overwrite the visibility of function " - << func.getName() << " with " - << get_visibility_name(old_visibility) << " visibility"; - } - - LLVM_DEBUG(llvm::dbgs() - << "function " << func.getName() << " has " - << get_visibility_name(visibility) << " visibility \n"); - - func.setVisibility(visibility); - } - - return result; -} - -} // anonymous namespace - -namespace TF { - -LogicalResult MarkFunctionVisibilityUsingEntryFunctionSpecification( - ModuleOp module) { - auto HasEntryFunctionSpecification = [](FuncOp func) -> bool { - auto attrs = func.getAttrOfType("tf.entry_function"); - return attrs && !attrs.empty(); - }; - return MarkFunctionVisibility(module, HasEntryFunctionSpecification); -} - -namespace { -struct MarkFunctionVisibilityUsingEntryFunctionSpecificationPass - : public PassWrapper< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass, - OperationPass> { - void runOnOperation() override { - if (failed(MarkFunctionVisibilityUsingEntryFunctionSpecification( - getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -static PassRegistration< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass> - pass("tf-mark-func-visibility", - "Use tf.entry_function to mark function visibility."); - -std::unique_ptr> -CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass() { - return std::make_unique< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass>(); -} - -// Marks the main function with public visibility, while other functions are -// marked with private visibility. -LogicalResult MarkOnlyMainFunctionWithPublicVisibility(ModuleOp module) { - for (auto func : module.getOps()) { - if (func.getName() == "main") { - func.setVisibility(FuncOp::Visibility::Public); - } else { - func.setVisibility(FuncOp::Visibility::Private); - } - } - return success(); -} - -namespace { -struct MarkOnlyMainFunctionWithPublicVisibilityPass - : public PassWrapper> { - void runOnOperation() override { - if (failed(MarkOnlyMainFunctionWithPublicVisibility(getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -std::unique_ptr> -CreateMarkOnlyMainFunctionWithPublicVisibilityPass() { - return std::make_unique(); -} - -} // namespace TF - -namespace tf_saved_model { - -static LogicalResult MarkFunctionVisibilityUsingSavedModelLinkage( - ModuleOp module) { - if (!tf_saved_model::HasTfSavedModelSemantics(module)) { - return success(); - } - return MarkFunctionVisibility(module, tf_saved_model::IsExported); -} - -namespace { -struct MarkFunctionVisibilityUsingSavedModelLinkagePass - : public PassWrapper> { - void runOnOperation() override { - if (failed(MarkFunctionVisibilityUsingSavedModelLinkage(getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -static PassRegistration pass( - "tf-saved-model-mark-func-visibility", - "Use tf_saved_model linkage information to mark function visibility."); - -std::unique_ptr> -CreateMarkFunctionVisibilityUsingSavedModelLinkagePass() { - return std::make_unique(); -} - -} // namespace tf_saved_model - -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 7158d0f6be0..5cb15027fc5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -117,21 +117,6 @@ 
std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); std::unique_ptr> CreateConvertReadonlyReferenceVariablesToResourceVariablesPass(); -// Marks function visibility using tf.entry_function specification. That is, -// functions with tf.entry_function attributes are marked with public -// visibility while the other functions are marked with private visibility. -LogicalResult MarkFunctionVisibilityUsingEntryFunctionSpecification( - ModuleOp module); -// Creates a pass that uses tf.entry_function specification to mark function -// visibility. -std::unique_ptr> -CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass(); - -// Creates a pass that marks the main function with public visibility, while -// other functions are marked with private visibility. -std::unique_ptr> -CreateMarkOnlyMainFunctionWithPublicVisibilityPass(); - // Creates a simple device assignment pass on TF dialect for CoreRT use case. std::unique_ptr> CreateSimpleTFDeviceAssignmentPass( llvm::StringRef default_device); @@ -315,13 +300,6 @@ std::unique_ptr> CreateOptimizeGlobalTensorsPass(); // Creates a pass that freezes tf_saved_model.global_tensor ops. std::unique_ptr> CreateFreezeGlobalTensorsPass(); -// Creates a pass that uses tf_saved_model dialect linkage information -// to mark function visibility. That is, exported functions are marked with -// public visibility while the other functions are marked with private -// visibility. -std::unique_ptr> -CreateMarkFunctionVisibilityUsingSavedModelLinkagePass(); - } // namespace tf_saved_model } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 696882cd105..ec9b3df525f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -146,6 +146,9 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, // We can simply change name of TPU program's main function because there // should be no other reference to it. clone.setName("main"); + clone.setVisibility(FuncOp::Visibility::Public); + } else { + clone.setVisibility(FuncOp::Visibility::Private); } symbol_table.insert(clone); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index fd1ba3b1901..dac2fea87e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -267,9 +267,6 @@ Status ConvertMLIRToXlaComputation( const XlaCompiler::ShapeRepresentationFn shape_representation_fn, std::vector> custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); - // Mark main function as public, and other functions as private. 
-  tf2xla.addPass(
-      mlir::TF::CreateMarkOnlyMainFunctionWithPublicVisibilityPass());
   tf2xla.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
   tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass());
   tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass());
diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
index 43793be56a7..60d1f3da0c5 100644
--- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
@@ -165,11 +165,6 @@ Status ConvertGraphDefToXlaViaMlir(
   device_set.AddDevice(&device);
   AddDevicesToOp(*module, &device_set);
 
-  if (failed(mlir::TF::MarkFunctionVisibilityUsingEntryFunctionSpecification(
-          *module))) {
-    return errors::Internal("Problem with mark function visibility");
-  }
-
   TF_RETURN_IF_ERROR(mlir::TF::RunBridgeWithStandardPipeline(
       *module, /*enable_logging=*/VLOG_IS_ON(1), /*enable_inliner=*/true));
 
From ededf6f4b9d1488c1d27df58b047fd5da6ad6c73 Mon Sep 17 00:00:00 2001
From: Sharada Shiddibhavi
Date: Thu, 18 Jun 2020 13:14:37 -0700
Subject: [PATCH 0525/1390] Update tensorflow/core/util/mkl_util.h

Addressing review comments.

Co-authored-by: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com>
---
 tensorflow/core/util/mkl_util.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 996984eebc0..854d6e349cb 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1538,13 +1538,8 @@ class MklDnnData {
   /// Set function for data buffer of user memory primitive.
   inline void SetUsrMemDataHandle(const Tensor* tensor,
                                   std::shared_ptr<stream> t_stream = nullptr) {
-    CHECK_NOTNULL(user_memory_);
-    CHECK_NOTNULL(tensor);
-#ifdef ENABLE_MKLDNN_THREADPOOL
-    user_memory_->set_data_handle(GetTensorBuffer(tensor), *t_stream);
-#else
-    user_memory_->set_data_handle(GetTensorBuffer(tensor));
-#endif  // ENABLE_MKLDNN_THREADPOOL
+    SetUsrMemDataHandle(GetTensorBuffer(tensor), t_stream);
+  }
   }
 
   /// allocate function for data buffer
From 51f3da9ca884951be412bccb766c3700ba2255f3 Mon Sep 17 00:00:00 2001
From: Yunxing Dai
Date: Thu, 18 Jun 2020 13:15:43 -0700
Subject: [PATCH 0526/1390] [Resubmit] Remove dynamic dimension of strided
 slice grad if input to strided slice is static.

If we slice a dynamically shaped tensor out of a statically shaped tensor,
the output of the gradient should still be static. Unfortunately this cannot
be deduced by XLA alone, so extra information is needed from the tf2xla
bridge.
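For reference, a minimal sketch (not part of the change itself) of how the new
RemoveDynamicDimension helper added below can be used; it mirrors the unit test
introduced in this patch:

    // Build a value whose dimension 0 is first marked dynamic, then marked
    // static again with the new helper.
    xla::XlaBuilder b("remove_dynamic_dimension_example");
    xla::XlaOp p0 =
        xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla::F32, {10}), "p0");
    xla::XlaOp size =
        xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla::S32, {}), "size");
    // Dimension 0 becomes dynamic, bounded by 10.
    xla::XlaOp dynamic = xla::SetDimensionSize(p0, size, /*dimension=*/0);
    // Dimension 0 is known to be static again, so drop the dynamic marker.
    xla::XlaOp result = xla::RemoveDynamicDimension(dynamic, /*dimension=*/0);

StridedSliceGrad uses the same helper: when the input shape is static along a
dimension but the intermediate gradient is dynamic there, the dynamic marker
is removed so the output stays static.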
PiperOrigin-RevId: 317166566 Change-Id: Ic3a16826242947a29cafd51b2f5c19e65d531fb9 --- .../tf2xla/kernels/strided_slice_op.cc | 24 +++++++++++++++++ tensorflow/compiler/xla/client/xla_builder.cc | 23 ++++++++++++++++ tensorflow/compiler/xla/client/xla_builder.h | 6 +++++ .../compiler/xla/client/xla_builder_test.cc | 26 +++++++++++++++++++ 4 files changed, 79 insertions(+) diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 2684c982600..784b790767c 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -350,6 +350,30 @@ class StridedSliceGradOp : public XlaOpKernel { grad = xla::Rev(grad, dimensions_to_reverse); } grad = xla::Pad(grad, zero, padding_config); + + xla::XlaOp dynamic_shape = ctx->Input(0); + xla::Shape grad_shape = ctx->builder()->GetShape(grad).ValueOrDie(); + ctx->set_dynamic_dimension_is_minus_one(true); + std::vector dynamic_size; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &dynamic_size)); + // Input of strided_slice_op has to have the same shape as output. + DCHECK_EQ(grad_shape.rank(), input_shape.dims()); + for (int64 dim = 0; dim < input_shape.dims(); ++dim) { + DCHECK_EQ(grad_shape.dimensions(dim), input_shape.dim_size(dim)); + if (dynamic_size[dim] == -1) { + // Input is a dynamic dimension, set the same dynamic dimension size in + // the output. + auto dim_size = xla::Slice(dynamic_shape, {dim}, {dim + 1}, {1}); + auto dim_size_scalar = + xla::Reshape(xla::ShapeUtil::MakeScalarShape(xla::S32), dim_size); + grad = xla::SetDimensionSize(grad, dim_size_scalar, dim); + } else if (grad_shape.is_dynamic_dimension(dim)) { + // Input is static but output is dynamic, respect input and remove any + // dynamic dim in the output. + grad = xla::RemoveDynamicDimension(grad, dim); + } + } + ctx->SetOutput(0, grad); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index bfba48862f6..c7b6a7f9491 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2727,6 +2727,25 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64 dimension) { }); } +XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + + Shape shape = *operand_shape; + shape.set_dynamic_dimension(dimension, false); + // Setting an op's dynamic dimension to its static size removes the dynamic + // dimension. 
+ XlaOp static_size = + ConstantR0(this, operand_shape->dimensions(dimension)); + + *instr.mutable_shape() = shape.ToProto(); + instr.add_dimensions(dimension); + return AddInstruction(std::move(instr), HloOpcode::kSetDimensionSize, + {operand, static_size}); + }); +} + XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -3827,4 +3846,8 @@ XlaOp SetDimensionSize(const XlaOp operand, const XlaOp val, int64 dimension) { return operand.builder()->SetDimensionSize(operand, val, dimension); } +XlaOp RemoveDynamicDimension(const XlaOp operand, int64 dimension) { + return operand.builder()->RemoveDynamicDimension(operand, dimension); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index ffa6a7c3439..b8af180b83e 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -704,6 +704,8 @@ class XlaBuilder { XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, absl::Span operands = {}); @@ -1151,6 +1153,7 @@ class XlaBuilder { friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension); friend XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + friend XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); protected: // Returns OK status if the given op was built using this builder. Otherwise, @@ -2149,6 +2152,9 @@ XlaOp GetDimensionSize(XlaOp operand, int64 dimension); XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); +// Returns the same op but with dynamic dimension removed. +XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + // Implementation details below this point. // diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 4fa47077fca..7011c946203 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -556,6 +556,32 @@ TEST_F(XlaBuilderTest, DynamicParameter) { EXPECT_TRUE(param_shape.is_dynamic_dimension(0)); } +TEST_F(XlaBuilderTest, SetDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/set_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(root_shape.is_dynamic_dimension(0)); +} + +TEST_F(XlaBuilderTest, RemoveDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/remove_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + // Dynamic dimension has been removed. 
+ EXPECT_FALSE(root_shape.is_dynamic_dimension(0)); +} + TEST_F(XlaBuilderTest, DynamicUnary) { XlaBuilder b(TestName()); Shape tuple_param_shape = ShapeUtil::MakeTupleShape( From 8944a3eeb18c2374f02759324cbeded018c5868b Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Thu, 18 Jun 2020 13:23:05 -0700 Subject: [PATCH 0527/1390] Enable type annotations for python/autograph. PiperOrigin-RevId: 317168016 Change-Id: I57341cc0347ab8eb008949e191da1415069203b7 --- tensorflow/python/autograph/pyct/parser.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py index 747d56e401d..9ac7c2ef2a6 100644 --- a/tensorflow/python/autograph/pyct/parser.py +++ b/tensorflow/python/autograph/pyct/parser.py @@ -22,6 +22,7 @@ from __future__ import division from __future__ import print_function import re +import sys import textwrap import tokenize @@ -33,11 +34,18 @@ from tensorflow.python.autograph.pyct import errors from tensorflow.python.autograph.pyct import inspect_utils -STANDARD_PREAMBLE = textwrap.dedent(""" - from __future__ import division - from __future__ import print_function +PY2_PREAMBLE = textwrap.dedent(""" +from __future__ import division +from __future__ import print_function """) -STANDARD_PREAMBLE_LEN = 2 +PY3_PREAMBLE = '' + +if sys.version_info >= (3,): + STANDARD_PREAMBLE = PY3_PREAMBLE +else: + STANDARD_PREAMBLE = PY2_PREAMBLE + +STANDARD_PREAMBLE_LEN = STANDARD_PREAMBLE.count('__future__') _LEADING_WHITESPACE = re.compile(r'\s*') From f8431d0c293a34c5bfb91cf0c57384eaa47a9911 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi Date: Thu, 18 Jun 2020 13:29:48 -0700 Subject: [PATCH 0528/1390] Update mkl_util.h --- tensorflow/core/util/mkl_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 854d6e349cb..eb1a105e07c 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1540,7 +1540,6 @@ class MklDnnData { std::shared_ptr t_stream = nullptr) { SetUsrMemDataHandle(GetTensorBuffer(tensor), t_stream); } - } /// allocate function for data buffer inline void AllocateBuffer(size_t size) { From 2663edb6691b50de8ca9445c311d02b72faa1bf5 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 18 Jun 2020 13:29:28 -0700 Subject: [PATCH 0529/1390] Change TPUExtractOutsideCompilation pass for a Module pass. This is needed for getting the devices from the module for assigning host device for outside compilation launch op. PiperOrigin-RevId: 317169244 Change-Id: I734e7eeef3fdb037045d070ffd736be4ef8edee1 --- .../compiler/mlir/tensorflow/transforms/passes.h | 3 ++- .../transforms/tpu_extract_outside_compilation.cc | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 5cb15027fc5..a34be28c809 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -281,7 +281,8 @@ std::unique_ptr> CreateTPUHostComputationExpansionPass(); // Creates a pass that extract outside compilation (CPU ops inside TPU cluster) // ops to a separate parallel_execute region to run on CPU. 
-std::unique_ptr> CreateTPUExtractOutsideCompilationPass(); +std::unique_ptr> +CreateTPUExtractOutsideCompilationPass(); // Populates the supplied passmanager with the passes required to run the void CreateTPUBridgePipeline(OpPassManager& pm); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index a2a19108326..503c9869557 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -49,8 +49,9 @@ using OutsideClusterMap = // TODO(b/154363171): Add example tranformations. struct TPUExtractOutsideCompilation - : public PassWrapper { - void runOnFunction() override; + : public PassWrapper> { + void runOnOperation() override; }; // Collects and clusters ops in `block` with the same `_xla_outside_compilation` @@ -305,9 +306,9 @@ void CreateParallelExecuteFromOutsideClusters( } } -void TPUExtractOutsideCompilation::runOnFunction() { +void TPUExtractOutsideCompilation::runOnOperation() { auto extract_result = - getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { + getOperation().walk([&](tf_device::ClusterOp tpu_cluster) { OutsideClusterMap clusters; if (failed(CollectAndGroupOutsideClusterOps(&tpu_cluster.GetBody(), &clusters))) @@ -325,7 +326,7 @@ void TPUExtractOutsideCompilation::runOnFunction() { } // namespace -std::unique_ptr> +std::unique_ptr> CreateTPUExtractOutsideCompilationPass() { return std::make_unique(); } From 6558da5a66ad6863e47abfe596eee2290524b1b7 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 18 Jun 2020 13:30:08 -0700 Subject: [PATCH 0530/1390] Apply new TraceMe APIs. PiperOrigin-RevId: 317169381 Change-Id: I2259895a8dde21e25661a239b9d4f5911a454adb --- tensorflow/compiler/xla/pjrt/BUILD | 2 ++ tensorflow/compiler/xla/pjrt/pjrt_client.cc | 16 ++++++++-------- tensorflow/core/profiler/lib/connected_traceme.h | 1 + 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index dd50d0577d4..e401a798d68 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -141,7 +141,9 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/core:allocator", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/host:host_platform_id", diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index ccb72b7ce30..ef259cf1cfd 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -98,7 +98,9 @@ limitations under the License. 
#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/event.h" @@ -1429,10 +1431,9 @@ StatusOr PjRtExecutable::EnqueueExecution( int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, std::vector* device_buffers) const { int device_ordinal = device->local_device_state()->device_ordinal(); - tensorflow::profiler::TraceMe traceme([&] { - return absl::StrCat("LocalExecutable::Execute#run_id=", run_id.ToInt(), - "#"); - }); + tensorflow::profiler::TraceMeConsumer activity( + "LocalExecutable::Execute", tensorflow::profiler::ContextType::kPjRt, + run_id.ToInt()); VLOG(3) << "Replica " << replica << ", partition " << partition << " mapped to device ordinal for execution: " << device_ordinal; @@ -1721,10 +1722,9 @@ PjRtExecutable::ExecuteOnLocalDevices( absl::Span> argument_handles, const ExecuteOptions& options) const { RunId run_id; - tensorflow::profiler::TraceMe traceme([&] { - return absl::StrCat( - "LocalExecutable::ExecuteOnLocalDevices#run_id=", run_id.ToInt(), "#"); - }); + tensorflow::profiler::TraceMeProducer activity( + "LocalExecutable::ExecuteOnLocalDevices", + tensorflow::profiler::ContextType::kPjRt, run_id.ToInt()); const int num_local_devices = local_devices_.size(); diff --git a/tensorflow/core/profiler/lib/connected_traceme.h b/tensorflow/core/profiler/lib/connected_traceme.h index ed8b4ac1ad2..b55c4407fe6 100644 --- a/tensorflow/core/profiler/lib/connected_traceme.h +++ b/tensorflow/core/profiler/lib/connected_traceme.h @@ -29,6 +29,7 @@ enum class ContextType : int { kGeneric, kTfExecutor, kSharedBatchScheduler, + kPjRt, }; /* From 61d83075ad34fd33d7d44ea3341a055e26e775a6 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Wed, 17 Jun 2020 14:13:29 -0700 Subject: [PATCH 0531/1390] Fixing MklTanh compilation error with DNNL0 --- .../core/common_runtime/mkl_layout_pass.cc | 3 +++ .../common_runtime/mkl_layout_pass_test.cc | 3 +++ tensorflow/core/kernels/mkl_relu_op.cc | 23 +++++++++++++++++-- tensorflow/core/kernels/mkl_relu_op_test.cc | 5 +++- 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index f6e42fc7e8c..778d5445cb2 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -682,12 +682,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back( {csinfo_.requantize, mkl_op_registry::GetMklOpName(csinfo_.requantize), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); +#ifdef ENABLE_MKLDNN_V1 + // Optimized TanhGrad support exists only in DNNL 1.x. 
rinfo_.push_back({csinfo_.tanh, mkl_op_registry::GetMklOpName(csinfo_.tanh), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.tanh_grad, mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); +#endif // ENABLE_MKLDNN_V1 rinfo_.push_back( {csinfo_.reshape, mkl_op_registry::GetMklOpName(csinfo_.reshape), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); diff --git a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc index 9971f6c5d7e..d480c0a49ce 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc @@ -3024,6 +3024,8 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_LeakyReluLeakyReluGrad_Positive); // clang-format on // clang-format off +#ifdef ENABLE_MKLDNN_V1 + #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); \ @@ -3081,6 +3083,7 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_TanhGrad_Positive); } REGISTER_TEST_ALL_TYPES(NodeRewrite_TanhTanhGrad_Positive); #undef REGISTER_TEST +#endif // ENABLE_MKLDNN_V1 // clang-format on TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) { diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 6d79b8f3282..70aa1e937d3 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -27,6 +26,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::algorithm; using mkldnn::eltwise_forward; @@ -269,7 +269,7 @@ class MklEltwiseBwdParams { MklEltwiseBwdParams(const memory::dims& src_dims, const memory::desc& common_md, algorithm alg_kind, - float alpha, float beta, int forward_input_type) + float alpha, float beta, int forward_input_type = -1) : src_dims(src_dims), common_md(common_md), alg_kind(alg_kind), @@ -644,7 +644,10 @@ class MklReluGradOpBase : public OpKernel { virtual int GetDiffSrcIndex() const { return 0; } // What is the type of input tensor that grad op receives from forward op -- // is it 'x' (SRC) or 'y' (DST). For Relu-family, it is 'x', so fwd op SRC. + +#ifdef ENABLE_MKLDNN_V1 virtual int GetTypeOfInputTensorFromFwdOp() const { return MKLDNN_ARG_SRC; } +#endif void Compute(OpKernelContext* context) { try { @@ -736,8 +739,16 @@ class MklReluGradOpBase : public OpKernel { common_md = src_md; } +#ifdef ENABLE_MKLDNN_V1 MklEltwiseBwdParams bwdParams(src_dims, common_md, alg_kind, alpha_, beta_, GetTypeOfInputTensorFromFwdOp()); +#else + // MKLDNN V0 does not support reusing output of forward op in backward. + // So this optimization works only in MKLDNN v1. 
+ MklEltwiseBwdParams bwdParams(src_dims, common_md, alg_kind, alpha_, + beta_); +#endif // ENABLE_MKLDNN_V1 + MklEltwiseBwdPrimitive* eltwise_bwd = MklEltwiseBwdPrimitiveFactory::Get(bwdParams); @@ -962,6 +973,11 @@ class MklEluGradOp } }; +#ifdef ENABLE_MKLDNN_V1 +// Optimized TanhGrad support exists in DNNL1.x only +// (eltwise_tanh_use_dst_for_bwd). We can still support it with DNNL0.x, but +// it will not be optimized. So we disable it for DNNL0.x. + template class MklTanhOp : public MklReluOpBase { public: @@ -1043,6 +1059,7 @@ class MklTanhGradOp (static_cast(user_g))[0] * (static_cast(1) - tanh * tanh); } }; +#endif // ENABLE_MKLDNN_V1 #define RELU6_UPPER_BOUND 6.0f template @@ -1227,6 +1244,7 @@ TF_CALL_bfloat16(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES); TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES); TF_CALL_bfloat16(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES); +#ifdef ENABLE_MKLDNN_V1 #define REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES(type) \ REGISTER_KERNEL_BUILDER( \ Name("_MklTanh") \ @@ -1242,6 +1260,7 @@ TF_CALL_bfloat16(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES); MklTanhGradOp); TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES); TF_CALL_bfloat16(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES); +#endif #define REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES(type) \ REGISTER_KERNEL_BUILDER( \ diff --git a/tensorflow/core/kernels/mkl_relu_op_test.cc b/tensorflow/core/kernels/mkl_relu_op_test.cc index d1fdf7ab4ae..86d7f979c1f 100644 --- a/tensorflow/core/kernels/mkl_relu_op_test.cc +++ b/tensorflow/core/kernels/mkl_relu_op_test.cc @@ -15,8 +15,8 @@ limitations under the License. #ifdef INTEL_MKL -#include "mkldnn.hpp" #include "absl/strings/match.h" +#include "mkldnn.hpp" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -121,8 +121,11 @@ static Graph* Activation(const string& op_name, const string& kind, BM(OP, 32, 64, 128, 256, cpu); \ BM(OP, 33, 65, 129, 257, cpu); +#ifdef ENABLE_MKLDNN_V1 +// Optimized MKLDNN TanhGrad support exists in DNNL1.x only. TEST_ALL_SIZES(Tanh) TEST_ALL_SIZES(TanhGrad) +#endif // ENABLE_MKLDNN_V1 TEST_ALL_SIZES(Relu) TEST_ALL_SIZES(ReluGrad) TEST_ALL_SIZES(Elu) From e0962f4c374f4cbf78ad27fd2391f976c1a2050d Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 18 Jun 2020 13:35:31 -0700 Subject: [PATCH 0532/1390] Remove the old grouping rule for PJRT. 
PiperOrigin-RevId: 317170383 Change-Id: I96973a2d2cd0ca1fc786bc7500deb6b4fedd0534 --- tensorflow/core/profiler/utils/group_events.cc | 5 +---- tensorflow/core/profiler/utils/xplane_schema.cc | 4 ---- tensorflow/core/profiler/utils/xplane_schema.h | 3 --- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index be8dd506b0c..0772cff7b97 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -635,10 +635,7 @@ std::vector CreateInterThreadConnectInfoList() { {StatType::kStepId, StatType::kIterNum}}, {HostEventType::kKernelLaunch, HostEventType::kKernelExecute, - {StatType::kCorrelationId}}, - {HostEventType::kLocalExecutableExecuteOnLocalDevice, - HostEventType::kLocalExecutableExecute, - {StatType::kRunId}}}; + {StatType::kCorrelationId}}}; return connect_info_list; } diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index be53dcbdc01..5ca8326d72c 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -91,10 +91,6 @@ const HostEventTypeMap& GetHostEventTypeMap() { {"WhileOp-StartBody", kWhileOpStartBody}, {"ForOp", kForOp}, {"PartitionedCallOp", kPartitionedCallOp}, - // XLA related. - {"LocalExecutable::ExecuteOnLocalDevices", - kLocalExecutableExecuteOnLocalDevice}, - {"LocalExecutable::Execute", kLocalExecutableExecute}, // tf.data related. {"IteratorGetNextOp::DoCompute", kIteratorGetNextOp}, {"IteratorGetNextAsOptionalOp::DoCompute", kIteratorGetNextAsOptionalOp}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index a31814cef06..41774deaa59 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -81,9 +81,6 @@ enum HostEventType { kWhileOpStartBody, kForOp, kPartitionedCallOp, - // XLA related. - kLocalExecutableExecuteOnLocalDevice, - kLocalExecutableExecute, // tf.data related. kIteratorGetNextOp, kIteratorGetNextAsOptionalOp, From 8f700fb2e0da382f1e2e9630f56a7922a8799a59 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Thu, 18 Jun 2020 13:39:10 -0700 Subject: [PATCH 0533/1390] [XLA] Propagate memory spaces recursively inside nested fusions. PiperOrigin-RevId: 317171110 Change-Id: I65004edb7498acb2f3b4238d9afbbb5d3930aab5 --- .../xla/service/memory_space_propagation.cc | 80 +++++++--- .../xla/service/memory_space_propagation.h | 11 +- .../service/memory_space_propagation_test.cc | 148 ++++++++++++++++++ 3 files changed, 214 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc index 80eb4017477..2eb15b14eaf 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -29,36 +29,78 @@ StatusOr MemorySpacePropagation::Run(HloModule* module) { // Propagate the operand subshapes. 
for (int operand_idx = 0; operand_idx < instruction->operand_count(); ++operand_idx) { - modified |= - PropagateSubshapes(instruction->operand(operand_idx)->shape(), - instruction->fused_parameter(operand_idx)); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes( + instruction->operand(operand_idx)->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= Propagate(indexed_shape.index, + instruction->fused_parameter(operand_idx), + memory_space); + } } // Propagate output subshapes. - modified |= PropagateSubshapes(instruction->shape(), - instruction->fused_expression_root()); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(instruction->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= + Propagate(indexed_shape.index, + instruction->fused_expression_root(), memory_space); + } } } } return modified; } -bool MemorySpacePropagation::PropagateSubshapes( - const Shape& caller_shape, const HloInstruction* callee_instruction) const { +bool MemorySpacePropagation::Propagate(ShapeIndexView index, + const HloInstruction* callee_instruction, + int64 memory_space) const { bool modified = false; - for (const ShapeUtil::IndexedShape& indexed_shape : - ShapeUtil::GetLeafShapes(caller_shape)) { - int64 memory_space = indexed_shape.shape.layout().memory_space(); - const HloValue& value = dataflow_analysis_->GetUniqueValueAt( - callee_instruction, indexed_shape.index); + const HloValue& value = dataflow_analysis_->GetUniqueValueAt( + callee_instruction, index.ToShapeIndex()); - for (const HloPosition& position : value.positions()) { - Shape* shape = ShapeUtil::GetMutableSubshape( - position.instruction->mutable_shape(), position.index); - if (shape->layout().memory_space() != memory_space) { - shape->mutable_layout()->set_memory_space(memory_space); - modified = true; - } + for (const HloPosition& position : value.positions()) { + HloInstruction* instruction = position.instruction; + Shape* shape = ShapeUtil::GetMutableSubshape(instruction->mutable_shape(), + position.index); + if (shape->layout().memory_space() == memory_space) { + continue; + } + shape->mutable_layout()->set_memory_space(memory_space); + modified = true; + + // For fusion outputs, propagate the memory space to the fusion root. + if (instruction->opcode() == HloOpcode::kFusion) { + Propagate(position.index, instruction->fused_expression_root(), + memory_space); + } + + const HloInstruction* parent_fusion = + instruction->parent()->FusionInstruction(); + // For nested fusion roots, pop one level up and propagate the memory space + // to the output of the calling fusion instruction. + if (instruction == instruction->parent()->root_instruction() && + parent_fusion->parent()->IsFusionComputation()) { + Propagate(position.index, parent_fusion, memory_space); + } + + // For nested fusion parameters, pop one level up and propagate the memory + // space to the operand of the calling fusion instruction. + if (instruction->opcode() == HloOpcode::kParameter && + parent_fusion->parent()->IsFusionComputation()) { + const HloInstruction* fusion_operand = + parent_fusion->operand(instruction->parameter_number()); + Propagate(position.index, fusion_operand, memory_space); + } + } + + for (const HloUse& use : value.uses()) { + // For fusion uses, propagate the memory space to the fusion parameter. 
+ if (use.instruction->opcode() == HloOpcode::kFusion) { + modified |= Propagate( + use.operand_index, + use.instruction->fused_parameter(use.operand_number), memory_space); } } return modified; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h index 65a1dfd14a6..510e9e69f79 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.h +++ b/tensorflow/compiler/xla/service/memory_space_propagation.h @@ -31,12 +31,11 @@ class MemorySpacePropagation : public HloModulePass { StatusOr Run(HloModule* module) override; private: - // Given the caller shape (operand or output) and its corresponding - // insturction in the fused computation (parameter or root), propagates the - // memory space to all the subshapes in the callee side. Returns true if the - // module is modified. - bool PropagateSubshapes(const Shape& caller_shape, - const HloInstruction* callee_instruction) const; + // Given the shape index (operand or output) and its corresponding instruction + // in the fused computation (parameter or root), propagates the memory space + // in the callee side. Returns true if the module is modified. + bool Propagate(ShapeIndexView index, const HloInstruction* callee_instruction, + int64 memory_space) const; std::unique_ptr dataflow_analysis_; }; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc index 8d74958f6aa..de45af5a190 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc @@ -199,5 +199,153 @@ TEST_F(MemorySpacePropagationTest, TupleOutput) { EXPECT_EQ(module->Hash(), ref->Hash()); } +TEST_F(MemorySpacePropagationTest, NestedInputFusion) { + // Tests propagating the memory space to nested fusions on the input side. 
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)} parameter(0) + %fusion.1 = s32[6]{0:T(128)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)S(1)} parameter(0) + %fusion.1 = s32[6]{0:T(128)S(1)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NestedOutputFusion) { + // Tests propagating the memory space to nested fusions on the output side. 
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)S(1)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + } // namespace } // namespace xla From fc5151130813140eb4189f77dd3a759c4077836d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 13:41:07 -0700 Subject: [PATCH 0534/1390] Delete orphaned comment The comment for `loader_spec_` stuck around even after it was removed. 
PiperOrigin-RevId: 317171521 Change-Id: Iddb6029fdad9cd5ef33bc4f4ea2653caed305658 --- tensorflow/compiler/xla/service/gpu/kernel_thunk.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 88351881f3a..25acabb239b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -77,8 +77,6 @@ class KernelThunk : public Thunk { // Will be set by IrEmitterUnnested. LaunchDimensions launch_dimensions_; - // Describes how to load this kernel. ExecuteOnStream reuses this loader - // specification for all executions. mutable tensorflow::mutex mutex_; // Loaded kernels for each `StreamExecutor`. Requires pointer stability of From 8452c9f80ee02cb71fb72f638d3bdef754f15297 Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Thu, 18 Jun 2020 13:58:25 -0700 Subject: [PATCH 0535/1390] Added status_group fuzzer --- .../security/fuzzing/status_group_fuzz.cc | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 tensorflow/security/fuzzing/status_group_fuzz.cc diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc new file mode 100644 index 00000000000..979fd444b48 --- /dev/null +++ b/tensorflow/security/fuzzing/status_group_fuzz.cc @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "tensorflow/core/platform/status.h" + +#include + +// This is a fuzzer for `tensorflow::StatusGroup`. Since `Status` is used almost +// everywhere, we need to ensure that the common functionality is safe. We don't +// expect many crashes from this fuzzer + +namespace { + +tensorflow::error::Code BuildRandomErrorCode(uint32_t code){ + + // We cannot build a `Status` with error_code of 0 and a message, so force + // error code to be non-zero. 
+ if (code == 0) { + return tensorflow::error::UNKNOWN; + } + + return static_cast(code); +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + tensorflow::error::Code error_code; + + std::string error_message = "ERROR"; + + tensorflow::Status s, derived_s; + + tensorflow::StatusGroup sg; + + bool is_derived; + + uint32_t code; + + FuzzedDataProvider fuzzed_data(data, size); + + while(fuzzed_data.remaining_bytes() > 0) { + code = fuzzed_data.ConsumeIntegral(); + + error_code = BuildRandomErrorCode(code); + + is_derived = fuzzed_data.ConsumeBool(); + + s = tensorflow::Status(error_code, error_message); + + if(is_derived) { + derived_s = tensorflow::StatusGroup::MakeDerived(s); + + sg.Update(derived_s); + + } else { + sg.Update(s); + + } + } + + sg.as_summary_status(); + + sg.as_concatenated_status(); + + sg.AttachLogMessages(); + + return 0; +} + +} // namespace From c41f4652b45bf70f20686e612b41574b4b8139d7 Mon Sep 17 00:00:00 2001 From: Marissa Ikonomidis Date: Thu, 18 Jun 2020 13:52:21 -0700 Subject: [PATCH 0536/1390] Add an option to enable MLIR bridge for tpu_py_test rule If enable_mlir_bridge is True, a new test will be generated that runs with the MLIR bridge enabled. This option is off by default. PiperOrigin-RevId: 317173675 Change-Id: I332e1ae24cf82fceea20fd0aff2cec7c9b236a24 --- tensorflow/core/platform/default/distribute.bzl | 3 +++ tensorflow/python/framework/test_util.py | 3 +++ tensorflow/python/tpu/tpu.bzl | 2 ++ 3 files changed, 8 insertions(+) diff --git a/tensorflow/core/platform/default/distribute.bzl b/tensorflow/core/platform/default/distribute.bzl index 46a5d826a79..b16d5e8cff7 100644 --- a/tensorflow/core/platform/default/distribute.bzl +++ b/tensorflow/core/platform/default/distribute.bzl @@ -22,6 +22,7 @@ def distribute_py_test( full_precision = False, disable_v2 = False, disable_v3 = False, + disable_mlir_bridge = True, **kwargs): """Generates py_test targets for CPU and GPU. @@ -40,6 +41,7 @@ def distribute_py_test( full_precision: unused. disable_v2: whether tests for TPU version 2 should be generated. disable_v3: whether tests for TPU version 3 should be generated. + disable_mlir_bridge: whether to also run this with the mlir bridge enabled. **kwargs: extra keyword arguments to the non-tpu test. """ @@ -77,6 +79,7 @@ def distribute_py_test( tags = tpu_tags, disable_v2 = disable_v2, disable_v3 = disable_v3, + disable_mlir_bridge = disable_mlir_bridge, ) register_extension_info( diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index a46bb7c9bda..8ddbcf34f3b 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -1933,6 +1933,9 @@ class TensorFlowTestCase(googletest.TestCase): # disable it here. pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(True) + if is_mlir_bridge_enabled(): + context.context().enable_mlir_bridge = True + self._threads = [] self._tempdir = None self._cached_session = None diff --git a/tensorflow/python/tpu/tpu.bzl b/tensorflow/python/tpu/tpu.bzl index 5453702d64d..3c26d9b49bf 100644 --- a/tensorflow/python/tpu/tpu.bzl +++ b/tensorflow/python/tpu/tpu.bzl @@ -25,6 +25,7 @@ def tpu_py_test( disable_v2 = False, disable_v3 = False, disable_experimental = False, + disable_mlir_bridge = True, args = [], **kwargs): """Generates identical unit test variants for various Cloud TPU versions. @@ -37,6 +38,7 @@ def tpu_py_test( disable_v2: If true, don't generate TPU v2 tests. disable_v3: If true, don't generate TPU v3 tests. 
disable_experimental: Unused. + disable_mlir_bridge: Unused. args: Arguments to apply to tests. **kwargs: Additional named arguments to apply to tests. """ From dda51e1c94160b8252c51dd0ddca445d821ba8b9 Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Thu, 18 Jun 2020 13:59:11 -0700 Subject: [PATCH 0537/1390] Added status group fuzzer build rules --- tensorflow/security/fuzzing/BUILD | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/security/fuzzing/BUILD b/tensorflow/security/fuzzing/BUILD index 9b5aeec2d36..871baa0055b 100644 --- a/tensorflow/security/fuzzing/BUILD +++ b/tensorflow/security/fuzzing/BUILD @@ -19,6 +19,14 @@ tf_fuzz_target( ], ) +tf_fuzz_target( + name = "status_group_fuzz", + srcs = ["status_group_fuzz.cc"], + deps = [ + "//tensorflow/core/platform:status", + ], +) + # A trivial fuzzer with no pre-specified corpus. # TODO(mihaimaruseac): Move fuzz_session and the op fuzzers to a subdirectory tf_fuzz_target( From ef52b4e0886b7212471462643e92e98bea0253be Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 18 Jun 2020 13:53:10 -0700 Subject: [PATCH 0538/1390] [XLA/Client] Implement LocalClient::Run which supports buffer donation PiperOrigin-RevId: 317173848 Change-Id: If92955ac5051376fbf0932b773b675459497c0c4 --- .../compiler/xla/client/local_client.cc | 47 +++++++++++++++---- tensorflow/compiler/xla/client/local_client.h | 9 ++++ .../tests/multiple_devices_on_host_test.cc | 3 +- tensorflow/compiler/xla/tests/while_test.cc | 6 ++- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index afe115deda8..f71e8a2d56d 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -168,6 +168,26 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, return std::make_pair(service_options, std::move(stream)); } +StatusOr LocalExecutable::GetExecutableRunOptions( + absl::Span argument_shapes, + const ExecutableRunOptions& run_options) { + TF_ASSIGN_OR_RETURN(auto options_and_stream, + RunHelper(argument_shapes, run_options)); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + return options; +} + +template +static StatusOr BlockHostUntilDoneAfterAsyncCall( + se::Stream* stream, std::function()> async_callback) { + StatusOr result = async_callback(); + Status block_status = stream->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; +} + StatusOr LocalExecutable::Run( const absl::Span arguments, ExecutableRunOptions run_options) { @@ -176,15 +196,24 @@ StatusOr LocalExecutable::Run( for (const ShapedBuffer* const arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - TF_ASSIGN_OR_RETURN(auto options_and_stream, - RunHelper(argument_shapes, run_options)); - ExecutableRunOptions options = options_and_stream.first.run_options(); - options.set_device_ordinal(-1); - auto result = RunAsync(arguments, options); - Status block_status = options.stream()->BlockHostUntilDone(); - TF_RETURN_IF_ERROR(result.status()); - TF_RETURN_IF_ERROR(block_status); - return result; + TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, + GetExecutableRunOptions(argument_shapes, run_options)); + return BlockHostUntilDoneAfterAsyncCall( + options.stream(), [&] { return RunAsync(arguments, options); }); +} + +StatusOr LocalExecutable::Run( + std::vector arguments, ExecutableRunOptions 
run_options) { + std::vector argument_shapes; + argument_shapes.reserve(arguments.size()); + for (const ExecutionInput& arg : arguments) { + argument_shapes.push_back(&arg.shape()); + } + TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, + GetExecutableRunOptions(argument_shapes, run_options)); + return BlockHostUntilDoneAfterAsyncCall( + options.stream(), + [&] { return RunAsync(argument_shapes, std::move(arguments), options); }); } static std::shared_ptr DumpArguments( diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 7cdeb9dcbf6..b00f5cc6801 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -51,6 +51,11 @@ class LocalExecutable { const absl::Span arguments, ExecutableRunOptions run_options); + // Similar to Run(), but allows for donating argument buffers to the + // executable. + StatusOr Run(std::vector arguments, + ExecutableRunOptions run_options); + // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. StatusOr RunAsync( @@ -85,6 +90,10 @@ class LocalExecutable { const absl::Span argument_shapes, ExecutableRunOptions run_options); + StatusOr GetExecutableRunOptions( + absl::Span argument_shapes, + const ExecutableRunOptions& run_options); + // The ordinal of the device which this executable was compiled for. The // executable can run on all equivalent devices (as determined by // Backend::devices_equivalent). diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc index 2b19aaded9c..2231fc6feab 100644 --- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc +++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc @@ -45,7 +45,8 @@ void CompileAndExecute( xla::ClientLibrary::GetXlaService(client->platform()) ->backend() .memory_allocator()); - StatusOr result = executable->Run({}, execute_options); + StatusOr result = + executable->Run(absl::Span(), execute_options); { absl::MutexLock lock(results_mutex); results->emplace_back(device_ordinal, std::move(result)); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index d575bbb1f3e..8e8c3605cc7 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1324,14 +1324,16 @@ void BM_WhileLoop(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } } From 3833402726d72d04dd2821e89c642f613e80a531 Mon Sep 17 00:00:00 2001 From: jonah-kohn <51345541+jonah-kohn@users.noreply.github.com> Date: Thu, 18 Jun 2020 14:28:47 -0700 Subject: [PATCH 0539/1390] Cast optimizer parameters as python floats during serialization. Accounted only for the case in which the hyper parameter is neither a callable nor a tensor, to avoid any troublesome casts. 
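A minimal sketch of the intended effect (illustrative only, not part of the patch): a hyperparameter supplied as a numpy scalar now comes back from get_config() as a plain Python float.

import numpy as np
import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=np.float32(0.001))
config = opt.get_config()
# Before this change the serialized value could remain a np.float32; with the
# float() cast it is a plain Python float, so e.g. json.dumps(config) does not
# trip over a numpy scalar.
assert isinstance(config["learning_rate"], float)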
--- tensorflow/python/keras/optimizer_v2/optimizer_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index c55b332bfc0..d8992bbe3e0 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -910,7 +910,7 @@ class OptimizerV2(trackable.Trackable): return value() if tensor_util.is_tensor(value): return backend.get_value(value) - return value + return float(value) def variables(self): """Returns variables of this Optimizer based on the order created.""" From bc1c0e86a677d9b1e5d3e3f0da85c445c2a7efe2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 14:24:06 -0700 Subject: [PATCH 0540/1390] Wrap save/restore logic in tf.function when in eager mode. This allows parallel saving and restoring when using multiple devices. PiperOrigin-RevId: 317180143 Change-Id: Icdc740d02beb7c2d3236191add3b72fa103fc134 --- .../grappler/optimizers/function_optimizer.cc | 8 +- .../parallel_device/parallel_device_test.py | 4 - .../python/framework/auto_control_deps.py | 2 +- tensorflow/python/training/saving/BUILD | 1 - .../training/saving/functional_saver.py | 111 ++++++------------ .../training/saving/functional_saver_test.py | 17 +-- 6 files changed, 42 insertions(+), 101 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index 0e156aaa84c..a66e645e04b 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -837,6 +837,7 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { "ParameterizedTruncatedNormal", "TruncatedNormal", "RandomShuffle", "Multinomial", "RandomGamma", "RandomGammaGrad", "RandomPoisson", "RandomPoissonV2", + // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) // ReadVariableOp marked as stateful because it consumes DT_RESOURCE, // but it can't generate any observable side-effect. @@ -850,12 +851,7 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { // the same device_ordinal on the same host. "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch", - - // SaveV2 and RestoreV2 should be allowed to operate in parallel on - // multiple hosts. 
- "SaveV2", "RestoreV2"}); - // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) + "EnqueueTPUEmbeddingRaggedTensorBatch"}); return exemption->contains(op); } diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index 1429c522aba..8fc3dcb5816 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -172,8 +172,6 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): config.set_synchronous_execution(previous) def test_checkpointing(self): - self.skipTest( - "Disable saving until SaveableObject's methods are traceable.") prefix = os.path.join(self.get_temp_dir(), "ckpt") with self.device.scope(): different_values = self.device.pack( @@ -265,8 +263,6 @@ class LayerTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[1], final_kernels[1].backing_device) def test_training_loop(self): - self.skipTest( - "Disable saving until SaveableObject's methods are traceable.") for _ in range(5): layer = _Dense(5) checkpoint = tracking.Checkpoint(layer=layer) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 4b47735e0bf..51dcb248b11 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -100,7 +100,7 @@ _ORDER_INSENSITIVE_STATEFUL_OPS = [ "CudnnRNNV2", "CudnnRNNV3", "CudnnRNNBackpropV2", "CudnnRNNBackpropV3", "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch", "RestoreV2", "SaveV2" + "EnqueueTPUEmbeddingRaggedTensorBatch" ] # LINT.ThenChange(//tensorflow/core/grappler/optimizers/function_optimizer.cc) diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD index 12940840309..670a4c35c6f 100644 --- a/tensorflow/python/training/saving/BUILD +++ b/tensorflow/python/training/saving/BUILD @@ -43,7 +43,6 @@ cuda_py_test( ":checkpoint_options", ":functional_saver", ":saveable_hook", - "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", ], ) diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py index 3a9b565470d..c4334e096df 100644 --- a/tensorflow/python/training/saving/functional_saver.py +++ b/tensorflow/python/training/saving/functional_saver.py @@ -21,7 +21,6 @@ from __future__ import print_function import uuid from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -162,8 +161,7 @@ class MultiDeviceSaver(object): self._after_restore_callbacks.append(saveable.after_restore) if is_saveable: - host_device = saveable_object_util.set_cpu0(saveable.device) - saveables_by_device.setdefault(host_device, []).append(saveable) + saveables_by_device.setdefault(saveable.device, []).append(saveable) self._single_device_savers = { device: _SingleDeviceSaver(saveables) @@ -249,50 +247,33 @@ class MultiDeviceSaver(object): tmp_checkpoint_prefix = string_ops.string_join( [file_prefix, sharded_suffix]) - def save_fn(): - num_shards = len(self._single_device_savers) - sharded_saves = [] - sharded_prefixes = [] - num_shards_tensor = 
constant_op.constant(num_shards, name="num_shards") - last_device = None - for shard, (device, saver) in enumerate( - sorted(self._single_device_savers.items())): - last_device = device - with ops.device(saveable_object_util.set_cpu0(device)): - shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, - num_shards_tensor) - sharded_prefixes.append(shard_prefix) - with ops.device(device): - # _SingleDeviceSaver will use the CPU device when necessary, but - # initial read operations should be placed on the SaveableObject's - # device. - sharded_saves.append(saver.save(shard_prefix, options)) + num_shards = len(self._single_device_savers) + sharded_saves = [] + sharded_prefixes = [] + num_shards_tensor = constant_op.constant(num_shards, name="num_shards") + last_device = None + for shard, (device, saver) in enumerate( + sorted(self._single_device_savers.items())): + last_device = device + with ops.device(saveable_object_util.set_cpu0(device)): + shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, + num_shards_tensor) + sharded_prefixes.append(shard_prefix) + with ops.device(device): + # _SingleDeviceSaver will use the CPU device when necessary, but initial + # read operations should be placed on the SaveableObject's device. + sharded_saves.append(saver.save(shard_prefix, options)) - with ops.control_dependencies(sharded_saves): - # Merge on the io_device if specified, otherwise co-locates the merge op - # with the last device used. - merge_device = ( - options.experimental_io_device or - saveable_object_util.set_cpu0(last_device)) - with ops.device(merge_device): - # V2 format write path consists of a metadata merge step. Once - # merged, attempts to delete the temporary directory, - # "_temp". - return gen_io_ops.merge_v2_checkpoints( - sharded_prefixes, file_prefix, delete_old_dirs=True) - - # Since this will causes a function re-trace on each save, limit this to the - # cases where it is needed: eager and when there are multiple tasks/single - # device savers. Note that the retrace is needed to ensure we pickup the - # latest values of options like experimental_io_device. - if context.executing_eagerly() and len(self._single_device_savers) > 1: - # Explicitly place the identity op on the first device. - @def_function.function(experimental_compile=False) - def tf_function_save(): - save_fn() - tf_function_save() - else: - return save_fn() + with ops.control_dependencies(sharded_saves): + # Merge on the io_device if specified, otherwise co-locates the merge op + # with the last device used. + merge_device = (options.experimental_io_device or + saveable_object_util.set_cpu0(last_device)) + with ops.device(merge_device): + # V2 format write path consists of a metadata merge step. Once merged, + # attempts to delete the temporary directory, "_temp". + return gen_io_ops.merge_v2_checkpoints( + sharded_prefixes, file_prefix, delete_old_dirs=True) def restore(self, file_prefix, options=None): """Restore the saveable objects from a checkpoint with `file_prefix`. @@ -306,38 +287,12 @@ class MultiDeviceSaver(object): A dictionary mapping from SaveableObject names to restore operations. """ options = options or checkpoint_options.CheckpointOptions() - - def restore_fn(): - restore_ops = {} - # Sort by device name to avoid propagating non-deterministic dictionary - # ordering in some Python versions. 
- for device, saver in sorted(self._single_device_savers.items()): - with ops.device(device): - restore_ops.update(saver.restore(file_prefix, options)) - - return restore_ops - - # Since this will causes a function re-trace on each save, limit this to the - # cases where it is needed: eager and when there are multiple tasks/single - # device savers. Note that the retrace is needed to ensure we pickup the - # latest values of options like experimental_io_device. - if context.executing_eagerly() and len(self._single_device_savers) > 1: - first_device, _ = list(self._single_device_savers.items())[0] - @def_function.function(experimental_compile=False) - def tf_function_restore(): - restore_ops = restore_fn() - restore_tensors = {} - # tf.functions must return tensors, thus we use control dependencies so - # that we can return a tensor which depends on the given op. - with ops.device(saveable_object_util.set_cpu0(first_device)): - for name, op in restore_ops.items(): - with ops.control_dependencies([op]): - restore_tensors[name] = array_ops.identity(file_prefix) - return restore_tensors - - restore_ops = tf_function_restore() - else: - restore_ops = restore_fn() + restore_ops = {} + # Sort by device name to avoid propagating non-deterministic dictionary + # ordering in some Python versions. + for device, saver in sorted(self._single_device_savers.items()): + with ops.device(device): + restore_ops.update(saver.restore(file_prefix, options)) for callback in self._after_restore_callbacks: callback() diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py index 8f3eef4fb9c..7db32ff72d7 100644 --- a/tensorflow/python/training/saving/functional_saver_test.py +++ b/tensorflow/python/training/saving/functional_saver_test.py @@ -21,7 +21,6 @@ from __future__ import print_function import os from tensorflow.python.eager import context -from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.eager import wrap_function from tensorflow.python.framework import config @@ -30,7 +29,6 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import gfile -from tensorflow.python.training import server_lib from tensorflow.python.training.saving import checkpoint_options from tensorflow.python.training.saving import functional_saver from tensorflow.python.training.saving import saveable_hook @@ -128,16 +126,13 @@ class SaverTest(test.TestCase): second_saver.restore(save_path) self.assertEqual(2., self.evaluate(v2)) - def test_checkpoint_is_sharded_by_task(self): - servers = [server_lib.Server.create_local_server() for _ in range(3)] - cluster_spec = server_lib.ClusterSpec({ - "worker": [s.target[len("grpc://"):] for s in servers]}) - remote.connect_to_cluster(cluster_spec) - with ops.device("/job:worker/task:0/cpu:0"): + @test_util.run_in_graph_and_eager_modes + def test_checkpoint_is_sharded_by_device(self): + with ops.device("cpu:0"): v0 = resource_variable_ops.ResourceVariable(0.) - with ops.device("/job:worker/task:1/cpu:0"): + with ops.device("cpu:1"): v1 = resource_variable_ops.ResourceVariable(1.) - with ops.device("/job:worker/task:2/cpu:0"): + with ops.device("cpu:2"): v2 = resource_variable_ops.ResourceVariable(2.) 
self.evaluate([v0.initializer, v1.initializer, v2.initializer]) @@ -172,7 +167,7 @@ class SaverTest(test.TestCase): list(saveable_object_util.saveable_objects_for_op(v2, "v2"))) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate(saver.save(constant_op.constant(prefix), self.local_options)) - self.assertEqual(2, len(gfile.Glob(prefix + "*"))) + self.assertEqual(4, len(gfile.Glob(prefix + "*"))) self.evaluate(v0.assign(-1.)) self.evaluate(v1.assign(-1.)) self.evaluate(v2.assign(-1.)) From cb6e1ed5d8a406861398c428ca5fd6b84b439357 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 14:28:42 -0700 Subject: [PATCH 0541/1390] Return `debug_string` when creating CompilationCacheKey. PiperOrigin-RevId: 317181056 Change-Id: I02198244c1c3749ff1ecf4e0647b8daa80dd868c --- tensorflow/core/tpu/kernels/BUILD | 16 ++ .../kernels/tpu_compilation_cache_external.cc | 127 --------------- .../kernels/tpu_compilation_cache_external.h | 8 - .../core/tpu/kernels/tpu_compile_c_api.h | 19 ++- tensorflow/core/tpu/kernels/tpu_op_util.cc | 151 ++++++++++++++++++ tensorflow/core/tpu/kernels/tpu_op_util.h | 40 +++++ 6 files changed, 223 insertions(+), 138 deletions(-) create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.cc create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 94d3c8edf2b..9d38eb71f3c 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -405,6 +405,22 @@ cc_library( alwayslink = True, ) +cc_library( + name = "tpu_op_util", + srcs = ["tpu_op_util.cc"], + hdrs = ["tpu_op_util.h"], + deps = [ + ":tpu_compilation_cache_key", + ":tpu_compile_c_api_hdrs", + ":tpu_mesh_state_interface", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "tpu_util", srcs = ["tpu_util.cc"], diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc index 8cee90e8e55..c4442fc95d5 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc @@ -49,70 +49,6 @@ void PopulateEntry(const std::string& key, CompiledSubgraph* entry, absl::make_unique(std::move(tpu_program_group)); entry->initialized = true; } - -// Return fingerprint_in_metadata if it's not empty; otherwise read input tensor -// data to compute the fingerprint. -std::string GuaranteedConstFingerprint( - const string& fingerprint_in_metadata, - const OpInputList& guaranteed_constants) { - if (fingerprint_in_metadata.empty()) { - uint64_t fingerprint = 0; - for (const auto& constant : guaranteed_constants) { - fingerprint = TpuCompile_CreateGuaranteedConstFingerprint( - fingerprint, constant.tensor_data().data(), - constant.tensor_data().size()); - } - return std::to_string(fingerprint); - } else { - return fingerprint_in_metadata; - } -} - -std::string CreateShapePrefix( - const std::vector& dynamic_shapes) { - std::string shapes_prefix; - for (const TensorShape& shape : dynamic_shapes) { - for (int64 size : shape.dim_sizes()) { - absl::StrAppend(&shapes_prefix, size, ","); - } - absl::StrAppend(&shapes_prefix, ";"); - } - return shapes_prefix; -} - -// Include compilation configurations of the arguments that are not captured -// by the called graph. 
-std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { - std::string config_prefix; - for (const auto& arg : metadata.args()) { - if (arg.is_same_data_across_replicas()) { - absl::StrAppend(&config_prefix, ":s"); - // Same. - } else { - // Different. - absl::StrAppend(&config_prefix, ":"); - } - if (arg.enable_xla_sharding() == - tpu::TPUCompileMetadataProto::Arg::ALLOWED) { - // Enabled. - absl::StrAppend(&config_prefix, "e"); - } - if (arg.unrestricted_layout()) { - // Unrestricted. - absl::StrAppend(&config_prefix, ":u"); - } - absl::StrAppend(&config_prefix, ",type(", arg.dtype(), ")"); - if (arg.has_shape()) { - absl::StrAppend(&config_prefix, ",shape("); - for (const auto& dim : arg.shape().dim()) { - absl::StrAppend(&config_prefix, dim.size(), ","); - } - absl::StrAppend(&config_prefix, ")"); - } - } - return config_prefix; -} - } // namespace TpuCompilationCacheExternal::EntryRefImpl::EntryRefImpl( @@ -196,68 +132,5 @@ CompiledSubgraph* TpuCompilationCacheExternal::InitializeEntry( marked_for_eviction_size_ += main_entry->total_size; return main_entry; } - -/*static*/ TpuCompilationCacheKey -TpuCompilationCacheExternal::CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - absl::string_view mlir_module, - const tensorflow::OpInputList& guaranteed_constants, - const std::vector& dynamic_shapes, - const tensorflow::tpu::TPUCompileMetadataProto& metadata, - const TpuMeshStateInterface& mesh_state) { - VLOG(1) << "FunctionLibraryFingerprint:" << function_library_fingerprint; - std::string shapes_prefix = CreateShapePrefix(dynamic_shapes); - VLOG(1) << "shapes_prefix = " << shapes_prefix; - std::string config_prefix = CreateConfigPrefix(metadata); - VLOG(1) << "config_prefix = " << config_prefix; - std::vector flattened_device_ids; - if (metadata.has_device_assignment()) { - for (const auto& device : - metadata.device_assignment().computation_devices()) { - flattened_device_ids.insert(flattened_device_ids.end(), - device.replica_device_ids().begin(), - device.replica_device_ids().end()); - } - } - // TODO(henrytan): return the debug_string. - const char* prefix = - TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty{ - config_prefix.data(), - shapes_prefix.data(), - function_name.data(), - mlir_module.data(), - flattened_device_ids.data(), - flattened_device_ids.size(), - guaranteed_constants.size(), - function_library_fingerprint, - metadata.num_cores_per_replica(), - metadata.num_replicas(), - mesh_state.data(), - }); - auto buffer_cleanup = gtl::MakeCleanup([prefix]() { delete[] prefix; }); - TpuCompilationCacheKey key; - key.prefix = prefix; - - // Guaranteed constants can be different across sessions. Use session_handle - // and guaranteed_const fingerprint to guarantee no collision. - if (guaranteed_constants.size() > 0) { - key.has_guaranteed_const = true; - key.session_handle = metadata.session_handle(); - // Both `metadata` and `guaranteed_constants` lifetime are captured by - // reference based on the assumption that these variables lifetime is - // managed through the `TPUCompileOpKernelImpl` that outlives the - // lifetime of the compilation cache lookups. 
- string fingerprint; - key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, - fingerprint]() mutable { - if (fingerprint.empty()) { - fingerprint = GuaranteedConstFingerprint( - metadata.guaranteed_const_fingerprint(), guaranteed_constants); - } - return fingerprint; - }; - } - return key; -} } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h index 2c75cb4d053..fe251326a43 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h @@ -63,14 +63,6 @@ class TpuCompilationCacheExternal : public TpuCompilationCacheInterface { explicit TpuCompilationCacheExternal(int64 max_cache_size) : TpuCompilationCacheInterface(max_cache_size) {} - static TpuCompilationCacheKey CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - absl::string_view mlir_module, - const tensorflow::OpInputList& guaranteed_constants, - const std::vector& dynamic_shapes, - const tensorflow::tpu::TPUCompileMetadataProto& metadata, - const TpuMeshStateInterface& mesh_state); - string DebugString() const override { return "TpuCompilationCacheExternal"; } private: diff --git a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h index d1546ed9610..c101e489d56 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h @@ -42,6 +42,13 @@ struct CompilationCacheKeyProperty { const XLA_TpuMeshState* mesh_state; }; +// Compilation cache key result returning both the key and a more verbose debug +// version. +struct CompilationCacheKeyResult { + const char* key; + const char* debug_string; +}; + extern "C" { // Returns the number of available TPU core count. @@ -49,9 +56,14 @@ TFTPU_CAPI_EXPORT int TpuTopology_AvailableCoreCount( const XLA_TpuMeshState* mesh_state, TpuCoreTypeEnum tpu_core_type); // Creates a unique compilation cache `key` used for `put` and `get` operations. -// Returned buffer is heap-allocated and must be owned. -TFTPU_CAPI_EXPORT const char* TpuCompile_CreateCompilationCacheKey( - CompilationCacheKeyProperty property); +// Returned buffers are heap-allocated and must be owned. +TFTPU_CAPI_EXPORT CompilationCacheKeyResult +TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty property); + +// Destroys the CompilationCacheKeyResult returned by calling the +// `TpuCompile_CreateCompilationCacheKey` API. +TFTPU_CAPI_EXPORT void TpuCompile_DestroyCompilationCacheKey( + CompilationCacheKeyResult result); // Creates a guaranteed const fingerprint. Guarantee const is normally used in // TPU inference to avoid re-copying unchanged variables onto the TPU device. 
@@ -75,6 +87,7 @@ TFTPU_CAPI_EXPORT void TpuCompile_BuildXLADeviceAssignment( struct TfTpu_CompileApiFn { TFTPU_ADD_FN_IN_STRUCT(TpuTopology_AvailableCoreCount); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateCompilationCacheKey); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_DestroyCompilationCacheKey); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateGuaranteedConstFingerprint); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CompileAheadOfTime); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_BuildXLADeviceAssignment); diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.cc b/tensorflow/core/tpu/kernels/tpu_op_util.cc new file mode 100644 index 00000000000..e2f717fea8b --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_util.cc @@ -0,0 +1,151 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/tpu/kernels/tpu_op_util.h" + +#include + +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" + +namespace tensorflow { +namespace tpu { +namespace { +// Return fingerprint_in_metadata if it's not empty; otherwise read input tensor +// data to compute the fingerprint. +std::string GuaranteedConstFingerprint( + const string& fingerprint_in_metadata, + const OpInputList& guaranteed_constants) { + if (fingerprint_in_metadata.empty()) { + uint64_t fingerprint = 0; + for (const auto& constant : guaranteed_constants) { + fingerprint = TpuCompile_CreateGuaranteedConstFingerprint( + fingerprint, constant.tensor_data().data(), + constant.tensor_data().size()); + } + return std::to_string(fingerprint); + } else { + return fingerprint_in_metadata; + } +} + +std::string CreateShapePrefix( + const std::vector& dynamic_shapes) { + std::string shapes_prefix; + for (const TensorShape& shape : dynamic_shapes) { + for (int64 size : shape.dim_sizes()) { + absl::StrAppend(&shapes_prefix, size, ","); + } + absl::StrAppend(&shapes_prefix, ";"); + } + return shapes_prefix; +} + +// Include compilation configurations of the arguments that are not captured +// by the called graph. +std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { + std::string config_prefix; + for (const auto& arg : metadata.args()) { + if (arg.is_same_data_across_replicas()) { + absl::StrAppend(&config_prefix, ":s"); + // Same. + } else { + // Different. + absl::StrAppend(&config_prefix, ":"); + } + if (arg.enable_xla_sharding() == + tpu::TPUCompileMetadataProto::Arg::ALLOWED) { + // Enabled. + absl::StrAppend(&config_prefix, "e"); + } + if (arg.unrestricted_layout()) { + // Unrestricted. 
+ absl::StrAppend(&config_prefix, ":u"); + } + absl::StrAppend(&config_prefix, ",type(", arg.dtype(), ")"); + if (arg.has_shape()) { + absl::StrAppend(&config_prefix, ",shape("); + for (const auto& dim : arg.shape().dim()) { + absl::StrAppend(&config_prefix, dim.size(), ","); + } + absl::StrAppend(&config_prefix, ")"); + } + } + return config_prefix; +} +} // namespace + +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const OpInputList& guaranteed_constants, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state) { + VLOG(1) << "FunctionLibraryFingerprint:" << function_library_fingerprint; + std::string shapes_prefix = CreateShapePrefix(dynamic_shapes); + VLOG(1) << "shapes_prefix = " << shapes_prefix; + std::string config_prefix = CreateConfigPrefix(metadata); + VLOG(1) << "config_prefix = " << config_prefix; + std::vector flattened_device_ids; + if (metadata.has_device_assignment()) { + for (const auto& device : + metadata.device_assignment().computation_devices()) { + flattened_device_ids.insert(flattened_device_ids.end(), + device.replica_device_ids().begin(), + device.replica_device_ids().end()); + } + } + CompilationCacheKeyResult result = + TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty{ + config_prefix.data(), + shapes_prefix.data(), + function_name.data(), + mlir_module.data(), + flattened_device_ids.data(), + flattened_device_ids.size(), + guaranteed_constants.size(), + function_library_fingerprint, + metadata.num_cores_per_replica(), + metadata.num_replicas(), + mesh_state.data(), + }); + auto buffer_cleanup = gtl::MakeCleanup( + [result]() { TpuCompile_DestroyCompilationCacheKey(result); }); + TpuCompilationCacheKey key; + key.prefix = result.key; + key.debug_string = result.debug_string; + + // Guaranteed constants can be different across sessions. Use session_handle + // and guaranteed_const fingerprint to guarantee no collision. + if (guaranteed_constants.size() > 0) { + key.has_guaranteed_const = true; + key.session_handle = metadata.session_handle(); + // Both `metadata` and `guaranteed_constants` lifetime are captured by + // reference based on the assumption that these variables lifetime is + // managed through the `TPUCompileOpKernelImpl` that outlives the + // lifetime of the compilation cache lookups. + string fingerprint; + key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, + fingerprint]() mutable { + if (fingerprint.empty()) { + fingerprint = GuaranteedConstFingerprint( + metadata.guaranteed_const_fingerprint(), guaranteed_constants); + } + return fingerprint; + }; + } + return key; +} +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.h b/tensorflow/core/tpu/kernels/tpu_op_util.h new file mode 100644 index 00000000000..0a9657ca05e --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_util.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" + +namespace tensorflow { +namespace tpu { +// Creates a unique compilation cache `key`. +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const OpInputList& guaranteed_constants, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state); +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ From 9d33f296d1edc2f656e253cf2a015d36daedd5c1 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 15:00:50 -0700 Subject: [PATCH 0542/1390] Prep change for publishing TPU Ops. PiperOrigin-RevId: 317188030 Change-Id: I29f9236c0ade6bf586c8a52ead977b5d31aec357 --- tensorflow/core/tpu/kernels/BUILD | 11 +++ .../tpu_compilation_cache_entry_unloader.h | 69 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 9d38eb71f3c..a41747ee8c5 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -450,3 +450,14 @@ cc_library( hdrs = ["tpu_compile_op.h"], deps = ["//tensorflow/core:framework"], ) + +cc_library( + name = "tpu_compilation_cache_entry_unloader", + hdrs = ["tpu_compilation_cache_entry_unloader.h"], + deps = [ + ":tpu_compilation_cache_interface", + "//tensorflow/core:framework", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/synchronization", + ], +) diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h new file mode 100644 index 00000000000..c298d8fcc12 --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ + +#include "absl/container/flat_hash_set.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +namespace tpu { + +class TpuCompilationCacheEntryUnloader : public ResourceBase { + public: + explicit TpuCompilationCacheEntryUnloader(TpuCompilationCacheInterface* cache) + : cache_(cache) { + // Hold a reference to the cache until the unloader is destroyed. + cache_->Ref(); + VLOG(1) << "Will unload compilation cache entries when session closes."; + } + + ~TpuCompilationCacheEntryUnloader() override { + absl::MutexLock lock(&mu_); + for (int64 uid : cache_entry_uids_) { + Status s = cache_->MarkEntryForEviction(uid); + if (!s.ok()) { + LOG(WARNING) << "MarkEntryForEviction in " + "~CompilationCacheEntryUnloader fails with error " + << s; + } + } + // Release our reference to the cache. + cache_->Unref(); + } + + // Add cache entry uid to be unloaded in destructor. + void AddCacheEntryUid(int64 uid) { + absl::MutexLock lock(&mu_); + cache_entry_uids_.insert(uid); + } + + std::string DebugString() const override { + return "CompilationCacheEntryUnloader"; + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TpuCompilationCacheEntryUnloader); + mutable absl::Mutex mu_; + TpuCompilationCacheInterface* cache_; // Not owned. + absl::flat_hash_set cache_entry_uids_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ From 35b978db57eaa87f32e7c9c3e9a7c323e595c978 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Thu, 18 Jun 2020 15:00:53 -0700 Subject: [PATCH 0543/1390] Move tfl-device-index-selector to TF directory. There's nothing lite-specific about this pass. 
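Apart from the directory move, the only user-visible changes are the pass namespace and the registered flag name. As a rough, illustrative sketch (both lines are taken from the updated pipeline and test in this change, not a new API):

    // C++ pipeline registration now lives under mlir::TF:
    pass_manager->addPass(mlir::TF::CreateDeviceIndexSelectorPass());

    // FileCheck tests invoke the renamed flag:
    // RUN: tf-opt --tf-device-index-selector %s | FileCheck %s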
PiperOrigin-RevId: 317188038 Change-Id: Iac9799e296e043aabf7aeabec2e8f72d07c77178 --- tensorflow/compiler/mlir/lite/BUILD | 1 - tensorflow/compiler/mlir/lite/tf_tfl_passes.cc | 2 +- tensorflow/compiler/mlir/lite/transforms/passes.h | 3 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../transforms/device_index_selector.cc | 12 ++++++------ .../compiler/mlir/tensorflow/transforms/passes.h | 3 +++ .../tests/tf_device_index_selector.mlir | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) rename tensorflow/compiler/mlir/{lite => tensorflow}/transforms/device_index_selector.cc (92%) rename tensorflow/compiler/{mlir/lite => tensorflow}/tests/tf_device_index_selector.mlir (94%) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8e9d615053c..8d4efeb3d60 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -314,7 +314,6 @@ tf_cc_test( cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ - "transforms/device_index_selector.cc", "transforms/dilated_conv.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 008098f62ba..fed2896035b 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -63,7 +63,7 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, standard_pipeline_options.enable_inliner = false; standard_pipeline_options.form_clusters = pass_config.form_clusters; mlir::TF::CreateTFStandardPipeline(*pass_manager, standard_pipeline_options); - pass_manager->addPass(mlir::TFL::CreateDeviceIndexSelectorPass()); + pass_manager->addPass(mlir::TF::CreateDeviceIndexSelectorPass()); if (pass_config.shape_inference) { pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 01e5eb1cb68..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,9 +91,6 @@ std::unique_ptr> CreateWhileOutlinePass(); // Verifies runtime constraints. std::unique_ptr> CreateRuntimeVerifyPass(); -// Creates function pass to select device index/fold tf.DeviceIndex. 
-std::unique_ptr> CreateDeviceIndexSelectorPass(); - } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 54e57512c32..7c0d427e87b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -475,6 +475,7 @@ cc_library( "transforms/cluster_outlining.cc", "transforms/collection_ops_util.cc", "transforms/decompose_resource_ops_pass.cc", + "transforms/device_index_selector.cc", "transforms/einsum.cc", "transforms/executor_island_coarsening.cc", "transforms/executor_tpuv1_inline_tpu_island.cc", diff --git a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc similarity index 92% rename from tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc rename to tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc index d4aed750dc8..550647a915a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc @@ -21,11 +21,11 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" namespace mlir { -namespace TFL { +namespace TF { namespace { // Folds the DeviceIndex op to a constant value. The DeviceIndex return the @@ -55,8 +55,8 @@ void DeviceIndexSelector::runOnOperation() { // Convert all the DeviceIndex ops to constant values. func.getBody().walk([](TF::DeviceIndexOp op) { // This just selects the default in all cases where DeviceIndex feeds into - // tf.Case. This could be enhanced based on explicit TFLite specification or - // TAC in future. + // tf.Case. This could be enhanced to have some sort of policy in the + // future. OpBuilder b(op); RankedTensorType type = RankedTensorType::get({}, b.getIntegerType(32)); int index = op.device_names().size(); @@ -79,7 +79,7 @@ std::unique_ptr> CreateDeviceIndexSelectorPass() { } static PassRegistration pass( - "tfl-device-index-selector", "Fold tf.DeviceIndex to constant"); + "tf-device-index-selector", "Fold tf.DeviceIndex to constant"); -} // namespace TFL +} // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index a34be28c809..168b317641d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -147,6 +147,9 @@ std::unique_ptr> CreateLegalizeHloToTfPass(); // generally used beyond exporting to runtimes that supports these ops. In the // future these fusions may be codegen'd automatically. std::unique_ptr> CreateFusedKernelMatcherPass(); + +// Creates function pass to select device index/fold tf.DeviceIndex. 
+std::unique_ptr> CreateDeviceIndexSelectorPass(); } // namespace TF namespace tf_executor { diff --git a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir similarity index 94% rename from tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir rename to tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir index 1ac7f30d644..7fc2b210f91 100644 --- a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir +++ b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir @@ -1,6 +1,6 @@ // Test DeviceIndex selector. -// RUN: tf-opt --tfl-device-index-selector %s | FileCheck %s +// RUN: tf-opt --tf-device-index-selector %s | FileCheck %s // CHECK-LABEL: func @select func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { From 834fe68f365e1d7f082b596fe87471ce84c2c8ec Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 18 Jun 2020 15:16:08 -0700 Subject: [PATCH 0544/1390] Optimized Arduino library by enabling precompilation Precompilation allows Arduino users to build their sketches much faster, but requires some support from the library properties to enable. This has recently been upgraded to suppor the 'full' mode, as shown in https://github.com/arduino/arduino-cli/pull/611, so we want to take advantage of this. PiperOrigin-RevId: 317191283 Change-Id: Ie44a31ba45105f65fdad0da487290aff5fa2a179 --- tensorflow/lite/micro/tools/make/templates/library.properties | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/templates/library.properties b/tensorflow/lite/micro/tools/make/templates/library.properties index e41fd8d8fbe..6e02748a0b4 100644 --- a/tensorflow/lite/micro/tools/make/templates/library.properties +++ b/tensorflow/lite/micro/tools/make/templates/library.properties @@ -7,4 +7,5 @@ paragraph=This library runs TensorFlow machine learning models on microcontrolle category=Data Processing url=https://www.tensorflow.org/lite/microcontrollers/overview ldflags=-lm -includes=TensorFlowLite.h \ No newline at end of file +includes=TensorFlowLite.h +precompiled=full From 852cde437fdd062f52c42e47344029897ee67afd Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Thu, 18 Jun 2020 15:27:56 -0700 Subject: [PATCH 0545/1390] Make error_message constant Co-authored-by: Mihai Maruseac --- tensorflow/security/fuzzing/status_group_fuzz.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc index 979fd444b48..5e2b7eec403 100644 --- a/tensorflow/security/fuzzing/status_group_fuzz.cc +++ b/tensorflow/security/fuzzing/status_group_fuzz.cc @@ -39,7 +39,7 @@ tensorflow::error::Code BuildRandomErrorCode(uint32_t code){ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { tensorflow::error::Code error_code; - std::string error_message = "ERROR"; + const std::string error_message = "ERROR"; tensorflow::Status s, derived_s; From 14e942faaa56b749c81c52595d04a9bb7f26fa02 Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Thu, 18 Jun 2020 15:31:26 -0700 Subject: [PATCH 0546/1390] Update spacing and variable declaration --- .../security/fuzzing/status_group_fuzz.cc | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc index 5e2b7eec403..52d83d00866 100644 --- 
a/tensorflow/security/fuzzing/status_group_fuzz.cc +++ b/tensorflow/security/fuzzing/status_group_fuzz.cc @@ -14,9 +14,7 @@ limitations under the License. ==============================================================================*/ #include #include - #include "tensorflow/core/platform/status.h" - #include // This is a fuzzer for `tensorflow::StatusGroup`. Since `Status` is used almost @@ -37,37 +35,22 @@ tensorflow::error::Code BuildRandomErrorCode(uint32_t code){ } extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - tensorflow::error::Code error_code; - const std::string error_message = "ERROR"; - - tensorflow::Status s, derived_s; - tensorflow::StatusGroup sg; - - bool is_derived; - - uint32_t code; - FuzzedDataProvider fuzzed_data(data, size); while(fuzzed_data.remaining_bytes() > 0) { - code = fuzzed_data.ConsumeIntegral(); + uint32_t code = fuzzed_data.ConsumeIntegral(); + tensorflow::error::Code error_code = BuildRandomErrorCode(code); + bool is_derived = fuzzed_data.ConsumeBool(); - error_code = BuildRandomErrorCode(code); - - is_derived = fuzzed_data.ConsumeBool(); - - s = tensorflow::Status(error_code, error_message); + tensorflow::Status s = tensorflow::Status(error_code, error_message); if(is_derived) { - derived_s = tensorflow::StatusGroup::MakeDerived(s); - + tensorflow::Status derived_s = tensorflow::StatusGroup::MakeDerived(s); sg.Update(derived_s); - } else { sg.Update(s); - } } From 71bbebbf4d04c1bcb6ed44e2156087c9fec06e9e Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin Date: Thu, 18 Jun 2020 15:32:15 -0700 Subject: [PATCH 0547/1390] Moved final StatusGroup method calls --- tensorflow/security/fuzzing/status_group_fuzz.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc index 52d83d00866..bc80cd72bc9 100644 --- a/tensorflow/security/fuzzing/status_group_fuzz.cc +++ b/tensorflow/security/fuzzing/status_group_fuzz.cc @@ -55,9 +55,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { } sg.as_summary_status(); - sg.as_concatenated_status(); - sg.AttachLogMessages(); return 0; From a82b75c82b63c4397b3d6a215e439ca77e687a84 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 18 Jun 2020 15:35:42 -0700 Subject: [PATCH 0548/1390] [XLA/Client] Implement LocalClient::Run which supports buffer donation PiperOrigin-RevId: 317195199 Change-Id: If4d35d0627fa068a0c2b522fdae52466abd21f51 --- .../compiler/xla/client/local_client.cc | 47 ++++--------------- tensorflow/compiler/xla/client/local_client.h | 9 ---- .../tests/multiple_devices_on_host_test.cc | 3 +- tensorflow/compiler/xla/tests/while_test.cc | 6 +-- 4 files changed, 12 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index f71e8a2d56d..afe115deda8 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -168,26 +168,6 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, return std::make_pair(service_options, std::move(stream)); } -StatusOr LocalExecutable::GetExecutableRunOptions( - absl::Span argument_shapes, - const ExecutableRunOptions& run_options) { - TF_ASSIGN_OR_RETURN(auto options_and_stream, - RunHelper(argument_shapes, run_options)); - ExecutableRunOptions options = options_and_stream.first.run_options(); - options.set_device_ordinal(-1); - return options; -} - -template -static StatusOr BlockHostUntilDoneAfterAsyncCall( - se::Stream* stream, std::function()> async_callback) { - StatusOr result = async_callback(); - Status block_status = stream->BlockHostUntilDone(); - TF_RETURN_IF_ERROR(result.status()); - TF_RETURN_IF_ERROR(block_status); - return result; -} - StatusOr LocalExecutable::Run( const absl::Span arguments, ExecutableRunOptions run_options) { @@ -196,24 +176,15 @@ StatusOr LocalExecutable::Run( for (const ShapedBuffer* const arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, - GetExecutableRunOptions(argument_shapes, run_options)); - return BlockHostUntilDoneAfterAsyncCall( - options.stream(), [&] { return RunAsync(arguments, options); }); -} - -StatusOr LocalExecutable::Run( - std::vector arguments, ExecutableRunOptions run_options) { - std::vector argument_shapes; - argument_shapes.reserve(arguments.size()); - for (const ExecutionInput& arg : arguments) { - argument_shapes.push_back(&arg.shape()); - } - TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, - GetExecutableRunOptions(argument_shapes, run_options)); - return BlockHostUntilDoneAfterAsyncCall( - options.stream(), - [&] { return RunAsync(argument_shapes, std::move(arguments), options); }); + TF_ASSIGN_OR_RETURN(auto options_and_stream, + RunHelper(argument_shapes, run_options)); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + auto result = RunAsync(arguments, options); + Status block_status = options.stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; } static std::shared_ptr DumpArguments( diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index b00f5cc6801..7cdeb9dcbf6 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -51,11 +51,6 @@ class LocalExecutable { const absl::Span arguments, ExecutableRunOptions run_options); - // Similar to Run(), but allows for donating argument buffers to the - // executable. 
- StatusOr Run(std::vector arguments, - ExecutableRunOptions run_options); - // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. StatusOr RunAsync( @@ -90,10 +85,6 @@ class LocalExecutable { const absl::Span argument_shapes, ExecutableRunOptions run_options); - StatusOr GetExecutableRunOptions( - absl::Span argument_shapes, - const ExecutableRunOptions& run_options); - // The ordinal of the device which this executable was compiled for. The // executable can run on all equivalent devices (as determined by // Backend::devices_equivalent). diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc index 2231fc6feab..2b19aaded9c 100644 --- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc +++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc @@ -45,8 +45,7 @@ void CompileAndExecute( xla::ClientLibrary::GetXlaService(client->platform()) ->backend() .memory_allocator()); - StatusOr result = - executable->Run(absl::Span(), execute_options); + StatusOr result = executable->Run({}, execute_options); { absl::MutexLock lock(results_mutex); results->emplace_back(device_ordinal, std::move(result)); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 8e8c3605cc7..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1324,16 +1324,14 @@ void BM_WhileLoop(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = - executable->Run(absl::Span(), options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } // Run benchmark. 
tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = - executable->Run(absl::Span(), options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } } From 40ef6a7ad67a973f971eae13b59b5f25777d037e Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Thu, 18 Jun 2020 22:43:22 +0000 Subject: [PATCH 0549/1390] Map args & attrs to types and use map to annotate --- tensorflow/python/framework/python_op_gen.cc | 149 ++++++++----------- 1 file changed, 59 insertions(+), 90 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 8c4d0f5b753..79c8800418c 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -47,6 +47,7 @@ constexpr char kEagerFallbackSuffix[] = "_eager_fallback"; std::unordered_map dtypes_map { {"_dtypes.float16", "_dtypes.Float16"}, + {"_dtypes.half", "_dtypes.Half"}, {"_dtypes.float32", "_dtypes.Float32"}, {"_dtypes.float64", "_dtypes.Float64"}, {"_dtypes.bfloat16", "_dtypes.BFloat16"}, @@ -162,7 +163,8 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { bool AddEagerFastPathAndGraphCode(const string& parameters, const std::vector& output_sizes, - const string& eager_not_allowed_error); + const string& eager_not_allowed_error, + std::unordered_map& type_map); bool AddEagerFallbackCode(const string& parameters, const std::vector& output_sizes, const string& num_outputs_expr, @@ -179,8 +181,11 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { void AddRawOpExport(const string& parameters); void GenerateTypeVars(); - string GetTypeAnnotatedParams(); - void AddReturnTypeAnnotation(); + + std::unordered_map GetTypeAnnotationMap(); + + void AddReturnTypeAnnotation(std::unordered_map& type_map); + void AddAttrForArg(const string& attr, int arg_index) { gtl::InsertIfNotPresent(&inferred_attrs_, attr, op_def_.input_arg(arg_index).name()); @@ -343,13 +348,22 @@ string GenEagerPythonOp::Code() { param_names_.push_back(param_and_default.first); } - string parameters; + std::unordered_map type_map; + // Only populate map for whitelisted ops if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - strings::StrAppend(¶meters, GetTypeAnnotatedParams()); - } else { - for (const auto& param : params_no_default_) { - if (!parameters.empty()) strings::StrAppend(¶meters, ", "); - strings::StrAppend(¶meters, param.GetRenameTo()); + type_map = GetTypeAnnotationMap(); + } + + string parameters; + for (const auto& param : params_no_default_) { + if (!parameters.empty()) strings::StrAppend(¶meters, ", "); + strings::StrAppend(¶meters, param.GetRenameTo()); + + // Add type annotations to param + if (type_map.find(param.GetName()) != type_map.end()) { + if(!type_map[param.GetName()].empty()) { + strings::StrAppend(¶meters, ": ", type_map[param.GetName()]); + } } } @@ -393,7 +407,7 @@ string GenEagerPythonOp::Code() { string eager_not_allowed_error = GetEagerNotAllowedError(); if (!AddEagerFastPathAndGraphCode(parameters_with_defaults, output_sizes, - eager_not_allowed_error)) { + eager_not_allowed_error, type_map)) { return result_; } @@ -405,61 +419,56 @@ string GenEagerPythonOp::Code() { return prelude_ + result_; } -string GenEagerPythonOp::GetTypeAnnotatedParams() { - // holds mappings from param name to its type annotation - std::unordered_map param_type_map; +std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { + std::unordered_map type_map; + // Mapping 
attrs to TypeVars for (int i = 0; i allowed_types; - bool has_dtype_half = false; for (int t : attr.allowed_values().list().type()) { - if (t == 19) { // DT_HALF = 19 - has_dtype_half = true; - break; - } DataType dtype = static_cast(t); const string py_dtype = python_op_gen_internal::DataTypeToPython(dtype, "_dtypes."); if (dtypes_map.find(py_dtype) != dtypes_map.end()) { @@ -482,9 +486,6 @@ void GenEagerPythonOp::GenerateTypeVars() { } } - // Do not create a type variable that includes the dtype half - if (has_dtype_half) continue; - // If all dtypes are allowed, add them all if (allowed_types.empty()) { for (std::pair map_dtype : dtypes_map) { @@ -509,48 +510,16 @@ void GenEagerPythonOp::GenerateTypeVars() { if(added_typevar) strings::StrAppend(&result_, "\n"); } -void GenEagerPythonOp::AddReturnTypeAnnotation() { - string return_type = ""; +void GenEagerPythonOp::AddReturnTypeAnnotation(std::unordered_map& type_map) { if (op_def_.output_arg_size() == 1) { const auto& arg = op_def_.output_arg(0); - // If the "type" field is set, the return Tensor has a single DataType - if (arg.type() != 0) { - const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes."); - if (dtypes_map.find(py_dtype) != dtypes_map.end()) { - strings::StrAppend(&return_type, "_ops.Tensor[", dtypes_map[py_dtype], "]"); + // Add type annotations to param + if (type_map.find(arg.name()) != type_map.end()) { + if (!type_map[arg.name()].empty()) { + result_.erase(result_.length() - 2); + strings::StrAppend(&result_, " -> ", type_map[arg.name()], ":\n"); } } - else { - for (int i = 0; i allowed_types; - for (int t : attr.allowed_values().list().type()) { - // Do not add type annotations when return type can be half - if (t == 19) return; // DT_HALF = 19 - DataType dtype = static_cast(t); - const string py_dtype = python_op_gen_internal::DataTypeToPython(dtype, "_dtypes."); - allowed_types.emplace_back(py_dtype); - } - - std::sort(allowed_types.begin(), allowed_types.end()); - - string typevar_dtypes; - for (std::vector::iterator it = allowed_types.begin(); it != allowed_types.end(); ++it) { - if (!typevar_dtypes.empty()) strings::StrAppend(&typevar_dtypes, ", "); - strings::StrAppend(&typevar_dtypes, *it); - } - - const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); - strings::StrAppend(&return_type, "_ops.Tensor[", type_var_name, "]"); - } - } - } - - if (!return_type.empty()) { - result_.erase(result_.length() - 2); - strings::StrAppend(&result_, " -> ", return_type, ":\n"); - } } } @@ -876,7 +845,7 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, - const string& eager_not_allowed_error) { + const string& eager_not_allowed_error, std::unordered_map& type_map) { if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { GenerateTypeVars(); } @@ -887,7 +856,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( AddExport(); AddDefLine(function_name_, parameters); if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - AddReturnTypeAnnotation(); + AddReturnTypeAnnotation(type_map); } AddDocStringDescription(); AddDocStringArgs(); From 0deffad6acbc2f5848022bf8ae360c9adbdf1ef8 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 18 Jun 2020 15:50:38 -0700 Subject: [PATCH 0550/1390] Make `return_state` as explicit kwarg in the Conv2DLSTM layer. It was previously hide in the **kwargs, and we are also missing documentation for it. 
The existing test case should already cover the functionality of it. PiperOrigin-RevId: 317197835 Change-Id: Icfae1e177eeb886b41345078f6b93f282a94df5b --- .../keras/layers/convolutional_recurrent.py | 43 +++++++++++-------- ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt | 2 +- ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt | 2 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py index 19831429b73..6c812204cba 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent.py @@ -753,7 +753,9 @@ class ConvLSTM2D(ConvRNN2D): the `recurrent_kernel` weights matrix. bias_constraint: Constraint function applied to the bias vector. return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. + in the output sequence, or the full sequence. (default False) + return_state: Boolean Whether to return the last state + in addition to the output. (default False) go_backwards: Boolean (default False). If True, process the input sequence backwards. stateful: Boolean (default False). If True, the last state @@ -786,22 +788,27 @@ class ConvLSTM2D(ConvRNN2D): `(samples, time, rows, cols, channels)` Output shape: - - If `return_sequences` - - If data_format='channels_first' - 5D tensor with shape: - `(samples, time, filters, output_row, output_col)` - - If data_format='channels_last' - 5D tensor with shape: - `(samples, time, output_row, output_col, filters)` - - Else - - If data_format ='channels_first' - 4D tensor with shape: - `(samples, filters, output_row, output_col)` - - If data_format='channels_last' - 4D tensor with shape: - `(samples, output_row, output_col, filters)` - where `o_row` and `o_col` depend on the shape of the filter and - the padding + - If `return_state`: a list of tensors. The first tensor is + the output. The remaining tensors are the last states, + each 4D tensor with shape: + `(samples, filters, new_rows, new_cols)` + if data_format='channels_first' + or 4D tensor with shape: + `(samples, new_rows, new_cols, filters)` + if data_format='channels_last'. + `rows` and `cols` values might have changed due to padding. + - If `return_sequences`: 5D tensor with shape: + `(samples, timesteps, filters, new_rows, new_cols)` + if data_format='channels_first' + or 5D tensor with shape: + `(samples, timesteps, new_rows, new_cols, filters)` + if data_format='channels_last'. + - Else, 4D tensor with shape: + `(samples, filters, new_rows, new_cols)` + if data_format='channels_first' + or 4D tensor with shape: + `(samples, new_rows, new_cols, filters)` + if data_format='channels_last'. Raises: ValueError: in case of invalid constructor arguments. 
@@ -834,6 +841,7 @@ class ConvLSTM2D(ConvRNN2D): recurrent_constraint=None, bias_constraint=None, return_sequences=False, + return_state=False, go_backwards=False, stateful=False, dropout=0., @@ -863,6 +871,7 @@ class ConvLSTM2D(ConvRNN2D): dtype=kwargs.get('dtype')) super(ConvLSTM2D, self).__init__(cell, return_sequences=return_sequences, + return_state=return_state, go_backwards=go_backwards, stateful=stateful, **kwargs) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index f77d613e354..958d06a0d0f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -207,7 +207,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index f77d613e354..958d06a0d0f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -207,7 +207,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'go_backwards\', \'stateful\', 
\'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " } member_method { name: "add_loss" From e08382691bfb897d584c5d5a8e8a0abe0472373d Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Thu, 18 Jun 2020 15:54:03 -0700 Subject: [PATCH 0551/1390] Make "map_resources" overridable by subclass of `Trackable`. This allows moving the implementation of map_resources from `tf.saved_model.save` to subclass of `Trackable`, e.g, Variable, DistributedVariable. This is a non-functional change. PiperOrigin-RevId: 317198449 Change-Id: I4aa48d4974b6547b5de8ac0f5c38f3da29d364bc --- tensorflow/python/distribute/BUILD | 4 +-- tensorflow/python/distribute/values.py | 12 +++++++ .../experimental/autocast_variable.py | 7 ++++ .../python/ops/resource_variable_ops.py | 7 ++++ tensorflow/python/saved_model/save.py | 36 +++++-------------- tensorflow/python/training/tracking/base.py | 18 ++++++++++ .../python/training/tracking/tracking.py | 13 +++++++ 7 files changed, 66 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 96559a9a740..7208807a18c 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -744,14 +744,12 @@ py_library( "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", - "//tensorflow/python:tensor_util", + "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tf_export", "//tensorflow/python:type_spec", - "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/eager:context", - "//tensorflow/python/eager:tape", "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", "//tensorflow/python/training/tracking:base", diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index d0ed27c69de..60b2ea4fe31 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import type_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import 
variables as variables_lib from tensorflow.python.training.saving import saveable_object @@ -793,6 +794,17 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return ops.convert_to_tensor( self._get(), dtype=dtype, name=name, as_ref=as_ref) + def _map_resources(self): + """For implementing `Trackable`.""" + new_obj = resource_variable_ops.copy_to_graph_uninitialized(self._primary) + obj_map, resource_map = {}, {} + for v in self._values: + obj_map[v] = new_obj + resource_map[v.handle] = new_obj.handle + obj_map[self] = new_obj + resource_map[self] = new_obj.handle + return obj_map, resource_map + class _DistributedVariableSaveable(saveable_object.SaveableObject): """Class for defining how to restore a DistributedVariable.""" diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 7d0abe30581..57e8ced65a0 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -285,6 +285,13 @@ class AutoCastVariable(variables.Variable, core.Tensor): # models with normal variables, and vice versa. return self._variable._gather_saveables_for_checkpoint() # pylint:disable=protected-access + def _map_resources(self): + # By delegating this method to the wrapped variable, SavedModel with + # AutoCastVariables are identical to SavedModel with normal variables. + obj_map, resource_map = self._variable._map_resources() # pylint:disable=protected-access + obj_map[self] = obj_map[self._variable] + return obj_map, resource_map + # TODO(reedwm): Maybe encode the fact the variable is an AutoCastVariable in # to_proto(). def to_proto(self, export_scope=None): diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 25f6347f034..cb235fcbe2d 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -633,6 +633,13 @@ class BaseResourceVariable(variables.VariableV1, core.Tensor): return gen_state_ops.resource_count_up_to(self.handle, limit=limit, T=self.dtype) + def _map_resources(self): + """For implementing `Trackable`.""" + new_variable = copy_to_graph_uninitialized(self) + obj_map = {self: new_variable} + resource_map = {self._handle: new_variable.handle} + return obj_map, resource_map + def _read_variable_op(self): variable_accessed(self) diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index 5844c80995f..802ce1d61b7 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -19,14 +19,12 @@ from __future__ import division from __future__ import print_function import collections -import copy import os from tensorflow.core.framework import versions_pb2 from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.core.protobuf import saved_model_pb2 from tensorflow.core.protobuf import saved_object_graph_pb2 -from tensorflow.python.distribute import distribute_utils as ds_utils from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function as defun @@ -241,7 +239,7 @@ class _SaveableView(object): Creates resource handle ops in the current default graph, whereas `accessible_objects` will be from an eager context. 
Resource mapping adds resource handle ops to the main GraphDef of a SavedModel, which allows the - C++ loader API to interact with variables. + C++ loader API to interact with resources. Returns: A tuple of (object_map, resource_map, asset_info): @@ -265,33 +263,15 @@ class _SaveableView(object): asset_index={}) for node_id, obj in enumerate(self.nodes): - if isinstance(obj, tracking.CapturableResource): - new_obj = object_map[obj] = copy.copy(obj) - # pylint: disable=protected-access - with ops.device(obj._resource_device): - new_resource = new_obj._create_resource() - new_obj._resource_handle = new_resource - # pylint: enable=protected-access - resource_map[obj.resource_handle] = new_resource - self.captured_tensor_node_ids[obj.resource_handle] = node_id - elif (ds_utils.is_distributed_variable(obj) or - resource_variable_ops.is_resource_variable(obj)): - obj_to_copy = obj._primary if ds_utils.is_distributed_variable( # pylint: disable=protected-access - obj) else obj - new_variable = resource_variable_ops.copy_to_graph_uninitialized( - obj_to_copy) - if ds_utils.is_distributed_variable(obj): - self.captured_tensor_node_ids[obj] = node_id - for v in obj.values: - object_map[v] = new_variable - resource_map[v.handle] = new_variable.handle - self.captured_tensor_node_ids[v.handle] = node_id - object_map[obj] = new_variable - resource_map[obj.handle] = new_variable.handle - self.captured_tensor_node_ids[obj.handle] = node_id - elif isinstance(obj, tracking.Asset): + if isinstance(obj, tracking.Asset): _process_asset(obj, asset_info, resource_map) self.captured_tensor_node_ids[obj.asset_path] = node_id + elif isinstance(obj, base.Trackable): + node_object_map, node_resource_map = obj._map_resources() # pylint: disable=protected-access + for capturable in node_resource_map.keys(): + self.captured_tensor_node_ids[capturable] = node_id + object_map.update(node_object_map) + resource_map.update(node_resource_map) # Note: some concrete functions can have been realized when tracing other # functions, and might closure-capture tensors from their parent functions. diff --git a/tensorflow/python/training/tracking/base.py b/tensorflow/python/training/tracking/base.py index e3cd9828724..ea76ad8db47 100644 --- a/tensorflow/python/training/tracking/base.py +++ b/tensorflow/python/training/tracking/base.py @@ -1021,3 +1021,21 @@ class Trackable(object): """ del serialization_cache return dict() + + def _map_resources(self): + """Makes new resource handle ops corresponding to existing resource tensors. + + Internal sub-classes can override this to inform model saving how to add new + resource handle ops to the main GraphDef of a SavedModel (TF 1.x style + graph), which allows session based APIs (e.g, C++ loader API) to interact + with resources owned by this object. + + Returns: + A tuple of (object_map, resource_map): + object_map: A dictionary mapping from objects that hold existing + resource tensors to replacement objects created to hold the new + resource tensors. + resource_map: A dictionary mapping from existing resource tensors to + newly created resource tensors. 
+ """ + return {}, {} diff --git a/tensorflow/python/training/tracking/tracking.py b/tensorflow/python/training/tracking/tracking.py index 553f0ec73bf..fb2735e6445 100644 --- a/tensorflow/python/training/tracking/tracking.py +++ b/tensorflow/python/training/tracking/tracking.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy import functools import weakref @@ -243,6 +244,18 @@ class CapturableResource(base.Trackable): self._resource_handle = self._create_resource() return self._resource_handle + def _map_resources(self): + """For implementing `Trackable`.""" + new_obj = copy.copy(self) + # pylint: disable=protected-access + with ops.device(self._resource_device): + new_resource = new_obj._create_resource() + new_obj._resource_handle = new_resource + # pylint: enable=protected-access + obj_map = {self: new_obj} + resource_map = {self.resource_handle: new_resource} + return obj_map, resource_map + def _list_functions_for_serialization(self, unused_functions): @def_function.function(input_signature=[], autograph=False) def _creator(): From 39504c25d9de697d3568bc4d370722d0f48376cf Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 18 Jun 2020 15:55:24 -0700 Subject: [PATCH 0552/1390] Fix bug in xla-legalize-tf-with-tf2xla pass by handling non-tensor operands Currently, it only expects tensor operands but that is not applicable for non tensorflow dialect ops. PiperOrigin-RevId: 317198672 Change-Id: I1387e664de740d044ef535f6903e07d63fa02f6d --- .../mlir/xla/tests/legalize-tf-with-tf2xla.mlir | 12 ++++++++++-- .../mlir/xla/transforms/legalize_tf_with_tf2xla.cc | 6 +++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index b8a6df54519..86a7f2b9e09 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -35,7 +35,7 @@ func @not_whitelisted_op(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor // CHECK-LABEL: unranked_operand func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> @@ -44,12 +44,20 @@ func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: dynamic_operand func @dynamic_operand(%arg0: tensor) -> tensor { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: tuple_type +func @tuple_type(%arg0: tuple, tensor>) -> tensor { + // Verifies that the pass can handle operands of non-tensor type like tuple + // from non TensorFlow ops. 
+ %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + // CHECK-LABEL: unsupported_dtype func @unsupported_dtype(%arg0: tensor<2x!tf.variant>) -> tensor<2x!tf.variant> { // CHECK: tf.AddN diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index e57d6938efb..ef79c8868bb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -337,9 +337,9 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { // Only static shaped operands are supported in XLA builders for now. for (Type ty : op->getOperandTypes()) { - auto ranked_ty = ty.cast(); - if (!ranked_ty.hasStaticShape()) { - op->emitRemark() << "lowering requires static shaped operands"; + auto ranked_ty = ty.dyn_cast(); + if (!ranked_ty || !ranked_ty.hasStaticShape()) { + op->emitRemark() << "lowering requires static shaped tensor operands"; return success(); } } From 8d34408863b650564076f148edad9f91508abf04 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 18 Jun 2020 16:02:23 -0700 Subject: [PATCH 0553/1390] Auto-generate following TensorFlow ops related to image ResizeBilinearGrad ResizeBilinear AdjustContrastv2 ResizeNearestNeighbor AdjustSaturation AdjustHue RGBToHSV HSVToRGB PiperOrigin-RevId: 317199967 Change-Id: I1953acf599f2f7de686bda73b654e4c7b98dffd5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 3a5deb9c569..dcd083fc398 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -164,6 +164,81 @@ def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastable let hasFolder = 1; } +def TF_AdjustContrastv2Op : TF_Op<"AdjustContrastv2", [NoSideEffect]> { + let summary = "Adjust the contrast of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last 3 dimensions are +interpreted as `[height, width, channels]`. The other dimensions only +represent a collection of images, such as `[batch, height, width, channels].` + +Contrast is adjusted independently for each channel of each image. + +For each channel, the Op first computes the mean of the image pixels in the +channel and then adjusts each component of each pixel to +`(x - mean) * contrast_factor + mean`. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$contrast_factor + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustHueOp : TF_Op<"AdjustHue", [NoSideEffect]> { + let summary = "Adjust the hue of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A delta is then applied all the hue values, +and then remapped back to RGB colorspace. 
+ }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$delta + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustSaturationOp : TF_Op<"AdjustSaturation", [NoSideEffect]> { + let summary = "Adjust the saturation of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A scale is then applied all the saturation +values, and then remapped back to RGB colorspace. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$scale + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AllOp : TF_Op<"All", [NoSideEffect]> { let summary = [{ Computes the "logical and" of elements across dimensions of a tensor. @@ -3866,6 +3941,28 @@ tf.math.greater_equal(x, y) ==> [True, False, True, True] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_HSVToRGBOp : TF_Op<"HSVToRGB", [NoSideEffect]> { + let summary = "Convert one or more images from HSV to RGB."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the RGB +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +See `rgb_to_hsv` for a description of the HSV encoding. + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_HashTableV2Op : TF_Op<"HashTableV2", []> { let summary = "Creates a non-initialized hash table."; @@ -6733,6 +6830,41 @@ the dimension is padded with zeros. TF_DerivedResultTypeAttr Tcomplex = TF_DerivedResultTypeAttr<0>; } +def TF_RGBToHSVOp : TF_Op<"RGBToHSV", [NoSideEffect]> { + let summary = "Converts one or more images from RGB to HSV."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the HSV +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and +`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0 +corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue. + +Usage Example: + +>>> blue_image = tf.stack([ +... tf.zeros([5,5]), +... tf.zeros([5,5]), +... tf.ones([5,5])], +... axis=-1) +>>> blue_hsv_image = tf.image.rgb_to_hsv(blue_image) +>>> blue_hsv_image[0,0].numpy() +array([0.6666667, 1. , 1. ], dtype=float32) + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_RandomGammaGradOp : TF_Op<"RandomGammaGrad", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = [{ @@ -7230,6 +7362,27 @@ Input images can be of different types but output images are always float. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResizeBilinearGradOp : TF_Op<"ResizeBilinearGrad", [NoSideEffect]> { + let summary = "Computes the gradient of bilinear interpolation."; + + let description = [{ + }]; + + let arguments = (ins + F32Tensor:$grads, + TF_FpTensor:$original_image, + + DefaultValuedAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_ResizeNearestNeighborOp : TF_Op<"ResizeNearestNeighbor", [NoSideEffect]> { let summary = [{ Resize `images` to `size` using nearest neighbor interpolation. From 67544cd4bbdf8070adebbb077439cac300f479ca Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 18 Jun 2020 16:02:51 -0700 Subject: [PATCH 0554/1390] Add more outside compilation tests including multiple clusters and more variety of inputs/outputs. PiperOrigin-RevId: 317200078 Change-Id: Id26e99059097073299ef5f681fae053b082ec149 --- .../tpu/tpu_outside_compilation_test.py | 95 ++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/tpu/tpu_outside_compilation_test.py b/tensorflow/python/tpu/tpu_outside_compilation_test.py index f7ecb294c44..54c2598324c 100644 --- a/tensorflow/python/tpu/tpu_outside_compilation_test.py +++ b/tensorflow/python/tpu/tpu_outside_compilation_test.py @@ -18,13 +18,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized +import numpy as np + from tensorflow.python.distribute import tpu_strategy as tpu_lib from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver from tensorflow.python.eager import def_function from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.ops import array_ops from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.tpu import tpu @@ -52,7 +57,7 @@ def get_tpu_strategy(): return tpu_lib.TPUStrategy(resolver) -class TpuOutsideCompilationTest(test.TestCase): +class TpuOutsideCompilationTest(test.TestCase, parameterized.TestCase): def testResourceVariableAssignOnHost(self): strategy = get_tpu_strategy() @@ -79,6 +84,26 @@ class TpuOutsideCompilationTest(test.TestCase): self.assertAllEqual(4.0 * strategy.num_replicas_in_sync, v2.numpy()) self.assertAllEqual(5.0, v.numpy()) + def testHostNoInput(self): + strategy = get_tpu_strategy() + + def outside_fn(): + logging_ops.print_v2("Outside compiled") + + @def_function.function + def train_step(): + + def tpu_fn(x): + x2 = x + 5.0 + tpu.outside_compilation(outside_fn) + return x2 + 5.0 + + return strategy.run(tpu_fn, args=(25.0,)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(35., shape=(strategy.num_replicas_in_sync))) + def testHostInputOnly(self): strategy = get_tpu_strategy() @@ -120,13 +145,71 @@ class TpuOutsideCompilationTest(test.TestCase): strategy.experimental_local_results(train_step()), constant_op.constant(36., shape=(strategy.num_replicas_in_sync))) - def testOutsideCompilationControlFlowIf(self): + def testHostMultipleInputs(self): + strategy = get_tpu_strategy() + val0 = np.arange(6).reshape((2, 3)).astype(np.float32) + val1 = np.arange(6).reshape((3, 
2)).astype(np.float32) + + def outside_fn(arg0, arg1): + tmp = array_ops.reshape(arg1, array_ops.shape(arg0)) + ret0 = arg0 + tmp + ret1 = math_ops.matmul(arg0, arg1) + ret2 = array_ops.concat([arg0, tmp], 0) + return ret0, ret1, ret2 + + @def_function.function + def train_step(): + + def tpu_fn(x, y): + a = x + 7.0 + b = y * 2.0 + c, d, e = tpu.outside_compilation(outside_fn, a, b) + return (math_ops.reduce_max(c) + math_ops.reduce_min(d) + + math_ops.reduce_sum(e)) + + return strategy.run(tpu_fn, args=(val0, val1)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(213., shape=(strategy.num_replicas_in_sync))) + + def testMultipleClusters(self): + strategy = get_tpu_strategy() + + def outside_fn1(x): + logging_ops.print_v2("Outside compiled", x) + return x + 6.0 + + def outside_fn2(x): + logging_ops.print_v2("Outside compiled", x) + return x - 18.0 + + @def_function.function + def train_step(): + + def tpu_fn(x): + x2 = x + 5.0 + output1 = tpu.outside_compilation(outside_fn1, x2) + x3 = output1 + 3.0 + output2 = tpu.outside_compilation(outside_fn2, x3) + return output2 + + return strategy.run(tpu_fn, args=(25.0,)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(21., shape=(strategy.num_replicas_in_sync))) + + @parameterized.parameters((True), (False)) + def testOutsideCompilationControlFlowIf(self, take_true_branch): strategy = get_tpu_strategy() def outside_fn(x): logging_ops.print_v2("Outside compiled", x) return x + 6.0 + input_value = 51.0 if take_true_branch else 25.0 + @def_function.function def train_step(): @@ -137,11 +220,15 @@ class TpuOutsideCompilationTest(test.TestCase): else: return x2 - return strategy.run(tpu_fn, args=(25.0,)) + return strategy.run(tpu_fn, args=(input_value,)) + output_value = 36.0 + if take_true_branch: + output_value = 56.0 self.assertAllEqual( strategy.experimental_local_results(train_step()), - constant_op.constant(36., shape=(strategy.num_replicas_in_sync))) + constant_op.constant( + output_value, shape=(strategy.num_replicas_in_sync))) def testOutsideCompilationControlFlowWhile(self): strategy = get_tpu_strategy() From 4aea552e064cf92330e07e83a3b5a1ca2a7034d0 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 16:15:22 -0700 Subject: [PATCH 0555/1390] Publishing tpu_op_consts to tpu kernels library. 
PiperOrigin-RevId: 317202394 Change-Id: Ib6a1f350af7384513a3744084a9959ed86278d1f --- tensorflow/core/tpu/kernels/BUILD | 11 +++++- .../kernels/tpu_compilation_cache_external.h | 5 +-- tensorflow/core/tpu/kernels/tpu_op_consts.cc | 24 ++++++++++++ tensorflow/core/tpu/kernels/tpu_op_consts.h | 39 +++++++++++++++++++ 4 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.cc create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index a41747ee8c5..a9f2202cd45 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -321,6 +321,7 @@ cc_library( ":tpu_compile_c_api_hdrs", ":tpu_compile_op_support", ":tpu_mesh_state_interface", + ":tpu_op_consts", ":tpu_program_group", ":tpu_util", ":trace_util_hdrs", @@ -433,7 +434,6 @@ cc_library( "//tensorflow/compiler/xla/client:compile_only_client", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], alwayslink = 1, @@ -461,3 +461,12 @@ cc_library( "@com_google_absl//absl/synchronization", ], ) + +cc_library( + name = "tpu_op_consts", + srcs = ["tpu_op_consts.cc"], + hdrs = ["tpu_op_consts.h"], + deps = [ + "@com_google_absl//absl/base:core_headers", + ], +) diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h index fe251326a43..86615b15d4c 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h @@ -38,15 +38,12 @@ limitations under the License. #include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" #include "tensorflow/core/tpu/kernels/tpu_program_group.h" namespace tensorflow { namespace tpu { -constexpr char kCompilationCacheResourceName[] = "tpu_compilation_cache"; -constexpr char kCompilationCacheUnloaderResourceName[] = - "tpu_compilation_cache_unloader"; - class TpuCompilationCacheExternal : public TpuCompilationCacheInterface { public: using Status = ::stream_executor::port::Status; diff --git a/tensorflow/core/tpu/kernels/tpu_op_consts.cc b/tensorflow/core/tpu/kernels/tpu_op_consts.cc new file mode 100644 index 00000000000..e5e1aacb3cc --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_consts.cc @@ -0,0 +1,24 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" + +namespace tensorflow { +namespace tpu { +const char kCompilationCacheResourceName[] = "tpu_compilation_cache"; +const char kCompiledProtoCacheResourceName[] = "tpu_proto_cache"; +const char kCompilationCacheUnloaderResourceName[] = + "tpu_compilation_cache_unloader"; +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_op_consts.h b/tensorflow/core/tpu/kernels/tpu_op_consts.h new file mode 100644 index 00000000000..25223b7e429 --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_consts.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ + +#include "absl/base/attributes.h" + +namespace tensorflow { +namespace tpu { + +// Resource names in the ResourceMgr. +// +// Name of cache for compiled TPU ISA protos. CompilationCache is created by +// ConfigureDistributedTpuOp, so only the master has a CompilationCache. +ABSL_CONST_INIT extern const char kCompilationCacheResourceName[]; +// Name of base class allowing Execute Ops to look up ISA protos. +// CompiledProtoCache is created by InitializeHostForDistributedTpuOp, so each +// tpu_worker has a CompiledProtoCache. +ABSL_CONST_INIT extern const char kCompiledProtoCacheResourceName[]; +// Name of cache unloader for compiled TPU ISA protos. Cache unloader should be +// put into TPU_SYSTEM device resource manager. Inference may use it to unload +// cache entries created during lifetime of a DirectSession. +ABSL_CONST_INIT extern const char kCompilationCacheUnloaderResourceName[]; + +} // namespace tpu +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ From aab151356d2334a9d6cec71ce5165e6e6c45c793 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 16:24:14 -0700 Subject: [PATCH 0556/1390] [pjrt] Refresh stream error status in strategic places to flush out silent failures. 
PiperOrigin-RevId: 317204018 Change-Id: If75a3ad9ec846ce1621cdba92a2dc738b65b7001 --- tensorflow/compiler/xla/pjrt/local_device_state.cc | 4 ++++ tensorflow/compiler/xla/pjrt/pjrt_client.cc | 12 +++++++++--- tensorflow/stream_executor/stream.cc | 7 ++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/local_device_state.cc b/tensorflow/compiler/xla/pjrt/local_device_state.cc index d173c891c95..a229e56001e 100644 --- a/tensorflow/compiler/xla/pjrt/local_device_state.cc +++ b/tensorflow/compiler/xla/pjrt/local_device_state.cc @@ -127,11 +127,15 @@ std::unique_ptr LocalDeviceState::BorrowStreamFromPool() { } else { std::unique_ptr stream = std::move(usage_stream_pool_.top()); usage_stream_pool_.pop(); + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); return stream; } } void LocalDeviceState::ReturnStreamToPool(std::unique_ptr stream) { + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); absl::MutexLock lock(&mu_); usage_stream_pool_.push(std::move(stream)); } diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index ef259cf1cfd..46f592100c9 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -751,16 +751,22 @@ StatusOr> PjRtBuffer::FromHostLiteral( // memory that has already been allocated, and a possible Event // allocation. + se::Stream* h2d_stream = local_device->host_to_device_stream(); ShapedBuffer buffer = device_buffer->AsShapedBuffer( compact_shape, on_device_shape, client->client()->platform()); TF_CHECK_OK(transfer_manager->TransferLiteralToDeviceAsync( - local_device->host_to_device_stream(), literal, buffer)); + h2d_stream, literal, buffer)); std::shared_ptr event = device_buffer->definition_events()[0]; TF_CHECK_OK(AddDestinationBufferSynchronization( - local_device, std::move(device_buffer), event, - local_device->host_to_device_stream())); + local_device, std::move(device_buffer), event, h2d_stream)); + + // This can sometimes catch the case where the literal memory has been + // freed before the H2D transfer was issued. + h2d_stream->RefreshStatus() + .IgnoreError(); // Can return error::Unimplemented + QCHECK(h2d_stream->ok()); }; client->h2d_transfer_pool()->Schedule(transfer_h2d); return py_buffer; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index c63565c65a8..da418122375 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -285,7 +285,12 @@ Stream::~Stream() { port::Status Stream::RefreshStatus() { port::Status status = parent_->GetStatus(this); - CheckStatus(status); + // We should not put the stream in an error state, just because the GetStatus + // method is unimplemented. 
+ if (status != port::Status(port::error::UNIMPLEMENTED, + "GetStatus is not supported on this executor.")) { + CheckStatus(status); + } return status; } From 1363f0f6e89cdc73ca46d43475bdb35750ed2e50 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 19:28:34 -0400 Subject: [PATCH 0557/1390] add save load test for pathlib path --- tensorflow/python/keras/saving/save_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 5c5846fe738..98d78735ad1 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -71,6 +71,12 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): save.save_model(self.model, path) self.assert_saved_model(path) + @test_util.run_v2_only + def test_save_format_defaults_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / 'model_path' + save.save_model(self.model, path) + self.assert_saved_model(path) + @test_util.run_v2_only def test_save_hdf5(self): path = os.path.join(self.get_temp_dir(), 'model') @@ -81,6 +87,13 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): 'requires the model to be a Functional model or a Sequential model.'): save.save_model(self.subclassed_model, path, save_format='h5') + @test_util.run_v2_only + def test_save_load_hdf5_pathlib(self): + if sys.version_info >= (3, 6): + path = pathlib.Path(self.get_temp_dir()) / 'model' + save.save_model(self.model, path, save_format='h5') + save.load_model(path) + @test_util.run_v2_only def test_save_tf(self): path = os.path.join(self.get_temp_dir(), 'model') From 94e37f84f19384e685420ef7f90382fcfe719498 Mon Sep 17 00:00:00 2001 From: Dero Gharibian Date: Thu, 18 Jun 2020 16:25:11 -0700 Subject: [PATCH 0558/1390] Remove unnecessary string copy. 
PiperOrigin-RevId: 317204219 Change-Id: I85fab345945b6ea4f428f8aedc861eb79e5fd7e0 --- tensorflow/core/kernels/summary_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index 386a8964dba..f4c91fc9ff1 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -52,7 +52,8 @@ class SummaryScalarOp : public OpKernel { Summary s; for (int i = 0; i < Ttags.size(); i++) { Summary::Value* v = s.add_value(); - v->set_tag(string(Ttags(i))); // NOLINT + const tstring& Ttags_i = Ttags(i); + v->set_tag(Ttags_i.data(), Ttags_i.size()); v->set_simple_value(float(Tvalues(i))); } @@ -102,7 +103,8 @@ class SummaryHistoOp : public OpKernel { Summary s; Summary::Value* v = s.add_value(); - v->set_tag(string(tags.scalar()())); // NOLINT + const tstring& tags0 = tags.scalar()(); + v->set_tag(tags0.data(), tags0.size()); histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */); Tensor* summary_tensor = nullptr; From e7733c8dd4cae1a1a03053b452acbbded7ab6014 Mon Sep 17 00:00:00 2001 From: Reed Date: Thu, 18 Jun 2020 16:52:51 -0700 Subject: [PATCH 0559/1390] Add LINT.IfChange --- .../core/grappler/optimizers/auto_mixed_precision_lists.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index c6016548117..9041d934017 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -73,6 +73,7 @@ class AutoMixedPrecisionLists { static void AddTensorListOps(gtl::FlatSet* list) { // Note: if a data structure op (such as TensorListPopBack) is added here, // IsTensorListReaderOp or IsTensorListWriterOp may need to be modified + // LINT.IfChange constexpr char* tensor_list_ops[] = { "TensorListConcat", "TensorListConcatLists", @@ -90,6 +91,7 @@ class AutoMixedPrecisionLists { "TensorListSplit", "TensorListStack" }; + // LINT.ThenChange(//tensorflow/core/grappler/optimizers/auto_mixed_precision.cc) for (auto op : tensor_list_ops) { list->insert(op); } From 1c72a6c65e1733b55286c6361142a39d699732dc Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 19:53:22 -0400 Subject: [PATCH 0560/1390] convert export_dir as pathlike object to str in parse_saved_model --- tensorflow/python/saved_model/loader_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 2df2bea428e..71f6ed16c9d 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -83,11 +83,11 @@ def parse_saved_model(export_dir): """ # Build the path to the SavedModel in pbtxt format. path_to_pbtxt = os.path.join( - compat.as_bytes(export_dir), + compat.as_bytes(compat.path_to_str(export_dir)), compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT)) # Build the path to the SavedModel in pb format. path_to_pb = os.path.join( - compat.as_bytes(export_dir), + compat.as_bytes(compat.path_to_str(export_dir)), compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB)) # Parse the SavedModel protocol buffer. From 83b09270dc34308ef60f2f68de540a4cb213e1e4 Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Thu, 18 Jun 2020 16:52:46 -0700 Subject: [PATCH 0561/1390] Update FusedKernelMatcher pass to use upstream util to get stripped op name. 
Added this util upstream in D81435, so now using that instead and deleting the unneeded code here. PiperOrigin-RevId: 317209256 Change-Id: Id2d8a1fca34ca85e59a05a85bf7f6f59b425c7c1 --- .../tensorflow/transforms/fused_kernel_matcher.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 4b10550df7b..d10f5e26e8f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -52,11 +52,6 @@ struct FusedKernelMatcherPass void runOnFunction() override; }; -// Returns an op's name with the dialect prefix stripped off. -StringRef GetOpNameWithoutDialect(Operation *op) { - return op->getName().getStringRef().split(".").second; -} - bool IsActivationFunction(Operation *op) { return isa(op) || isa(op) || isa(op); } @@ -128,8 +123,8 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { } SmallVector locations{contraction.getLoc(), bias_add.getLoc()}; - SmallVector fused_ops{ - StringAttr::get(GetOpNameWithoutDialect(bias_add), context)}; + SmallVector fused_ops{StringAttr::get( + bias_add.getOperation()->getName().stripDialect(), context)}; // BiasAdd may or may not feed into an activation function. auto activation = GetActivation(bias_add); @@ -143,7 +138,7 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { if (fuse_activation) { locations.push_back(activation->getLoc()); fused_ops.push_back( - StringAttr::get(GetOpNameWithoutDialect(activation), context)); + StringAttr::get(activation->getName().stripDialect(), context)); result_type = activation->getResultTypes().front(); } else { result_type = bias_add.getResult().getType(); From 5f2e0240ee7977042e41d9c29c349a7b14301290 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 19:57:23 -0400 Subject: [PATCH 0562/1390] modify docstring --- tensorflow/python/saved_model/loader_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 71f6ed16c9d..06cd988130d 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -73,7 +73,7 @@ def parse_saved_model(export_dir): """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`. Args: - export_dir: Directory containing the SavedModel file. + export_dir: String or Pathlike, path to the directory containing the SavedModel file. Returns: A `SavedModel` protocol buffer. From 86770b177e88cfa25844188a655f8399f5c3526d Mon Sep 17 00:00:00 2001 From: jonah-kohn <51345541+jonah-kohn@users.noreply.github.com> Date: Thu, 18 Jun 2020 17:00:04 -0700 Subject: [PATCH 0563/1390] Add show_dtype param to plot_model and model_to_dot. 
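
Example usage (an illustrative sketch, not part of this patch; it assumes the
pydot and graphviz packages are installed, and the toy Sequential model is a
placeholder):

    import tensorflow as tf

    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(5, input_shape=(3,), name='dense')])
    # With show_dtype=True each layer node also lists its dtype (e.g. float32).
    tf.keras.utils.plot_model(
        model, to_file='model.png', show_shapes=True, show_dtype=True)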
---
 tensorflow/python/keras/utils/vis_utils.py | 29 +++++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 3720708543f..21de8014c2a 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -69,6 +69,7 @@ def add_edge(dot, src, dst):
 @keras_export('keras.utils.model_to_dot')
 def model_to_dot(model,
                  show_shapes=False,
+                 show_dtype=False,
                  show_layer_names=True,
                  rankdir='TB',
                  expand_nested=False,
@@ -79,6 +80,7 @@ def model_to_dot(model,
   Arguments:
     model: A Keras model instance.
     show_shapes: whether to display shape information.
+    show_dtype: whether to display layer dtypes.
     show_layer_names: whether to display layer names.
     rankdir: `rankdir` argument passed to PyDot,
       a string specifying the format of the plot:
@@ -150,8 +152,11 @@ def model_to_dot(model,
     if isinstance(layer, wrappers.Wrapper):
       if expand_nested and isinstance(layer.layer, functional.Functional):
-        submodel_wrapper = model_to_dot(layer.layer, show_shapes,
-                                        show_layer_names, rankdir,
+        submodel_wrapper = model_to_dot(layer.layer,
+                                        show_shapes,
+                                        show_dtype,
+                                        show_layer_names,
+                                        rankdir,
                                         expand_nested,
                                         subgraph=True)
         # sub_w : submodel_wrapper
@@ -165,8 +170,11 @@ def model_to_dot(model,
         class_name = '{}({})'.format(class_name, child_class_name)

     if expand_nested and isinstance(layer, functional.Functional):
-      submodel_not_wrapper = model_to_dot(layer, show_shapes,
-                                          show_layer_names, rankdir,
+      submodel_not_wrapper = model_to_dot(layer,
+                                          show_shapes,
+                                          show_dtype,
+                                          show_layer_names,
+                                          rankdir,
                                           expand_nested,
                                           subgraph=True)
       # sub_n : submodel_not_wrapper
@@ -180,6 +188,16 @@ def model_to_dot(model,
       label = '{}: {}'.format(layer_name, class_name)
     else:
       label = class_name
+
+    # Rebuild the label as a table including the layer's dtype.
+    if show_dtype:
+      def format_dtype(dtype):
+        if dtype is None:
+          return '?'
+        else:
+          return str(dtype)
+
+      label = '%s|%s' % (label, format_dtype(layer.dtype))

     # Rebuild the label as a table including input/output shapes.
     if show_shapes:
@@ -260,6 +278,7 @@ def model_to_dot(model,
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
+               show_dtype=False,
                show_layer_names=True,
                rankdir='TB',
                expand_nested=False,
@@ -286,6 +305,7 @@ def plot_model(model,
     model: A Keras model instance
     to_file: File name of the plot image.
     show_shapes: whether to display shape information.
+    show_dtype: whether to display layer dtypes.
     show_layer_names: whether to display layer names.
rankdir: `rankdir` argument passed to PyDot, a string specifying the format of the plot: @@ -300,6 +320,7 @@ def plot_model(model, """ dot = model_to_dot(model, show_shapes=show_shapes, + show_dtype=show_dtype, show_layer_names=show_layer_names, rankdir=rankdir, expand_nested=expand_nested, From c3ca4da9b46e3148bdb913d08fec3fd2727158e1 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:00:06 -0400 Subject: [PATCH 0564/1390] only test pathlib on python v>3.6 --- tensorflow/python/keras/saving/save_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 98d78735ad1..66364666841 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -73,9 +73,10 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): @test_util.run_v2_only def test_save_format_defaults_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model_path' - save.save_model(self.model, path) - self.assert_saved_model(path) + if sys.version_info >= (3, 6): + path = pathlib.Path(self.get_temp_dir()) / 'model_path' + save.save_model(self.model, path) + self.assert_saved_model(path) @test_util.run_v2_only def test_save_hdf5(self): From e96543f6fbba9fa112d7ca1d731b64e3654e1629 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 18 Jun 2020 16:55:26 -0700 Subject: [PATCH 0565/1390] Add MultiProcessPoolRunner Tensorflow initialization can take a long time when GPUs are present. We cannot afford starting a new group of workers for every single test. MultiProcessPoolRunner uses a pool of workers so that we can avoid the initialization cost. Compared to MultiProcessRunner, it doesn't support terminating workers. Note that implementation wise we could build MultiProcessPoolRunner on top of MultiProcessRunner or vice-versa if there's no need to support termination. Since it's easier for MultiProcessPoolRunner not to support termination, we choose MultiProcessPoolRunner on top of MultiProcessRunner. PiperOrigin-RevId: 317209754 Change-Id: Ia439028c81c5a9f87b16d631a170158724ce47d4 --- .../python/distribute/multi_process_runner.py | 205 ++++++++++++++++-- .../distribute/multi_process_runner_test.py | 69 +++++- 2 files changed, 249 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index 8699e59b410..ce36287a9da 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -67,8 +67,7 @@ except ImportError: # exception stack trace info is stored in exc_info to pass on to parent process # to be re-raised. _ProcessStatusInfo = collections.namedtuple( - '_ProcessStatusInfo', - ['task_type', 'is_successful', 'exc_info', 'return_value']) + '_ProcessStatusInfo', ['is_successful', 'exc_info', 'return_value']) # Information returned from a successful MultiProcessRunner run. MultiProcessRunnerResult = collections.namedtuple('MultiProcessRunnerResult', @@ -124,6 +123,7 @@ class MultiProcessRunner(object): stream_stdout=True, list_stdout=False, use_dill_for_args=True, + daemon=False, args=None, kwargs=None): """Creates a multi-process runner. @@ -157,6 +157,7 @@ class MultiProcessRunner(object): use_dill_for_args: Whether to use dill to pickle `args` and `kwargs`. dill can pickle more objects, but doesn't work with types in `multiprocessing` library like `Mutex`. 
+ daemon: Whether to start processes as daemons. args: Positional arguments to be sent to functions run on processes. kwargs: Keyword arguments to be sent to functions run on processes. @@ -188,6 +189,7 @@ class MultiProcessRunner(object): self._list_stdout = list_stdout self._dependence_on_chief = True self._use_dill_for_args = use_dill_for_args + self._daemon = daemon self._args = args or () self._kwargs = kwargs or {} @@ -268,7 +270,8 @@ class MultiProcessRunner(object): test_env=test_env, target=_ProcFunc(), args=(resources, test_env, proc_func, args, kwargs, - self._use_dill_for_args)) + self._use_dill_for_args), + daemon=self._daemon) p.start() self._processes[(task_type, task_id)] = p self._outstanding_subprocess_count += 1 @@ -568,7 +571,6 @@ class _ProcFunc(object): time.sleep(0.1) self._resources.process_status_queue.put( _ProcessStatusInfo( - task_type=task_type, is_successful=True, exc_info=None, return_value=None)) @@ -628,17 +630,9 @@ class _ProcFunc(object): if test_env.v2_enabled: v2_compat.enable_v2_behavior() - try: - with self._runtime_mode(test_env.executing_eagerly): - return_value = proc_func(*args, **kwargs) - is_successful = True - exc_info = None - - except Exception: # pylint: disable=broad-except - # Capture all exceptions to be reported to parent process. - return_value = None - is_successful = False - exc_info = sys.exc_info() + with self._runtime_mode(test_env.executing_eagerly): + info = _run_contained(proc_func, args, kwargs) + self._resources.process_status_queue.put(info) # Re-raise the exception in addition to reporting it to the parent # process, so that even if `--test_timeout` flag is set and the @@ -647,18 +641,183 @@ class _ProcFunc(object): # instead of silently suppressing the error due to early bazel # timeout. Raising an error in the subprocess produces stack trace in # the log, but the program continues running. - raise + if not info.is_successful: + six.reraise(*info.exc_info) - finally: - info = _ProcessStatusInfo( - task_type=test_env.task_type, - is_successful=is_successful, - exc_info=exc_info, - return_value=return_value) - self._resources.process_status_queue.put(info) self._close_streaming() +class MultiProcessPoolRunner(object): + """A utility class to start a process pool to simulate a cluster. + + It's similar to MultiProcessRunner, but uses a pool of processes to avoid the + expensive initialization cost of Tensorflow. + """ + + def __init__(self, cluster_spec, initializer=None): + """Creates a multi-process pool runner. + + Args: + cluster_spec: Dict for cluster spec. The following is an example of + cluster with three workers. + {"worker": ["worker0.example.com:2222", + "worker1.example.com:2222", + "worker2.example.com:2222"]} + initializer: a callable to called at the startup of worker processes. + + Raises: + RuntimeError: if `multi_process_runner.test_main()` is not called. + ValueError: if there are more than one chief in the `cluster_spec`. + """ + self._cluster_spec = cluster_spec + self._initializer = initializer + self._conn = {} + self._runner = None + + def __del__(self): + self._reset() + + def _reset(self): + for conn in self._conn.values(): + conn.close() + self._conn = {} + if self._runner is not None: + self._runner.join() + self._runner = None + + def _start(self): + """Starts the worker pool.""" + # We need different arguments for different processes so we're passing a + # no-op proc_func here and use start_single_process instead. 
+    #
+    # We also need to start the processes in the pool as daemons, so that they
+    # don't block the program from exiting. Note that __del__ may not get
+    # called when there's an exception. The user may also store a pool runner
+    # in a global object to share across test cases.
+    self._runner = MultiProcessRunner(
+        proc_func=lambda: None,
+        cluster_spec=self._cluster_spec,
+        use_dill_for_args=False,
+        daemon=True)
+    if self._initializer:
+      initializer = dill.dumps(self._initializer, dill.HIGHEST_PROTOCOL)
+    else:
+      initializer = None
+    for task_type, addresses in self._cluster_spec.items():
+      for task_id, _ in enumerate(addresses):
+        conn1, conn2 = multiprocessing.Pipe(duplex=True)
+        self._conn[(task_type, task_id)] = conn1
+        self._runner.start_single_process(
+            task_type,
+            task_id,
+            proc_func=_pool_runner_worker,
+            args=(initializer, conn2))
+
+  def run(self, proc_func, args=None, kwargs=None):
+    """Runs `proc_func` with `args` and `kwargs` on all jobs.
+
+    Args:
+      proc_func: The function to be run.
+      args: Optional positional arguments to be supplied in `proc_func`.
+ kwargs: Optional keyword arguments to be supplied in `proc_func`. + + Returns: + a _ProcessStatusInfo. + """ + try: + return_value = proc_func(*args, **kwargs) + is_successful = True + exc_info = None + except Exception: # pylint: disable=broad-except + return_value = None + is_successful = False + exc_info = sys.exc_info() + finally: + return _ProcessStatusInfo( # pylint: disable=lost-exception + is_successful=is_successful, + exc_info=exc_info, + return_value=return_value) + + class SubprocessTimeoutError(RuntimeError): """An error that indicates there is at least one subprocess timing out. diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index aeba43b6b7c..d76ef5a5a3c 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -22,6 +22,8 @@ import json import os import threading import time +import unittest + from absl import logging from tensorflow.python.distribute import multi_process_runner @@ -45,7 +47,7 @@ def proc_func_that_adds_simple_return_data(): return 'dummy_data' -def proc_func_that_return_args_and_kwargs(*args, **kwargs): +def proc_func_that_returns_args_and_kwargs(*args, **kwargs): return list(args) + list(kwargs.items()) @@ -53,6 +55,20 @@ def proc_func_with_barrier(): return multi_process_runner.barrier() +def proc_func_that_returns_pid(): + return os.getpid() + + +V = None + + +def proc_func_that_sets_global(val): + global V + old_val = V + V = val + return old_val + + class MultiProcessRunnerTest(test.TestCase): def _worker_idx(self): @@ -95,7 +111,7 @@ class MultiProcessRunnerTest(test.TestCase): def test_multi_process_runner_args_passed_correctly(self): return_value = multi_process_runner.run( - proc_func_that_return_args_and_kwargs, + proc_func_that_returns_args_and_kwargs, multi_worker_test_base.create_cluster_spec(num_workers=1), args=('a', 'b'), kwargs={ @@ -325,5 +341,54 @@ class MultiProcessRunnerTest(test.TestCase): for line in list_to_assert)) +class MultiProcessPoolRunnerTest(test.TestCase): + + def test_same_process_across_runs(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + for _ in range(3): + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_exceptions_in_sub_process(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + with self.assertRaisesRegexp(ValueError, 'This is an error.'): + runner.run(proc_func_that_errors) + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_tf_config(self): + cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + result = runner.run(proc_func_that_adds_task_type_in_return_data) + + job_count_dict = {'worker': 2, 'chief': 1} + for data in result: + job_count_dict[data] -= 1 + + self.assertEqual(job_count_dict['worker'], 0) + self.assertEqual(job_count_dict['chief'], 0) + + @unittest.expectedFailure + def test_exception_in_main_process(self): + # When there's an exception in the main process, __del__() is not called. + # This test is to verify MultiProcessPoolRunner can cope with __del__() not + # being called. 
+ cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + runner.run(proc_func_that_returns_pid) + raise ValueError('failure') + + def test_initializer(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner( + cluster_spec, initializer=lambda: proc_func_that_sets_global(1)) + result = runner.run(proc_func_that_sets_global, args=(2,)) + self.assertAllEqual(result, [1, 1]) + + if __name__ == '__main__': multi_process_runner.test_main() From 965b93f2ee2ecf6dc7152adc31a76802b0ede85f Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:05:14 -0400 Subject: [PATCH 0566/1390] test pathlib path for save weights --- tensorflow/python/keras/saving/save_test.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 66364666841..5b5da8c5047 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -119,6 +119,20 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): save.save_model(self.model, path, save_format='tf') save.load_model(path) + @test_util.run_v2_only + def test_save_load_weights_tf_pathlib(self): + if sys.version_info >= (3, 6): + path = pathlib.Path(self.get_temp_dir()) / 'model' + self.model.save_weights(path, save_format='tf') + self.model.load_weights(path) + + @test_util.run_v2_only + def test_save_load_weights_hdf5_pathlib(self): + if sys.version_info >= (3, 6): + path = pathlib.Path(self.get_temp_dir()) / 'model' + self.model.save_weights(path, save_format='h5') + self.model.load_weights(path) + @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_saving_with_dense_features(self): cols = [ From fd3c95a3b4eb1418d6f87bed916ec17fff44adf0 Mon Sep 17 00:00:00 2001 From: jonah-kohn <51345541+jonah-kohn@users.noreply.github.com> Date: Thu, 18 Jun 2020 17:07:30 -0700 Subject: [PATCH 0567/1390] Include show_dtype param in plot_model tests. 
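
The updated checks only assert that an image file is produced when the new
flag is passed; roughly (an illustrative sketch, not the exact test code, and
the file name is a placeholder):

    from tensorflow import keras
    from tensorflow.python.keras.utils import vis_utils
    from tensorflow.python.lib.io import file_io

    model = keras.Sequential()
    model.add(keras.layers.Dense(5, input_shape=(3,), name='dense'))
    vis_utils.plot_model(model, to_file='model_1.png',
                         show_shapes=True, show_dtype=True)
    # The rendered dtype row is not inspected; only file creation is verified.
    assert file_io.file_exists('model_1.png')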
---
 tensorflow/python/keras/utils/vis_utils_test.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/utils/vis_utils_test.py b/tensorflow/python/keras/utils/vis_utils_test.py
index 984014216be..0eb38a1d48f 100644
--- a/tensorflow/python/keras/utils/vis_utils_test.py
+++ b/tensorflow/python/keras/utils/vis_utils_test.py
@@ -36,7 +36,8 @@ class ModelToDotFormatTest(test.TestCase):
     model.add(keras.layers.Dense(5, name='dense'))
     dot_img_file = 'model_1.png'
     try:
-      vis_utils.plot_model(model, to_file=dot_img_file, show_shapes=True)
+      vis_utils.plot_model(model, to_file=dot_img_file,
+                           show_shapes=True, show_dtype=True)
       self.assertTrue(file_io.file_exists(dot_img_file))
       file_io.delete_file(dot_img_file)
     except ImportError:
@@ -62,7 +63,8 @@ class ModelToDotFormatTest(test.TestCase):
     dot_img_file = 'model_2.png'
     try:
       vis_utils.plot_model(
-          model, to_file=dot_img_file, show_shapes=True, expand_nested=True)
+          model, to_file=dot_img_file, show_shapes=True,
+          show_dtype=True, expand_nested=True)
       self.assertTrue(file_io.file_exists(dot_img_file))
       file_io.delete_file(dot_img_file)
     except ImportError:
@@ -76,7 +78,8 @@ class ModelToDotFormatTest(test.TestCase):
     dot_img_file = 'model_3.png'
     try:
       vis_utils.plot_model(
-          model, to_file=dot_img_file, show_shapes=True, expand_nested=True)
+          model, to_file=dot_img_file, show_shapes=True,
+          show_dtype=True, expand_nested=True)
       self.assertTrue(file_io.file_exists(dot_img_file))
       file_io.delete_file(dot_img_file)
     except ImportError:
@@ -88,7 +91,8 @@ class ModelToDotFormatTest(test.TestCase):
     dot_img_file = 'model_4.png'
     try:
       vis_utils.plot_model(
-          model, to_file=dot_img_file, show_shapes=True, expand_nested=True)
+          model, to_file=dot_img_file, show_shapes=True,
+          show_dtype=True, expand_nested=True)
       self.assertTrue(file_io.file_exists(dot_img_file))
       file_io.delete_file(dot_img_file)
     except ImportError:

From ae76bc79213d4559b113899f438cf54283ec11c2 Mon Sep 17 00:00:00 2001
From: Marat Dukhan
Date: Thu, 18 Jun 2020 17:19:14 -0700
Subject: [PATCH 0568/1390] Update XNNPACK dependency and document sparse inference capability

PiperOrigin-RevId: 317213816
Change-Id: I35431b40fd63d836d4fe979f65a71a181c0c820d
---
 tensorflow/lite/delegates/xnnpack/README.md | 37 +++++++++++++++++++++
 tensorflow/workspace.bzl                    |  8 ++---
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md
index 97d2d5565db..d94e92c7306 100644
--- a/tensorflow/lite/delegates/xnnpack/README.md
+++ b/tensorflow/lite/delegates/xnnpack/README.md
@@ -238,6 +238,43 @@ Below is the list of current operators and limitations:
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.

+### Sparse Inference (experimental)
+
+XNNPACK backend supports sparse inference for CNN models described in the
+[Fast Sparse ConvNets](https://arxiv.org/abs/1911.09723) paper. This
+functionality must be enabled at build-time via the
+`--define xnn_enable_sparse=true` Bazel flag. Sparse inference is restricted
+to subgraphs with the following operators:
+
+* Sparse subgraph must start with a 3x3 stride-2 `CONV_2D` operator with
+  padding 1 on each side, no dilation, and 3 input channels.
+* Sparse subgraph must end with a `MEAN` operator that does reduction across
+  spatial axes.
+* Sparse subgraph may contain the following operators:
+  * `CONV_2D` with 1x1 kernel and no padding.
It is important to have high + sparsity (at least 70%) in the filter of this operator to get speedup + over dense inference. + * `DEPTHWISE_CONV_2D` with 3x3 kernel, stride 1, no dilation, and padding 1 + on each side. + * `DEPTHWISE_CONV_2D` with 3x3 kernel, stride 2, no dilation, and padding 1 + on each side. + * `DEPTHWISE_CONV_2D` with 5x5 kernel, stride 1, no dilation, and padding 2 + on each side. + * `DEPTHWISE_CONV_2D` with 5x5 kernel, stride 2, no dilation, and padding 2 + on each side. + * `ADD` and `MUL` operators where both inputs are 4D tensors. If one of the + inputs to `ADD` or `MUL` is a constant tensor, it must be representable as + either a scalar, or a 1D vector. + * Unary elementwise operators `ABS`, `CEIL`, `FLOOR`, `HARD_SWISH`, + `LEAKY_RELU`, `LOGISTIC`, `NEG`, `RELU`, `RELU6`, `RELU_N1_TO_1`, `ROUND`, + and `SQUARE`. + +Pre-trained [Fast Sparse ConvNets models](https://github.com/google-research/google-research/tree/master/fastconvnets) +provide examples that satisfy these constrains. + +In addition to acceleration, sparse models get the compression benefit by +storing only non-zero values in the [TensorFlow Lite file format](https://github.com/tensorflow/tensorflow/blob/4aea552e064cf92330e07e83a3b5a1ca2a7034d0/tensorflow/lite/schema/schema.fbs#L84-L109). + ### Other limitations * Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f5b0b7537dc..52c573628ac 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "4af883fea0a6ada106867f29670a6c0b7af74bee85d74a2e04356a670814a3d4", - strip_prefix = "XNNPACK-69a6a7667d96a84c596b0f4e00632b2037c17723", + sha256 = "2527a30464b43bd03f137b2c455a0381e49eae63d09cfeee128a717dfbe962d5", + strip_prefix = "XNNPACK-8b283aa30a3186c6e640aed520543e9c067132d2", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", - "https://github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d2.zip", + "https://github.com/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d2.zip", ], ) From 55e33877b8f81b5ede6365456ed3aa89bbe16d8e Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:37:52 -0400 Subject: [PATCH 0569/1390] Revert "test pathlib path for save weights" This reverts commit 965b93f2ee2ecf6dc7152adc31a76802b0ede85f. 
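The diff below removes two tests that round-trip weights through a `pathlib.Path`. As a rough sketch of what those tests exercised (assuming TF 2.x eager mode, Python 3.6+, and h5py for the HDF5 case; whether a bare `Path` is accepted directly is exactly what the tests probe, and `str(path)` remains the conservative fallback):

```python
# Sketch of the reverted behavior: saving/loading weights via a pathlib.Path
# in both the TensorFlow-checkpoint and HDF5 formats.
import pathlib
import tempfile

from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(2, input_shape=(3,))])

tf_path = pathlib.Path(tempfile.mkdtemp()) / 'model'
model.save_weights(tf_path, save_format='tf')   # checkpoint files under tf_path*
model.load_weights(tf_path)

h5_path = pathlib.Path(tempfile.mkdtemp()) / 'weights.h5'
model.save_weights(h5_path, save_format='h5')   # single HDF5 file (requires h5py)
model.load_weights(h5_path)
```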
--- tensorflow/python/keras/saving/save_test.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 5b5da8c5047..66364666841 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -119,20 +119,6 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): save.save_model(self.model, path, save_format='tf') save.load_model(path) - @test_util.run_v2_only - def test_save_load_weights_tf_pathlib(self): - if sys.version_info >= (3, 6): - path = pathlib.Path(self.get_temp_dir()) / 'model' - self.model.save_weights(path, save_format='tf') - self.model.load_weights(path) - - @test_util.run_v2_only - def test_save_load_weights_hdf5_pathlib(self): - if sys.version_info >= (3, 6): - path = pathlib.Path(self.get_temp_dir()) / 'model' - self.model.save_weights(path, save_format='h5') - self.model.load_weights(path) - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_saving_with_dense_features(self): cols = [ From edc445eb184633f0ca10270bca6e9c0b834ab454 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:38:02 -0400 Subject: [PATCH 0570/1390] Revert "only test pathlib on python v>3.6" This reverts commit c3ca4da9b46e3148bdb913d08fec3fd2727158e1. --- tensorflow/python/keras/saving/save_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 66364666841..98d78735ad1 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -73,10 +73,9 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): @test_util.run_v2_only def test_save_format_defaults_pathlib(self): - if sys.version_info >= (3, 6): - path = pathlib.Path(self.get_temp_dir()) / 'model_path' - save.save_model(self.model, path) - self.assert_saved_model(path) + path = pathlib.Path(self.get_temp_dir()) / 'model_path' + save.save_model(self.model, path) + self.assert_saved_model(path) @test_util.run_v2_only def test_save_hdf5(self): From 10fb2155fb720f9e0e70d9e48a934383b4b42c91 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:38:08 -0400 Subject: [PATCH 0571/1390] Revert "modify docstring" This reverts commit 5f2e0240ee7977042e41d9c29c349a7b14301290. --- tensorflow/python/saved_model/loader_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 06cd988130d..71f6ed16c9d 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -73,7 +73,7 @@ def parse_saved_model(export_dir): """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`. Args: - export_dir: String or Pathlike, path to the directory containing the SavedModel file. + export_dir: Directory containing the SavedModel file. Returns: A `SavedModel` protocol buffer. From 9f0e739f5f1623b74c1835e9209c7c25b4c49380 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:38:15 -0400 Subject: [PATCH 0572/1390] Revert "convert export_dir as pathlike object to str in parse_saved_model" This reverts commit 1c72a6c65e1733b55286c6361142a39d699732dc. 
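The revert below drops the `compat.path_to_str` conversion from `parse_saved_model`. As a sketch of what that conversion does before `export_dir` is joined with the `saved_model.pb` / `saved_model.pbtxt` file names: the snippet uses the TensorFlow-internal helper visible in the diff (`tensorflow.python.util.compat` is not public API, so treat that import path as an assumption); `os.fspath` is the standard-library equivalent for `os.PathLike` objects.

```python
# Sketch of the path normalization the reverted change performed inside
# parse_saved_model before building the paths to the SavedModel files.
import os
import pathlib

from tensorflow.python.util import compat  # internal helper, not public API

export_dir = pathlib.Path('/tmp') / 'my_saved_model'

# Both produce the plain string '/tmp/my_saved_model' on POSIX systems.
print(compat.path_to_str(export_dir))  # helper referenced in the diff below
print(os.fspath(export_dir))           # standard-library equivalent for PathLike
```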
--- tensorflow/python/saved_model/loader_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 71f6ed16c9d..2df2bea428e 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -83,11 +83,11 @@ def parse_saved_model(export_dir): """ # Build the path to the SavedModel in pbtxt format. path_to_pbtxt = os.path.join( - compat.as_bytes(compat.path_to_str(export_dir)), + compat.as_bytes(export_dir), compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT)) # Build the path to the SavedModel in pb format. path_to_pb = os.path.join( - compat.as_bytes(compat.path_to_str(export_dir)), + compat.as_bytes(export_dir), compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB)) # Parse the SavedModel protocol buffer. From 5869f657624ece35544c3fe54f999b2e3f449305 Mon Sep 17 00:00:00 2001 From: Yixing Fu Date: Thu, 18 Jun 2020 20:38:24 -0400 Subject: [PATCH 0573/1390] Revert "add save load test for pathlib path" This reverts commit 1363f0f6e89cdc73ca46d43475bdb35750ed2e50. --- tensorflow/python/keras/saving/save_test.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 98d78735ad1..5c5846fe738 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -71,12 +71,6 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): save.save_model(self.model, path) self.assert_saved_model(path) - @test_util.run_v2_only - def test_save_format_defaults_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model_path' - save.save_model(self.model, path) - self.assert_saved_model(path) - @test_util.run_v2_only def test_save_hdf5(self): path = os.path.join(self.get_temp_dir(), 'model') @@ -87,13 +81,6 @@ class TestSaveModel(test.TestCase, parameterized.TestCase): 'requires the model to be a Functional model or a Sequential model.'): save.save_model(self.subclassed_model, path, save_format='h5') - @test_util.run_v2_only - def test_save_load_hdf5_pathlib(self): - if sys.version_info >= (3, 6): - path = pathlib.Path(self.get_temp_dir()) / 'model' - save.save_model(self.model, path, save_format='h5') - save.load_model(path) - @test_util.run_v2_only def test_save_tf(self): path = os.path.join(self.get_temp_dir(), 'model') From 723751b20ef2aa0a4af39cad2581fd483ae78ad7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 18:15:12 -0700 Subject: [PATCH 0574/1390] Add set_outfeed_config in XLA HloInstruction. 
PiperOrigin-RevId: 317222410 Change-Id: I5de8a5067f1002a9d656d4e26d145ffe3fe372ed --- tensorflow/compiler/xla/service/hlo_instruction.cc | 4 ++++ tensorflow/compiler/xla/service/hlo_instruction.h | 3 +++ tensorflow/compiler/xla/service/hlo_instructions.h | 1 + 3 files changed, 8 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index cfa21b95dd2..6de76c1cc63 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -3908,6 +3908,10 @@ const string& HloInstruction::outfeed_config() const { return Cast(this)->outfeed_config(); } +void HloInstruction::set_outfeed_config(const string& config) { + return Cast(this)->set_outfeed_config(config); +} + const std::vector& HloInstruction::replica_groups() const { return Cast(this)->replica_groups(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 7a5d506b681..f3bb59ff625 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1755,6 +1755,9 @@ class HloInstruction { // Returns the config for the Outfeed instruction. const string& outfeed_config() const; + // Delegates to HloOutfeedInstruction::set_outfeed_config. + void set_outfeed_config(const string& config); + // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 6da01dc088e..f5a963ef063 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1141,6 +1141,7 @@ class HloOutfeedInstruction : public HloInstruction { const Shape& outfeed_shape() const { return outfeed_shape_; } // Returns the config for the Outfeed instruction. const string& outfeed_config() const { return outfeed_config_; } + void set_outfeed_config(const string& config) { outfeed_config_ = config; } // Returns a serialized representation of this instruction. 
HloInstructionProto ToProto() const override; From 50763f6e3db1685193c53f0e19405f87f1bcc3b4 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Fri, 19 Jun 2020 01:21:19 +0000 Subject: [PATCH 0575/1390] Extend typing to variable types, Annotate params with defaults --- tensorflow/python/framework/python_op_gen.cc | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 79c8800418c..1ef3f9c342b 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -361,7 +361,7 @@ string GenEagerPythonOp::Code() { // Add type annotations to param if (type_map.find(param.GetName()) != type_map.end()) { - if(!type_map[param.GetName()].empty()) { + if (!type_map[param.GetName()].empty()) { strings::StrAppend(¶meters, ": ", type_map[param.GetName()]); } } @@ -372,6 +372,19 @@ string GenEagerPythonOp::Code() { if (!parameters.empty()) strings::StrAppend(¶meters, ", "); if (!parameters_with_defaults.empty()) strings::StrAppend(¶meters_with_defaults, ", "); + + // Add type annotations to param_and_default + if (type_map.find(param_and_default.first.GetName()) != type_map.end()) { + if (!type_map[param_and_default.first.GetName()].empty()) { + strings::StrAppend(¶meters, ": ", type_map[param_and_default.first.GetName()]); + strings::StrAppend(¶meters_with_defaults, + param_and_default.first.GetRenameTo(), ": ", + type_map[param_and_default.first.GetName()], " ", + "= ", param_and_default.second); + continue; + } + } + strings::StrAppend(¶meters, param_and_default.first.GetRenameTo()); strings::StrAppend(¶meters_with_defaults, param_and_default.first.GetRenameTo(), "=", @@ -427,6 +440,9 @@ std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { if (attr.type() == "type") { const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); type_map[attr.name()] = type_var_name; + } else if (attr.type() == "bool" || attr.type() == "float" || + attr.type() == "int" || attr.type() == "bytes") { + type_map[attr.name()] = attr.type(); } } @@ -507,7 +523,7 @@ void GenEagerPythonOp::GenerateTypeVars() { } } - if(added_typevar) strings::StrAppend(&result_, "\n"); + if (added_typevar) strings::StrAppend(&result_, "\n"); } void GenEagerPythonOp::AddReturnTypeAnnotation(std::unordered_map& type_map) { From 0bef067291d26baf7cde98e7d3d6fbcf54161d58 Mon Sep 17 00:00:00 2001 From: peng Date: Fri, 19 Jun 2020 09:24:20 +0800 Subject: [PATCH 0576/1390] fix test error with no GPU config --- tensorflow/core/grappler/optimizers/remapper_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index a947210d8a6..56f31cb49b3 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -485,11 +485,6 @@ TEST_F(RemapperTest, FuseConv2DWithBiasAndActivationOnGPU) { item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}}; TF_ASSERT_OK(s.ToGraphDef(&item.graph)); - // Place all nodes on GPU. 
- for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:GPU:0"); - } - Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape //Remapper optimizer(RewriterConfig::ON); GraphDef output; From 4e5c26fb53d6e6467ebbb5a65a73af8872d383d3 Mon Sep 17 00:00:00 2001 From: Jonah Kohn <51345541+jonah-kohn@users.noreply.github.com> Date: Thu, 18 Jun 2020 18:40:17 -0700 Subject: [PATCH 0577/1390] Revert commit 3833402 for modular PRs --- tensorflow/python/keras/optimizer_v2/optimizer_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index d8992bbe3e0..c55b332bfc0 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -910,7 +910,7 @@ class OptimizerV2(trackable.Trackable): return value() if tensor_util.is_tensor(value): return backend.get_value(value) - return float(value) + return value def variables(self): """Returns variables of this Optimizer based on the order created.""" From 8e70fe45468fd1c0b845814907bdf7d930e30f46 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Fri, 19 Jun 2020 02:09:23 +0000 Subject: [PATCH 0578/1390] Generate TypeVar name once, Change input & attr enumeration --- tensorflow/python/framework/python_op_gen.cc | 21 +++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 1ef3f9c342b..4ecfccb611f 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -180,10 +180,10 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { void AddRawOpExport(const string& parameters); - void GenerateTypeVars(); - std::unordered_map GetTypeAnnotationMap(); + void GenerateTypeVars(std::unordered_map& type_map); + void AddReturnTypeAnnotation(std::unordered_map& type_map); void AddAttrForArg(const string& attr, int arg_index) { @@ -435,8 +435,7 @@ string GenEagerPythonOp::Code() { std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { std::unordered_map type_map; // Mapping attrs to TypeVars - for (int i = 0; i GenEagerPythonOp::GetTypeAnnotationMap() { } // Mapping input Tensors to their types - for (int i = 0; i < op_def_.input_arg_size(); ++i) { - const auto& arg = op_def_.input_arg(i); + for (const auto& arg : op_def_.input_arg()) { // Do not add type annotations to args that accept a sequence of tensors if (!arg.number_attr().empty()) continue; string type_annotation; @@ -466,7 +464,7 @@ std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { type_map[arg.name()] = type_annotation; } - // Mapping output Tensor to its types + // Mapping output Tensor to its type if (op_def_.output_arg_size() == 1) { const auto& arg = op_def_.output_arg(0); string type_annotation; @@ -488,10 +486,9 @@ std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { } // Generate TypeVars using attrs -void GenEagerPythonOp::GenerateTypeVars() { +void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type_map) { bool added_typevar = false; - for (int i = 0; i allowed_types; for (int t : attr.allowed_values().list().type()) { @@ -517,7 +514,7 @@ void GenEagerPythonOp::GenerateTypeVars() { strings::StrAppend(&typevar_dtypes, *it); } - const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); + const string type_var_name = 
type_map[attr.name()]; strings::StrAppend(&result_, type_var_name, " = TypeVar(\"", type_var_name, "\", ", typevar_dtypes,")\n"); added_typevar = true; } @@ -863,7 +860,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, const string& eager_not_allowed_error, std::unordered_map& type_map) { if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - GenerateTypeVars(); + GenerateTypeVars(type_map); } if (api_def_.visibility() == ApiDef::VISIBLE) { strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); From b7edd44ee0f8c264e457c48138474f6e1bf5b18e Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Thu, 18 Jun 2020 19:08:04 -0700 Subject: [PATCH 0579/1390] Enable type annotations for python/ops. PiperOrigin-RevId: 317229132 Change-Id: I7055e650308c2fc83969385dd25e86fb5b073d75 --- tensorflow/python/ops/logging_ops.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 8ca63f55987..02fce277690 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -54,11 +54,9 @@ except NameError: # call relies on certain conditionals for its dependencies. Use # control_flow_ops.Assert. -# Assert and Print are special symbols in python, so we must -# have an upper-case version of them. -# -# For users with Python 3 or Python 2.7 -# with `from __future__ import print_function`, we could also allow lowercase. +# Assert and Print are special symbols in Python 2, so we must +# have an upper-case version of them. When support for it is dropped, +# we can allow lowercase. # See https://github.com/tensorflow/tensorflow/issues/18053 @@ -83,11 +81,6 @@ def Print(input_, data, message=None, first_n=None, summarize=None, name=None): with jupyter notebook (printing to the notebook *server's* output, not into the notebook). - Additionally, to use tf.print in python 2.7, users must make sure to import - the following: - - `from __future__ import print_function` - Args: input_: A tensor passed through this op. data: A list of tensors to print out when op is evaluated. @@ -148,11 +141,6 @@ def print_v2(*inputs, **kwargs): Python objects. Printed tensors will recursively show the first and last elements of each dimension to summarize. - @compatibility(python2) - In python 2.7, make sure to import the following: - `from __future__ import print_function` - @end_compatibility - Example: Single-input usage: From 13fe5862de7b95fd91aeec8f2d71e9f2e77b699b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 18 Jun 2020 19:11:15 -0700 Subject: [PATCH 0580/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/c830d517b4e4 PiperOrigin-RevId: 317229564 Change-Id: I10163c3e668996252d294018794081394cc0d25c --- third_party/mlir/test.BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 23287ce28d6..14c2ba7778e 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -166,6 +166,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", "@llvm-project//mlir:TargetROCDLIR", From 4a14e778d64853a236941259693aa3c5813c18d8 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Thu, 18 Jun 2020 19:11:40 -0700 Subject: [PATCH 0581/1390] [XLA] Introduce ManifestCheckingTest PiperOrigin-RevId: 317229603 Change-Id: Ibcc9ea3895d520024f5d80d52330aeb3b970585d --- tensorflow/compiler/xla/tests/BUILD | 23 +++- tensorflow/compiler/xla/tests/build_defs.bzl | 7 +- .../xla/tests/client_library_test_base.h | 3 +- tensorflow/compiler/xla/tests/hlo_test_base.h | 3 +- .../xla/tests/local_client_test_base.h | 3 +- .../xla/tests/manifest_checking_test.cc | 129 ++++++++++++++++++ .../xla/tests/manifest_checking_test.h | 35 +++++ tensorflow/compiler/xla/tests/test_macros.cc | 89 +----------- tensorflow/compiler/xla/tests/test_macros.h | 118 +--------------- 9 files changed, 201 insertions(+), 209 deletions(-) create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.cc create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.h diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index e1863a8a4cf..9b36117602b 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -52,16 +52,26 @@ cc_library( name = "test_macros_header", testonly = True, hdrs = ["test_macros.h"], - deps = [ - "//tensorflow/compiler/xla:types", - "//tensorflow/core:test", - "@com_google_absl//absl/strings", - ], ) # Generate a test_macros_${BACKEND} library per backend with the proper copts. 
generate_backend_test_macros() +cc_library( + name = "manifest_checking_test", + testonly = True, + srcs = ["manifest_checking_test.cc"], + hdrs = ["manifest_checking_test.h"], + deps = [ + ":test_macros_header", + "//tensorflow/core:regexp_internal", + "//tensorflow/core:test", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "test_utils", srcs = ["test_utils.cc"], @@ -136,6 +146,7 @@ cc_library( hdrs = ["hlo_test_base.h"], deps = [ ":literal_test_util", + ":manifest_checking_test", ":test_utils", ":verified_hlo_module", "//tensorflow/compiler/xla:debug_options_flags", @@ -193,6 +204,7 @@ cc_library( srcs = ["client_library_test_base.cc"], hdrs = ["client_library_test_base.h"], deps = [ + ":manifest_checking_test", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", @@ -273,6 +285,7 @@ cc_library( hdrs = ["local_client_test_base.h"], deps = [ ":client_library_test_base", + ":manifest_checking_test", ":verified_hlo_module", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index c0c0751b0de..94d870aa2ef 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -266,11 +266,6 @@ def generate_backend_test_macros(backends = []): "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest, ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", - "//tensorflow/core:test", + "//tensorflow/core/platform:logging", ], ) diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 790497f888e..17bb70bdb42 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/bitmap.h" @@ -62,7 +63,7 @@ std::vector ExpandUseBfloat16( } // A client library test establishes an in-process XLA client connection. -class ClientLibraryTestBase : public ::testing::Test { +class ClientLibraryTestBase : public ManifestCheckingTest { protected: explicit ClientLibraryTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 85b1876dd3c..17c2a55ba5b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -67,7 +68,7 @@ namespace xla { // ) // // For a more detailed example, see "../tests/sample_text_test.cc". -class HloTestBase : public ::testing::Test { +class HloTestBase : public ManifestCheckingTest { public: // Creates a new HLO module for a test. The module created will have // TestName() for its name; it will also automatically populate its debug diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index ea457024618..c1951ad1021 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/mutex.h" @@ -75,7 +76,7 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator { }; // A base class for tests which exercise the LocalClient interface. -class LocalClientTestBase : public ::testing::Test { +class LocalClientTestBase : public ManifestCheckingTest { protected: struct EigenThreadPoolWrapper; explicit LocalClientTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.cc b/tensorflow/compiler/xla/tests/manifest_checking_test.cc new file mode 100644 index 00000000000..8806290472d --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.cc @@ -0,0 +1,129 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" + +namespace xla { + +namespace { + +// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is +// disabled - a sequence of regexps. 
+using ManifestT = absl::flat_hash_map>; + +ManifestT ReadManifest() { + ManifestT manifest; + + absl::string_view path = absl::NullSafeStringView(kDisabledManifestPath); + if (path.empty()) { + return manifest; + } + + // Note: parens are required to disambiguate vs function decl. + std::ifstream file_stream((std::string(path))); + std::string contents((std::istreambuf_iterator(file_stream)), + std::istreambuf_iterator()); + + std::vector lines = absl::StrSplit(contents, '\n'); + for (std::string& line : lines) { + auto comment = line.find("//"); + if (comment != std::string::npos) { + line = line.substr(0, comment); + } + if (line.empty()) { + continue; + } + absl::StripTrailingAsciiWhitespace(&line); + std::vector pieces = absl::StrSplit(line, ' '); + CHECK_GE(pieces.size(), 1); + auto& platforms = manifest[pieces[0]]; + for (size_t i = 1; i < pieces.size(); ++i) { + platforms.push_back(pieces[i]); + } + } + return manifest; +} + +} // namespace + +void ManifestCheckingTest::SetUp() { + const testing::TestInfo* test_info = + testing::UnitTest::GetInstance()->current_test_info(); + absl::string_view test_case_name = test_info->test_suite_name(); + absl::string_view test_name = test_info->name(); + VLOG(1) << "test_case_name: " << test_case_name; + VLOG(1) << "test_name: " << test_name; + + // Remove the type suffix from the test case name. + if (const char* type_param = test_info->type_param()) { + VLOG(1) << "type_param: " << type_param; + size_t last_slash = test_case_name.rfind('/'); + test_case_name = test_case_name.substr(0, last_slash); + VLOG(1) << "test_case_name: " << test_case_name; + } + + // Remove the test instantiation name if it is present. + auto first_slash = test_case_name.find('/'); + if (first_slash != test_case_name.npos) { + test_case_name.remove_prefix(first_slash + 1); + VLOG(1) << "test_case_name: " << test_case_name; + } + + ManifestT manifest = ReadManifest(); + + // If the test name ends with a slash followed by one or more characters, + // strip that off. + auto last_slash = test_name.rfind('/'); + if (last_slash != test_name.npos) { + test_name = test_name.substr(0, last_slash); + VLOG(1) << "test_name: " << test_name; + } + + // First try full match: test_case_name.test_name + // If that fails, try to find just the test_case_name; this would disable all + // tests in the test case. + auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); + if (it == manifest.end()) { + it = manifest.find(test_case_name); + if (it == manifest.end()) { + return; + } + } + + // Expect a full match vs. one of the platform regexps to disable the test. + const std::vector& disabled_platforms = it->second; + auto platform_string = kTestPlatform; + for (const auto& s : disabled_platforms) { + if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { + GTEST_SKIP(); + return; + } + } + + // We didn't hit in the disabled manifest entries, so don't disable it. +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.h b/tensorflow/compiler/xla/tests/manifest_checking_test.h new file mode 100644 index 00000000000..4f44ed76a3e --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.h @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ + +#include "tensorflow/core/platform/test.h" + +namespace xla { + +// This class allows us to intercept the test name and use an arbitrary +// heuristic to decide whether the test case should be disabled. We +// determine whether the test case should be disabled by resolving the (test +// case name, test name) in a manifest file. +class ManifestCheckingTest : public ::testing::Test { + protected: + // This method runs before each test runs. + void SetUp() override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc index dc9ac7b684a..9e85af76e89 100644 --- a/tensorflow/compiler/xla/tests/test_macros.cc +++ b/tensorflow/compiler/xla/tests/test_macros.cc @@ -15,93 +15,18 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" namespace xla { -namespace { -// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is -// disabled - a sequence of regexps. -using ManifestT = absl::flat_hash_map>; - -ManifestT ReadManifest() { - ManifestT manifest; - - string path = XLA_DISABLED_MANIFEST; - if (path.empty()) { - return manifest; - } - - std::ifstream file_stream(path); - // Note: parens are required to disambiguate vs function decl. - string contents((std::istreambuf_iterator(file_stream)), - std::istreambuf_iterator()); - - std::vector lines = absl::StrSplit(contents, '\n'); - for (string& line : lines) { - auto comment = line.find("//"); - if (comment != string::npos) { - line = line.substr(0, comment); - } - if (line.empty()) { - continue; - } - absl::StripTrailingAsciiWhitespace(&line); - std::vector pieces = absl::StrSplit(line, ' '); - CHECK_GE(pieces.size(), 1); - auto& platforms = manifest[pieces[0]]; - for (int64 i = 1; i < pieces.size(); ++i) { - platforms.push_back(pieces[i]); - } - } - return manifest; +static bool InitModule() { + kDisabledManifestPath = XLA_DISABLED_MANIFEST; + VLOG(1) << "kDisabledManifestPath: " << kDisabledManifestPath; + kTestPlatform = XLA_PLATFORM; + VLOG(1) << "kTestPlatform: " << kTestPlatform; + return false; } -} // namespace - -std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name) { - ManifestT manifest = ReadManifest(); - - // If the test name ends with a slash followed by one or more digits, strip - // that off; this is just a shard number, and matching on this would be - // unstable even if someone wanted to do it. 
- static LazyRE2 shard_num_pattern = {R"(/\d+$)"}; - absl::string_view suffix; - if (RE2::PartialMatch(test_name, *shard_num_pattern, &suffix)) { - test_name.remove_suffix(suffix.size()); - } - - // First try full match: test_case_name.test_name - // If that fails, try to find just the test_case_name; this would disable all - // tests in the test case. - auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); - if (it == manifest.end()) { - it = manifest.find(test_case_name); - if (it == manifest.end()) { - return std::string(test_name); - } - } - - // Expect a full match vs. one of the platform regexps to disable the test. - const std::vector& disabled_platforms = it->second; - string platform_string = XLA_PLATFORM; - for (const auto& s : disabled_platforms) { - if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { - return absl::StrCat("DISABLED_", test_name); - } - } - - // We didn't hit in the disabled manifest entries, so don't disable it. - return std::string(test_name); -} +static bool module_initialized = InitModule(); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 33d2dff9721..f62bccbe850 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -28,12 +28,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ #define TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ -#include - -#include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" - #define DISABLED_ON_CPU(X) X #define DISABLED_ON_GPU(X) X #define DISABLED_ON_GPU_ROCM(X) X @@ -79,117 +73,15 @@ limitations under the License. namespace xla { -// Reads a disabled manifest file to resolve whether test cases should be -// disabled on a particular platform. For a test that should be disabled, -// returns DISABLED_ prepended to its name; otherwise returns the test name -// unmodified. -std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name); +inline const char *kDisabledManifestPath = nullptr; +inline const char *kTestPlatform = nullptr; } // namespace xla -// This is the internal "gtest" class instantiation -- it is identical to the -// GTEST_TEST_ macro, except that we intercept the test name for potential -// modification by PrependDisabledIfIndicated. That file can use an arbitrary -// heuristic to decide whether the test case should be disabled, and we -// determine whether the test case should be disabled by resolving the (test -// case name, test name) in a manifest file. 
-#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - \ - private: \ - virtual void TestBody(); \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::test_info_ = \ - ::testing::RegisterTest( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* { \ - return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)(); \ - }); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() +#define XLA_TEST_F(test_fixture, test_name) TEST_F(test_fixture, test_name) -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. -// -// Per usual, you can see what tests are available via --gunit_list_tests and -// choose to run tests that have been disabled via the manifest via -// --gunit_also_run_disabled_tests. -#define XLA_TEST_F(test_fixture, test_name) \ - XLA_GTEST_TEST_(test_fixture, test_name, test_fixture) +#define XLA_TEST_P(test_case_name, test_name) TEST_P(test_case_name, test_name) -// Likewise, this is identical to the TEST_P macro from "gtest", but -// potentially disables the test based on the DISABLED_MANIFEST file. -// -// We have to wrap this in an outer layer so that any DISABLED_ON_* macros will -// be properly expanded before the stringification occurs. -#define XLA_TEST_P_IMPL_(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestCasePatternHolder( \ - #test_case_name, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestPattern( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - new ::testing::internal::TestMetaFactory()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#define XLA_TEST_P(test_case_name, test_name) \ - XLA_TEST_P_IMPL_(test_case_name, test_name) - -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. 
-#define XLA_TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel, \ - GTEST_TYPE_PARAMS_(CaseName)>:: \ - Register( \ - "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - #CaseName, \ - ::xla::PrependDisabledIfIndicated(#CaseName, #TestName).c_str(), \ - 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, \ - TestName)::TestBody() +#define XLA_TYPED_TEST(CaseName, TestName) TYPED_TEST(CaseName, TestName) #endif // TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ From 9c4b749b09b958c436e0681a4276b47fc9316a8a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 19:17:40 -0700 Subject: [PATCH 0582/1390] Internal change PiperOrigin-RevId: 317230321 Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9 --- tensorflow/core/platform/BUILD | 7 - tensorflow/core/platform/tf32_utils.cc | 30 -- tensorflow/core/platform/tf32_utils.h | 27 -- tensorflow/python/BUILD | 11 - tensorflow/python/framework/config.py | 31 --- tensorflow/python/util/tf32.cc | 22 -- tensorflow/stream_executor/cuda/BUILD | 2 - tensorflow/stream_executor/cuda/cuda_blas.cc | 98 ++++--- tensorflow/stream_executor/cuda/cuda_blas.h | 8 +- tensorflow/stream_executor/cuda/cuda_dnn.cc | 272 ++++++++----------- 10 files changed, 172 insertions(+), 336 deletions(-) delete mode 100644 tensorflow/core/platform/tf32_utils.cc delete mode 100644 tensorflow/core/platform/tf32_utils.h delete mode 100644 tensorflow/python/util/tf32.cc diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 33a1e7cfe0a..70bb8a89417 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -938,13 +938,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "tf32_utils", - srcs = ["tf32_utils.cc"], - hdrs = ["tf32_utils.h"], - copts = tf_copts(), -) - tf_cc_tests( name = "low_level_library_tests", size = "small", diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc deleted file mode 100644 index d2f40ea161a..00000000000 --- a/tensorflow/core/platform/tf32_utils.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/platform/tf32_utils.h" - -#include - -namespace tensorflow { - -// Whether TensorFloat-32 should be used where supported. -// TODO(nluehr): Maybe enable by default after TF32 Ampere testing. 
-static std::atomic tf32_allowed{false}; - -void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; } - -bool tf32_execution_allowed() { return tf32_allowed; } - -} // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h deleted file mode 100644 index 7a158d00ad3..00000000000 --- a/tensorflow/core/platform/tf32_utils.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ -#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ - -namespace tensorflow { - -void allow_tf32_execution(bool allowed); - -bool tf32_execution_allowed(); - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 5f9e2dfb1ff..de9cf9a24c7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -788,16 +788,6 @@ tf_python_pybind_extension( ], ) -tf_python_pybind_extension( - name = "_pywrap_tf32_execution", - srcs = ["util/tf32.cc"], - module_name = "_pywrap_tf32_execution", - deps = [ - "//tensorflow/core/platform:tf32_utils", - "@pybind11", - ], -) - tf_python_pybind_extension( name = "_pywrap_util_port", srcs = ["util/port_wrapper.cc"], @@ -5688,7 +5678,6 @@ py_library( "//tensorflow:composite_tensor_whitelist", ], deps = [ - ":_pywrap_tf32_execution", ":tf_decorator", ":tf_export", ":tf_stack", diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 544b6882618..9ff16f2a327 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -18,42 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python import _pywrap_tf32_execution from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -# No tf_export until TF is built against CUDA11 which is required for TF32. -def tensor_float_32_execution_allowed(): - """Get if TensorFloat-32 operations are enabled on supported hardware. - - Returns: - True if TensorFloat-32 execution is enabled and False otherwise. - """ - return _pywrap_tf32_execution.is_allowed() - - -# No tf_export until TF is built against CUDA 11 which is required for TF32. -def allow_tensor_float_32_execution(allowed): - """Allow use of TensorFloat-32 with float32 ops on supported hardware. - - TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. - TensorFloat-32 kernels take float32 inputs and produce float32 outputs. - Internally, the inputs are cast to a custom representation with 10-bit - mantissa (similar to float16) and 8-bit exponent (similar to float32) and are - executed using TensorCores with float32 accumulation. 
For more information, - see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/. - - TensorFloat-32 execution is disabled by default, but this may change in a - future version. - - Args: - allowed: whether to allow TensorFloat-32 execution - """ - _pywrap_tf32_execution.allow(allowed) - - @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): """Get number of threads used within an individual op for parallelism. diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc deleted file mode 100644 index 7dece6ccdae..00000000000 --- a/tensorflow/python/util/tf32.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "pybind11/pybind11.h" -#include "tensorflow/core/platform/tf32_utils.h" - -PYBIND11_MODULE(_pywrap_tf32_execution, m) { - m.def("allow", &tensorflow::allow_tf32_execution); - m.def("is_allowed", &tensorflow::tf32_execution_allowed); -} diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 3a14be9ad50..c3cf9f5db15 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -251,7 +251,6 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:host_or_device_scalar", @@ -357,7 +356,6 @@ cc_library( "@local_config_cuda//cuda:cudnn_header", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor:dnn", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:plugin_registry", diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index fcd0e7b16fb..c9f0fc462c9 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -49,7 +49,6 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "third_party/eigen3/Eigen/Core" -#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" @@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) { } } +// Decide whether to enable TENSOR_OP_MATH +static bool TensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + // cuBLAS has interfaces that permit pointers to be passed from either the host // memory space or the device memory space; however, you must instruct it as to // which address space those pointers are in with cublasSetPointerMode. @@ -226,19 +237,6 @@ bool CUDABlas::Init() { return false; } - absl::MutexLock lock(&mu_); -#if CUDA_VERSION >= 9000 -#if CUBLAS_VER_MAJOR >= 11 - ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH); -#else - ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH); -#endif - if (ret != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret); - return false; - } -#endif - return true; } @@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { template bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - Args... args) { + bool use_tensor_op_math, Args... args) { absl::MutexLock lock(&mu_); CHECK(blas_ != nullptr); @@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, : CUBLAS_POINTER_MODE_DEVICE)) { return false; } -#if CUBLAS_VER_MAJOR >= 11 +#if CUDA_VERSION >= 9000 ScopedCublasMathMode math_mode{blas_}; - if (!tensorflow::tf32_execution_allowed()) { - if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) { + if (use_tensor_op_math) { + if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) { return false; } } @@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm( } } + bool use_tensor_ops = false; +#if CUDA_VERSION >= 9000 + int cc_major, cc_minor; + stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + + // GPUs < sm_70 don't support tensor ops. + if (cc_major >= 7 && TensorOpMathEnabled()) { + use_tensor_ops = true; + } +#endif + return DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, CUDABlasTranspose(transa), + true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta, GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc); @@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) { // strictly correct. We can't simply enable it, though, as that would change // clients' behavior significantly: Using tensor ops on fp32 inputs cause them // to be rounded to fp16. 
- if (cc_major >= 7 && std::is_same::value) { + if (cc_major >= 7 && TensorOpMathEnabled() && + std::is_same::value) { return true; } #endif @@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = - data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed(); + bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F; cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( bool ok; ok = DoBlasInternalImpl( AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */, - true /* = err_on_failure */, CUDABlasTranspose(transa), + true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda, b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc, batch_count, compute_type, algo); @@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched( int lda, int64 stride_a, const DeviceMemory &b, int ldb, int64 stride_b, float beta, DeviceMemory *c, int ldc, int64 stride_c, int batch_count) { -#if CUDA_VERSION >= 9010 + bool use_tensor_ops = false; +#if CUDA_VERSION >= 9000 int cc_major, cc_minor; if (stream->parent()->GetDeviceDescription().cuda_compute_capability( - &cc_major, &cc_minor) && - cc_major >= 5) { - cublasGemmAlgo_t algo = - (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); - bool ok = DoBlasInternalImpl( - AS_LAMBDA(cublasGemmStridedBatchedEx), stream, - true /* = pointer_mode_host */, true /* = err_on_failure */, - CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, - GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb, - stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c, - batch_count, CUDA_R_32F, algo); - if (ok) { - return true; + &cc_major, &cc_minor)) { + // GPUs < sm_70 don't support tensor ops. + if (cc_major >= 7 && TensorOpMathEnabled()) { + use_tensor_ops = true; } - LOG(ERROR) << "failed BLAS call, see log for details"; - return false; +#if CUDA_VERSION >= 9010 + if (cc_major >= 5) { + cublasGemmAlgo_t algo = + (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); + bool ok = DoBlasInternalImpl( + AS_LAMBDA(cublasGemmStridedBatchedEx), stream, + true /* = pointer_mode_host */, true /* = err_on_failure */, + use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), + m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a, + GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c), + CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo); + if (ok) { + return true; + } + LOG(ERROR) << "failed BLAS call, see log for details"; + return false; + } +#endif } #endif // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop. 
@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c); bool ok = DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, CUDABlasTranspose(transa), + true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix, SE_CUDA_DATA_HALF, ldc); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 556456c83db..817bdb72777 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport { template bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - Args... args); + bool use_tensor_op_math, Args... args); // Convenience functions that call DoBlasInternalImpl with different values // for err_on_failure. @@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport { bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, Args... args) { return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/true, args...); + /*err_on_failure=*/true, /*use_tensor_ops=*/false, + args...); } template bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream, @@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport { // Tensor ops are hard-coded off in this path, but can still be enabled with // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl(). return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/false, args...); + /*err_on_failure=*/false, + /*use_tensor_ops=*/false, args...); } // A helper function to implement DoBlasGemmBatched interfaces for generic diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 192bae91572..be18c989861 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -22,7 +22,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -602,6 +601,31 @@ class CudnnFilterDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor); }; +// A helper function to decide whether to enable the TENSOR_OP_MATH math type +bool TensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled = false; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + +// A helper function to decide whether to enable the TENSOR_OP_MATH math type +// for RNNs. +bool RnnTensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled = false; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + // A helper function to decide whether to use // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. 
This mode can be faster in // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT @@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor { : CUDNN_CROSS_CORRELATION, data_type)); + // NOTE(benbarsdell): This only applies if tensor op math is enabled + // and algo selection is set to Default. + this->set_use_tensor_op_math(true); + #if CUDNN_MAJOR >= 7 VLOG(2) << "Requesting grouped convolution: " << convolution_descriptor.group_count(); @@ -717,15 +745,13 @@ class CudnnConvolutionDescriptor { #endif } - void set_use_tensor_op_math(bool use_tensor_op_math) { + void set_use_tensor_op_math(bool use_tensor_op_math) const { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = -#if CUDNN_VERSION >= 8000 - (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH); -#else (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); -#endif - CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); + if (TensorOpMathEnabled()) { + CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); + } #endif } @@ -737,40 +763,6 @@ class CudnnConvolutionDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor); }; -// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math -// set -static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { - cudnnMathType_t math_type; - CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); -#if CUDNN_VERSION >= 8000 - return math_type != CUDNN_FMA_MATH; -#else - return math_type == CUDNN_TENSOR_OP_MATH; -#endif -} - -static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000; -} - -static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) { - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - if (!TensorOpMathAvailable(cc_major)) { - return false; - } - if (input_type == dnn::DataType::kFloat) { -#if CUDNN_VERSION < 8000 - return false; -#else - if (!tensorflow::tf32_execution_allowed()) { - return false; - } -#endif - } - return true; -} - // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle // within a scope. class CudnnPoolingDescriptor { @@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - bool allow_tensor_ops = - data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed(); - bool use_tensor_ops; - if (algorithm_config.algorithm().has_value()) { - use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled(); - } else { - use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops; - } - - if (use_tensor_ops && !allow_tensor_ops) { - return port::Status(port::error::INVALID_ARGUMENT, - "Algo requests disallowed tensor op evaluation."); - } - - cudnnMathType_t math_type; - if (use_tensor_ops) { - math_type = CUDNN_TENSOR_OP_MATH; - } else { -#if CUDNN_VERSION >= 8000 - math_type = CUDNN_FMA_MATH; + if (RnnTensorOpMathEnabled()) { + cudnnMathType_t math_type; + if (algorithm_config.algorithm().has_value()) { + math_type = algorithm_config.algorithm()->tensor_ops_enabled() + ? 
CUDNN_TENSOR_OP_MATH + : CUDNN_DEFAULT_MATH; + } else { +#if CUDNN_VERSION >= 7201 + math_type = CUDNN_TENSOR_OP_MATH; #else - math_type = CUDNN_DEFAULT_MATH; -#endif // CUDNN_VERSION >= 8000 + math_type = CUDNN_DEFAULT_MATH; +#endif // CUDNN_VERSION >= 7201 + } + CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); } - CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), @@ -2578,11 +2560,10 @@ port::StatusOr> AllocateCudnnConvolutionForwardWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. 
size_t size_in_bytes; @@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( return scratch_allocator->AllocateBytes(size_in_bytes); } -port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, - absl::optional desc) { - bool use_tensor_ops; - if (desc.has_value()) { - use_tensor_ops = desc->tensor_ops_enabled(); - if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) { - return port::Status(port::error::INVALID_ARGUMENT, - "Algo requests disallowed tensor op evaluation."); - } - } else { - use_tensor_ops = IsTensorMathAllowed(stream, type); - } - return use_tensor_ops; +static bool TensorOpMathAvailable(int cc_major) { + return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled(); } -cudnnDataType_t GetRnnComputeType(dnn::DataType data_type); -dnn::DataType GetConvAccumulatorType(dnn::DataType data_type); - port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. @@ -2754,7 +2709,10 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( GetCudnnConvolutionForwardAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace( @@ -2778,9 +2736,6 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( "Returned status: ", scratch_or.status().ToString())); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2791,19 +2746,10 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm 
within memory limit according to cuDNN's // heuristics. @@ -2816,7 +2762,10 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( GetCudnnConvolutionBackwardDataAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace( @@ -2839,9 +2788,6 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2852,19 +2798,10 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
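The cuDNN side mirrors the cuBLAS change. A minimal, illustrative sketch of the two opt-outs added earlier in this file follows; TensorOpMathEnabled() and RnnTensorOpMathEnabled() also cache their values, so the variables must be set before the first cuDNN call, and a CUDA-enabled build is assumed.

import os

# Disable cuDNN tensor op math for convolutions and, separately, for RNNs.
os.environ["TF_DISABLE_CUDNN_TENSOR_OP_MATH"] = "1"
os.environ["TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH"] = "1"

import tensorflow as tf  # imported after the variables are set

x = tf.random.normal([8, 32, 32, 16], dtype=tf.float16)
w = tf.random.normal([3, 3, 16, 16], dtype=tf.float16)
y = tf.nn.conv2d(x, w, strides=1, padding="SAME")  # conv now runs without tensor ops
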
@@ -2877,7 +2814,10 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( GetCudnnConvolutionBackwardFilterAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace( @@ -2900,9 +2840,6 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution( CudnnTensorDescriptor output_nd( output_descriptor, ToCudnnDataType(element_type, output_descriptor.layout())); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); auto cudnn = cudnn_->GetHandle(parent_, stream); switch (kind) { case dnn::ConvolutionKind::FORWARD: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionForwardAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_DATA: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionBackwardDataAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardDataAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_FILTER: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionBackwardFilterAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardFilterAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } default: @@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve( auto accumulator_type = GetConvAccumulatorType(element_type); CudnnConvolutionDescriptor conv(convolution_descriptor, ToCudnnDataType(accumulator_type)); - SE_ASSIGN_OR_RETURN(bool use_tensor_ops, - UseTensorOps(stream, element_type, algorithm_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); + // Set use_tensor_math param to correct value + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); auto cudnn = cudnn_->GetHandle(parent_, stream); // Alpha is the scaling factor for input. 
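To summarize the selection logic after this change: when an algorithm is configured explicitly, its tensor_ops_enabled() flag is applied to the convolution descriptor; when no algorithm is configured, the default flag comes from TensorOpMathAvailable(cc_major). Below is a small illustrative mirror of that decision in Python, with descriptive names only; the real logic is the C++ above, and note that set_use_tensor_op_math() is itself a no-op when the TF_DISABLE_CUDNN_TENSOR_OP_MATH opt-out is set, so the opt-out wins in every branch.

def conv_uses_tensor_ops(algorithm_desc, cc_major, cudnn_version, env_opt_out):
  """Illustrative mirror of the cuDNN tensor-op decision after this change."""
  if env_opt_out:
    # The opt-out keeps cudnnSetConvolutionMathType from ever being called.
    return False
  if algorithm_desc is not None:
    # An explicitly configured algorithm carries its own tensor_ops flag.
    return algorithm_desc.tensor_ops_enabled
  # Otherwise the default follows hardware and library availability.
  return cc_major >= 7 and cudnn_version >= 7000

# Example: an sm_70 GPU with cuDNN 7.6, no opt-out, no explicit algorithm.
assert conv_uses_tensor_ops(None, 7, 7600, env_opt_out=False)
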
@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve( return port::Status::OK(); } +// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math +// set +static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { + cudnnMathType_t math_type; + CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); + return math_type == CUDNN_TENSOR_OP_MATH; +} + template port::Status CudnnSupport::DoFusedConvolveImpl( @@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl( filter_descriptor, GetCudnnDataType(conv_input_descriptor.layout())); CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType()); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); auto cudnn = cudnn_->GetHandle(parent_, stream); @@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl( SE_ASSIGN_OR_RETURN( dnn::AlgorithmDesc algo_desc, GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, conv_input_nd, filter, - dnn::ToDataType::value, convolution_descriptor, + stream, cudnn, algorithm_config, conv_input_nd, filter, conv, output_nd, scratch_allocator, &scratch)); - CudnnConvolutionDescriptor conv(convolution_descriptor, - ToCudnnDataType(accumulator_type)); - conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled()); - std::unique_ptr timer; if (is_profiling) { timer.reset(new GpuTimer(parent_)); // NOLINT @@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms( for (auto i : algo_types) { out_algorithms->push_back({i, /*use_tensor_ops=*/false}); #if CUDNN_VERSION >= 7100 - out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + if (RnnTensorOpMathEnabled()) { + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + } #endif } return true; From 64f7bdd56a394ecae55c5006e483050569b9b136 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Thu, 18 Jun 2020 19:32:16 -0700 Subject: [PATCH 0583/1390] Disable tsan on InterpreterFlexTest and SelectiveBuiltInterpreterFlexTest PiperOrigin-RevId: 317231748 Change-Id: I7ab662fd55024c0ed91bd78bfdc8e9206d78b3b6 --- tensorflow/lite/delegates/flex/BUILD | 1 + tensorflow/lite/java/BUILD | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 42914bf5ab8..99bcf05ab4a 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -279,6 +279,7 @@ java_test( "no_oss", # Currently requires --config=monolithic, b/118895218. # TODO(b/121204962): Re-enable test after fixing memory leaks. "noasan", + "notsan", # TODO(b/158651814) Re-enable after fixing racing condition. ], test_class = "org.tensorflow.lite.InterpreterFlexTest", visibility = ["//visibility:private"], diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 738d66a0eb1..89be932ab4d 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -304,6 +304,7 @@ java_test( "no_oss", # Currently requires --config=monolithic, b/118895218. # TODO(b/121204962): Re-enable test after fixing memory leaks. "noasan", + "notsan", # TODO(b/158651814) Re-enable after fixing racing condition. ], test_class = "org.tensorflow.lite.InterpreterFlexTest", visibility = ["//visibility:private"], From c159f1599548428660c80dada924d69f269384a3 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 18 Jun 2020 19:35:14 -0700 Subject: [PATCH 0584/1390] Fork the keras related tpu_strategy_test to keras integration test. 
PiperOrigin-RevId: 317232048 Change-Id: If05867985ff1ff81ac45bb601b701ee68d4d5279 --- tensorflow/python/distribute/BUILD | 1 - .../python/distribute/tpu_strategy_test.py | 19 ----- .../python/keras/integration_test/BUILD | 13 ++++ .../integration_test/tpu_strategy_test.py | 69 +++++++++++++++++++ 4 files changed, 82 insertions(+), 20 deletions(-) create mode 100644 tensorflow/python/keras/integration_test/tpu_strategy_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 7208807a18c..4d77c12f975 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -654,7 +654,6 @@ tpu_py_test( "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", - "//tensorflow/python/keras", ], ) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 6dd7de500e4..400b12112d6 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context @@ -364,24 +363,6 @@ class TPUStrategyTest(test.TestCase): expected_result, strategy.experimental_local_results(train_step(next(input_iterator)))) - def test_keras_metric_outside_strategy_scope_per_replica(self): - strategy = get_tpu_strategy() - metric = keras.metrics.Mean("test_metric", dtype=dtypes.float32) - - dataset = dataset_ops.Dataset.range(strategy.num_replicas_in_sync * - 2).batch(2) - dataset = strategy.experimental_distribute_dataset(dataset) - - @def_function.function - def step_fn(i): - metric.update_state(i) - - with self.assertRaisesRegex(ValueError, "Trying to run metric.update_state " - "in replica context"): - with strategy.scope(): - for i in dataset: - strategy.run(step_fn, args=(i,)) - # TODO(b/145574622): Remove this test once it is re-enabled in values_test.py. def test_all_reduce_on_sync_on_read_variable(self): strategy = get_tpu_strategy() diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 2ef775a190e..b23dcc59b97 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -2,6 +2,7 @@ # Contains Keras integration tests that verify with other TF high level APIs. load("//tensorflow:tensorflow.bzl", "cuda_py_test", "tf_py_test") +load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") package( default_visibility = [ @@ -91,3 +92,15 @@ cuda_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) + +tpu_py_test( + name = "tpu_strategy_test", + srcs = ["tpu_strategy_test.py"], + disable_experimental = True, + python_version = "PY3", + tags = ["no_oss"], + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", + ], +) diff --git a/tensorflow/python/keras/integration_test/tpu_strategy_test.py b/tensorflow/python/keras/integration_test/tpu_strategy_test.py new file mode 100644 index 00000000000..d24e96ae855 --- /dev/null +++ b/tensorflow/python/keras/integration_test/tpu_strategy_test.py @@ -0,0 +1,69 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for TPUStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags + +import tensorflow as tf + + +FLAGS = flags.FLAGS +flags.DEFINE_string("tpu", "", "Name of TPU to connect to.") +flags.DEFINE_string("project", None, "Name of GCP project with TPU.") +flags.DEFINE_string("zone", None, "Name of GCP zone with TPU.") + + +def get_tpu_cluster_resolver(): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, + zone=FLAGS.zone, + project=FLAGS.project, + ) + return resolver + + +def get_tpu_strategy(): + resolver = get_tpu_cluster_resolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + return tf.distribute.experimental.TPUStrategy(resolver) + + +class TpuStrategyTest(tf.test.TestCase): + + def test_keras_metric_outside_strategy_scope_per_replica(self): + strategy = get_tpu_strategy() + metric = tf.keras.metrics.Mean("test_metric", dtype=tf.float32) + + dataset = tf.data.Dataset.range(strategy.num_replicas_in_sync * 2).batch(2) + dataset = strategy.experimental_distribute_dataset(dataset) + + @tf.function + def step_fn(i): + metric.update_state(i) + + with self.assertRaisesRegex(ValueError, "Trying to run metric.update_state " + "in replica context"): + with strategy.scope(): + for i in dataset: + strategy.run(step_fn, args=(i,)) + + +if __name__ == "__main__": + tf.test.main() From 2399b25e139c729c0f0efd0efe5a009af04ef773 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Fri, 19 Jun 2020 02:49:05 +0000 Subject: [PATCH 0585/1390] Change variable names, Remove comments --- tensorflow/python/framework/python_op_gen.cc | 84 ++++++++++---------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 4ecfccb611f..062a9aa01e4 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -45,7 +45,7 @@ const int kRightMargin = 78; constexpr char kEagerFallbackSuffix[] = "_eager_fallback"; -std::unordered_map dtypes_map { +std::unordered_map dtype_type { {"_dtypes.float16", "_dtypes.Float16"}, {"_dtypes.half", "_dtypes.Half"}, {"_dtypes.float32", "_dtypes.Float32"}, @@ -164,7 +164,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { bool AddEagerFastPathAndGraphCode(const string& parameters, const std::vector& output_sizes, const string& eager_not_allowed_error, - std::unordered_map& type_map); + std::unordered_map& type_annotations); bool AddEagerFallbackCode(const string& parameters, const std::vector& output_sizes, const string& num_outputs_expr, @@ -182,9 +182,9 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp { std::unordered_map GetTypeAnnotationMap(); - void 
GenerateTypeVars(std::unordered_map& type_map); + void GenerateTypeVars(std::unordered_map& type_annotations); - void AddReturnTypeAnnotation(std::unordered_map& type_map); + void AddReturnTypeAnnotation(std::unordered_map& type_annotations); void AddAttrForArg(const string& attr, int arg_index) { gtl::InsertIfNotPresent(&inferred_attrs_, attr, @@ -348,10 +348,10 @@ string GenEagerPythonOp::Code() { param_names_.push_back(param_and_default.first); } - std::unordered_map type_map; + std::unordered_map type_annotations; // Only populate map for whitelisted ops if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - type_map = GetTypeAnnotationMap(); + type_annotations = GetTypeAnnotationMap(); } string parameters; @@ -360,9 +360,9 @@ string GenEagerPythonOp::Code() { strings::StrAppend(¶meters, param.GetRenameTo()); // Add type annotations to param - if (type_map.find(param.GetName()) != type_map.end()) { - if (!type_map[param.GetName()].empty()) { - strings::StrAppend(¶meters, ": ", type_map[param.GetName()]); + if (type_annotations.find(param.GetName()) != type_annotations.end()) { + if (!type_annotations[param.GetName()].empty()) { + strings::StrAppend(¶meters, ": ", type_annotations[param.GetName()]); } } } @@ -374,12 +374,12 @@ string GenEagerPythonOp::Code() { strings::StrAppend(¶meters_with_defaults, ", "); // Add type annotations to param_and_default - if (type_map.find(param_and_default.first.GetName()) != type_map.end()) { - if (!type_map[param_and_default.first.GetName()].empty()) { - strings::StrAppend(¶meters, ": ", type_map[param_and_default.first.GetName()]); + if (type_annotations.find(param_and_default.first.GetName()) != type_annotations.end()) { + if (!type_annotations[param_and_default.first.GetName()].empty()) { + strings::StrAppend(¶meters, ": ", type_annotations[param_and_default.first.GetName()]); strings::StrAppend(¶meters_with_defaults, param_and_default.first.GetRenameTo(), ": ", - type_map[param_and_default.first.GetName()], " ", + type_annotations[param_and_default.first.GetName()], " ", "= ", param_and_default.second); continue; } @@ -420,7 +420,7 @@ string GenEagerPythonOp::Code() { string eager_not_allowed_error = GetEagerNotAllowedError(); if (!AddEagerFastPathAndGraphCode(parameters_with_defaults, output_sizes, - eager_not_allowed_error, type_map)) { + eager_not_allowed_error, type_annotations)) { return result_; } @@ -433,60 +433,58 @@ string GenEagerPythonOp::Code() { } std::unordered_map GenEagerPythonOp::GetTypeAnnotationMap() { - std::unordered_map type_map; + std::unordered_map type_annotations; // Mapping attrs to TypeVars for (const auto& attr : op_def_.attr()) { if (attr.type() == "type") { const string type_var_name = "TV_" + op_def_.name() + "_" + attr.name(); - type_map[attr.name()] = type_var_name; + type_annotations[attr.name()] = type_var_name; } else if (attr.type() == "bool" || attr.type() == "float" || attr.type() == "int" || attr.type() == "bytes") { - type_map[attr.name()] = attr.type(); + type_annotations[attr.name()] = attr.type(); } } // Mapping input Tensors to their types for (const auto& arg : op_def_.input_arg()) { - // Do not add type annotations to args that accept a sequence of tensors + // Do not add type annotations to args that accept a sequence of Tensors if (!arg.number_attr().empty()) continue; string type_annotation; - if (type_map.find(arg.type_attr()) != type_map.end()) { + if (type_annotations.find(arg.type_attr()) != type_annotations.end()) { // Get the correct TypeVar if input maps to an attr - 
strings::StrAppend(&type_annotation, "_ops.Tensor[", type_map[arg.type_attr()], "]"); + strings::StrAppend(&type_annotation, "_ops.Tensor[", type_annotations[arg.type_attr()], "]"); } else { // Get the dtype of the Tensor const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes."); - if (dtypes_map.find(py_dtype) != dtypes_map.end()) { - strings::StrAppend(&type_annotation, "_ops.Tensor[", dtypes_map[py_dtype], "]"); + if (dtype_type.find(py_dtype) != dtype_type.end()) { + strings::StrAppend(&type_annotation, "_ops.Tensor[", dtype_type[py_dtype], "]"); } } - type_map[arg.name()] = type_annotation; + type_annotations[arg.name()] = type_annotation; } // Mapping output Tensor to its type if (op_def_.output_arg_size() == 1) { const auto& arg = op_def_.output_arg(0); string type_annotation; - if (type_map.find(arg.type_attr()) != type_map.end()) { - // Get the correct TypeVar if input maps to an attr - strings::StrAppend(&type_annotation, "_ops.Tensor[", type_map[arg.type_attr()], "]"); + if (type_annotations.find(arg.type_attr()) != type_annotations.end()) { + strings::StrAppend(&type_annotation, "_ops.Tensor[", type_annotations[arg.type_attr()], "]"); } else { - // Get the dtype of the Tensor const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes."); - if (dtypes_map.find(py_dtype) != dtypes_map.end()) { - strings::StrAppend(&type_annotation, "_ops.Tensor[", dtypes_map[py_dtype], "]"); + if (dtype_type.find(py_dtype) != dtype_type.end()) { + strings::StrAppend(&type_annotation, "_ops.Tensor[", dtype_type[py_dtype], "]"); } } - type_map[arg.name()] = type_annotation; + type_annotations[arg.name()] = type_annotation; } - return type_map; + return type_annotations; } // Generate TypeVars using attrs -void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type_map) { +void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type_annotations) { bool added_typevar = false; for (const auto& attr : op_def_.attr()) { if (attr.type() == "type") { @@ -494,14 +492,14 @@ void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type for (int t : attr.allowed_values().list().type()) { DataType dtype = static_cast(t); const string py_dtype = python_op_gen_internal::DataTypeToPython(dtype, "_dtypes."); - if (dtypes_map.find(py_dtype) != dtypes_map.end()) { - allowed_types.emplace_back(dtypes_map[py_dtype]); + if (dtype_type.find(py_dtype) != dtype_type.end()) { + allowed_types.emplace_back(dtype_type[py_dtype]); } } // If all dtypes are allowed, add them all if (allowed_types.empty()) { - for (std::pair map_dtype : dtypes_map) { + for (std::pair map_dtype : dtype_type) { allowed_types.emplace_back(map_dtype.second); } } @@ -514,7 +512,7 @@ void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type strings::StrAppend(&typevar_dtypes, *it); } - const string type_var_name = type_map[attr.name()]; + const string type_var_name = type_annotations[attr.name()]; strings::StrAppend(&result_, type_var_name, " = TypeVar(\"", type_var_name, "\", ", typevar_dtypes,")\n"); added_typevar = true; } @@ -523,14 +521,14 @@ void GenEagerPythonOp::GenerateTypeVars(std::unordered_map& type if (added_typevar) strings::StrAppend(&result_, "\n"); } -void GenEagerPythonOp::AddReturnTypeAnnotation(std::unordered_map& type_map) { +void GenEagerPythonOp::AddReturnTypeAnnotation(std::unordered_map& type_annotations) { if (op_def_.output_arg_size() == 1) { const auto& arg = op_def_.output_arg(0); // Add type annotations to param - if (type_map.find(arg.name()) != 
type_map.end()) { - if (!type_map[arg.name()].empty()) { + if (type_annotations.find(arg.name()) != type_annotations.end()) { + if (!type_annotations[arg.name()].empty()) { result_.erase(result_.length() - 2); - strings::StrAppend(&result_, " -> ", type_map[arg.name()], ":\n"); + strings::StrAppend(&result_, " -> ", type_annotations[arg.name()], ":\n"); } } } @@ -858,9 +856,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, - const string& eager_not_allowed_error, std::unordered_map& type_map) { + const string& eager_not_allowed_error, std::unordered_map& type_annotations) { if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - GenerateTypeVars(type_map); + GenerateTypeVars(type_annotations); } if (api_def_.visibility() == ApiDef::VISIBLE) { strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); @@ -869,7 +867,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( AddExport(); AddDefLine(function_name_, parameters); if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) { - AddReturnTypeAnnotation(type_map); + AddReturnTypeAnnotation(type_annotations); } AddDocStringDescription(); AddDocStringArgs(); From 4d54ef31394aefe270826790164edcc6d687bb63 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 20:02:06 -0700 Subject: [PATCH 0586/1390] Enable type annotations for python/ops. PiperOrigin-RevId: 317234494 Change-Id: I49a24cd1e2127a3c7b0f2eb217cfe023ce5b439f --- tensorflow/python/ops/logging_ops.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 02fce277690..8ca63f55987 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -54,9 +54,11 @@ except NameError: # call relies on certain conditionals for its dependencies. Use # control_flow_ops.Assert. -# Assert and Print are special symbols in Python 2, so we must -# have an upper-case version of them. When support for it is dropped, -# we can allow lowercase. +# Assert and Print are special symbols in python, so we must +# have an upper-case version of them. +# +# For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we could also allow lowercase. # See https://github.com/tensorflow/tensorflow/issues/18053 @@ -81,6 +83,11 @@ def Print(input_, data, message=None, first_n=None, summarize=None, name=None): with jupyter notebook (printing to the notebook *server's* output, not into the notebook). + Additionally, to use tf.print in python 2.7, users must make sure to import + the following: + + `from __future__ import print_function` + Args: input_: A tensor passed through this op. data: A list of tensors to print out when op is evaluated. @@ -141,6 +148,11 @@ def print_v2(*inputs, **kwargs): Python objects. Printed tensors will recursively show the first and last elements of each dimension to summarize. + @compatibility(python2) + In python 2.7, make sure to import the following: + `from __future__ import print_function` + @end_compatibility + Example: Single-input usage: From 7e6e549c461118fbefdb11d03adbc80c27109a8a Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 18 Jun 2020 20:03:31 -0700 Subject: [PATCH 0587/1390] Support packed variable in DistributedVariable. Add an option to enable packed variable in TPUStrategy. 
PiperOrigin-RevId: 317234665 Change-Id: I09e806cb8261815cd87a6d98817556dd8f7e8ed7 --- tensorflow/python/distribute/BUILD | 6 +- .../python/distribute/checkpointing_test.py | 2 + .../custom_training_loop_input_test.py | 7 +- .../python/distribute/distribute_utils.py | 3 + .../distribute/packed_distributed_variable.py | 13 +- .../packed_distributed_variable_test.py | 6 +- .../distribute/saved_model_test_base.py | 1 + .../distribute/strategy_combinations.py | 20 +- tensorflow/python/distribute/tpu_strategy.py | 50 ++- .../python/distribute/tpu_strategy_test.py | 410 +++++++++--------- tensorflow/python/distribute/tpu_values.py | 41 +- tensorflow/python/distribute/values.py | 53 ++- tensorflow/python/distribute/values_test.py | 91 ++-- tensorflow/python/distribute/values_util.py | 12 +- tensorflow/python/tpu/tpu.py | 9 +- 15 files changed, 454 insertions(+), 270 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 4d77c12f975..0062705126f 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -654,6 +654,7 @@ tpu_py_test( "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", + "@absl_py//absl/testing:parameterized", ], ) @@ -787,6 +788,7 @@ py_library( name = "tpu_values", srcs = ["tpu_values.py"], deps = [ + ":packed_distributed_variable", ":values", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", @@ -1602,7 +1604,7 @@ distribute_py_test( srcs = ["saved_model_save_load_test.py"], full_precision = True, main = "saved_model_save_load_test.py", - shard_count = 5, + shard_count = 7, tags = [ "multi_and_single_gpu", "no_rocm", @@ -1635,7 +1637,7 @@ distribute_py_test( srcs = ["saved_model_mixed_api_test.py"], full_precision = True, main = "saved_model_mixed_api_test.py", - shard_count = 5, + shard_count = 7, tags = [ "multi_and_single_gpu", "no_rocm", diff --git a/tensorflow/python/distribute/checkpointing_test.py b/tensorflow/python/distribute/checkpointing_test.py index ad646905315..edd4c46c371 100644 --- a/tensorflow/python/distribute/checkpointing_test.py +++ b/tensorflow/python/distribute/checkpointing_test.py @@ -103,6 +103,7 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], mode=["eager"])) @@ -138,6 +139,7 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], mode=["eager"])) diff --git a/tensorflow/python/distribute/custom_training_loop_input_test.py b/tensorflow/python/distribute/custom_training_loop_input_test.py index 5d1584f5aa7..e4f782810dd 100644 --- a/tensorflow/python/distribute/custom_training_loop_input_test.py +++ b/tensorflow/python/distribute/custom_training_loop_input_test.py @@ -197,7 +197,8 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, combinations.combine( distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy + strategy_combinations.tpu_strategy, + 
strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testNestedOutput(self, distribution): @@ -748,6 +749,10 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, mode=["eager"] )) def testMultiDeviceDataCapturedFunction(self, distribution): + if getattr(distribution, "_enable_packed_variable_in_eager_mode", False): + self.skipTest( + "Dataset captured function doesn't support packed tensors yet " + "(b/145922293).") inputs = constant_op.constant([2., 3.]) dataset = lambda _: dataset_ops.Dataset.from_tensor_slices(inputs).repeat(5) input_iterator = iter( diff --git a/tensorflow/python/distribute/distribute_utils.py b/tensorflow/python/distribute/distribute_utils.py index ccf19521718..14b934b4a0f 100644 --- a/tensorflow/python/distribute/distribute_utils.py +++ b/tensorflow/python/distribute/distribute_utils.py @@ -148,6 +148,9 @@ def select_replica_mirrored(replica_id, structured): raise TypeError( "Expected value to be mirrored across replicas: %s in %s." % (x, structured)) + packed_var = getattr(x, "_packed_variable", None) + if packed_var is not None: + return packed_var return x.values[replica_id] else: return x diff --git a/tensorflow/python/distribute/packed_distributed_variable.py b/tensorflow/python/distribute/packed_distributed_variable.py index 62512cb4414..c249b8efc1c 100644 --- a/tensorflow/python/distribute/packed_distributed_variable.py +++ b/tensorflow/python/distribute/packed_distributed_variable.py @@ -42,7 +42,7 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. """ - if not context.executing_eagerly(): + if not ops.executing_eagerly_outside_functions(): raise ValueError( "PackedDistributedVariable should be created in eager mode.") if not distributed_variables: @@ -84,6 +84,9 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): def devices(self): return self._devices + def on_device(self, device): + return PackedVarAndDevice(self, device) + def get_var_on_device(self, device): for i, d in enumerate(self._devices): if d == device: @@ -100,7 +103,10 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): @property def handle(self): - return self._handle + if context.executing_eagerly(): + return self.get_var_on_current_device().handle + else: + return self._handle def _read_variable_op(self): if context.executing_eagerly(): @@ -269,7 +275,8 @@ class PackedVarAndDevice(object): @property def handle(self): - return self._var.handle + with ops.device(self._device): + return self._var.handle @property def op(self): diff --git a/tensorflow/python/distribute/packed_distributed_variable_test.py b/tensorflow/python/distribute/packed_distributed_variable_test.py index d29d19960a5..ec2e476e4b8 100644 --- a/tensorflow/python/distribute/packed_distributed_variable_test.py +++ b/tensorflow/python/distribute/packed_distributed_variable_test.py @@ -46,7 +46,7 @@ class PackedDistributedVariableTest(test.TestCase): v1 = resource_variable_ops.ResourceVariable(2.0, name='var1') packed_var = packed_distributed_variable.PackedDistributedVariable([v0, v1]) - self.assertTrue(packed_var.handle.is_packed) + self.assertFalse(packed_var.handle.is_packed) self.assertTrue(packed_var.is_initialized) with ops.device('/cpu:0'): @@ -61,6 +61,7 @@ class PackedDistributedVariableTest(test.TestCase): @def_function.function def update_var(): + self.assertTrue(packed_var.handle.is_packed) with 
ops.device('/cpu:0'): packed_var.assign_add(3.0).assign_sub(1.0) read0 = packed_var.value() @@ -85,7 +86,7 @@ class PackedDistributedVariableTest(test.TestCase): packed_var0 = packed_distributed_variable.PackedVarAndDevice( packed_var, device0) - self.assertTrue(packed_var0.handle.is_packed) + self.assertFalse(packed_var0.handle.is_packed) self.assertAllEqual(math_ops.mul(packed_var0, 2.0), 2.0) packed_var1 = packed_distributed_variable.PackedVarAndDevice( @@ -94,6 +95,7 @@ class PackedDistributedVariableTest(test.TestCase): @def_function.function def func(): + self.assertTrue(packed_var.handle.is_packed) var0 = packed_distributed_variable.PackedVarAndDevice(packed_var, device0) var0.assign_add(3.0) var1 = packed_distributed_variable.PackedVarAndDevice(packed_var, device1) diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index e544e51cddd..70ea582baff 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -58,6 +58,7 @@ strategies = [ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.mirrored_strategy_with_two_gpus, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ] diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py index 350b187f67f..1fa42cb8645 100644 --- a/tensorflow/python/distribute/strategy_combinations.py +++ b/tensorflow/python/distribute/strategy_combinations.py @@ -53,7 +53,11 @@ _did_connect_to_cluster = False # pylint: disable=missing-docstring -def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs): +def _get_tpu_strategy_creator(steps_per_run, + use_single_core=False, + enable_packed_variable=False, + **kwargs): + def _create_tpu_strategy(): global _did_connect_to_cluster @@ -87,10 +91,13 @@ def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs): # Steps per run is only supported in TF 1.x if tf2.enabled(): - return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) + strategy = tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) else: - return tpu_lib.TPUStrategyV1(resolver, steps_per_run, - device_assignment, **kwargs) + strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run, + device_assignment, **kwargs) + strategy._enable_packed_variable_in_eager_mode = enable_packed_variable # pylint: disable=protected-access + return strategy + return _create_tpu_strategy @@ -117,6 +124,10 @@ one_device_strategy_gpu_on_worker_1 = combinations.NamedDistribution( required_gpus=1) tpu_strategy = combinations.NamedDistribution( "TPU", _get_tpu_strategy_creator(steps_per_run=2), required_tpu=True) +tpu_strategy_packed_var = combinations.NamedDistribution( + "TPUPackedVar", + _get_tpu_strategy_creator(steps_per_run=2, enable_packed_variable=True), + required_tpu=True) tpu_strategy_one_step = combinations.NamedDistribution( "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1), required_tpu=True) tpu_strategy_one_core = combinations.NamedDistribution( @@ -286,6 +297,7 @@ strategies_minus_default_and_tpu = [ tpu_strategies = [ tpu_strategy, # steps_per_run=2 tpu_strategy_one_step, + tpu_strategy_packed_var, cloud_tpu_strategy, ] diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 9493ecce767..7e8f5b97e7e 100644 --- 
a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -141,6 +141,10 @@ class TPUStrategy(distribute_lib.Strategy): "num_workers").set(self.extended.num_hosts) distribute_lib.distribution_strategy_replica_gauge.get_cell( "num_replicas_per_worker").set(self.extended.num_replicas_per_host) + # Packed variable is used to reduce the overhead of function execution. + # For a DistributedVariable, only one variable handle is captured into a + # function graph. It's only supported in eager mode. + self._enable_packed_variable_in_eager_mode = False # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this # can use the default implementation. @@ -185,6 +189,10 @@ class TPUStrategyV1(distribute_lib.StrategyV1): "num_workers").set(self.extended.num_hosts) distribute_lib.distribution_strategy_replica_gauge.get_cell( "num_replicas_per_worker").set(self.extended.num_replicas_per_host) + # Packed variable is used to reduce the overhead of function execution. + # For a DistributedVariable, only one variable handle is captured into a + # function graph. It's only supported in eager mode. + self._enable_packed_variable_in_eager_mode = False @property def steps_per_run(self): @@ -671,20 +679,29 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): return cross_device_ops_lib.reduce_non_distributed_value( reduce_op, value, destinations, self._num_replicas_in_sync) + value_list = value.values + # pylint: disable=protected-access + if isinstance( + value, + values.DistributedVariable) and value._packed_variable is not None: + value_list = tuple( + value._packed_variable.on_device(d) + for d in value._packed_variable.devices) + # pylint: enable=protected-access + # Currently XLA op by op mode has a limit for the number of inputs for a # single op, thus we break one `add_n` op into a group of `add_n` ops to # work around the constraint. # TODO(cjfj): Detect when it is possible to use `cross_replica_sum`. if len(value.values) <= _XLA_OP_BY_OP_INPUTS_LIMIT: - output = math_ops.add_n(value.values) + output = math_ops.add_n(value_list) else: - output = array_ops.zeros_like( - value.values[0], dtype=value.values[0].dtype) - for i in range(0, len(value.values), _XLA_OP_BY_OP_INPUTS_LIMIT): - output += math_ops.add_n(value.values[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT]) + output = array_ops.zeros_like(value_list[0], dtype=value_list[0].dtype) + for i in range(0, len(value_list), _XLA_OP_BY_OP_INPUTS_LIMIT): + output += math_ops.add_n(value_list[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT]) if reduce_op == reduce_util.ReduceOp.MEAN: - output *= (1. / len(value.values)) + output *= (1. / len(value_list)) devices = cross_device_ops_lib.get_devices_from(destinations) @@ -710,17 +727,28 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): else: return (fn(var, *args, **kwargs),) - # Otherwise, we revert to MirroredStrategy behavior and update each variable - # directly. + # Otherwise, we revert to MirroredStrategy behavior and update the variable + # on each replica directly. 
updates = [] - for i, v in enumerate(var.values): + values_and_devices = [] + packed_var = var._packed_variable # pylint: disable=protected-access + if packed_var is not None: + for device in packed_var.devices: + values_and_devices.append((packed_var, device)) + else: + for value in var.values: + values_and_devices.append((value, value.device)) + + for i, value_and_device in enumerate(values_and_devices): + value = value_and_device[0] + device = value_and_device[1] name = "update_%d" % i - with ops.device(v.device), \ + with ops.device(device), \ distribute_lib.UpdateContext(i), \ ops.name_scope(name): # If args and kwargs are not mirrored, the value is returned as is. updates.append( - fn(v, *distribute_utils.select_replica_mirrored(i, args), + fn(value, *distribute_utils.select_replica_mirrored(i, args), **distribute_utils.select_replica_mirrored(i, kwargs))) return distribute_utils.update_regroup(self, updates, group) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 400b12112d6..4070336aae8 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context @@ -64,14 +66,17 @@ def get_tpu_cluster_resolver(): return resolver -def get_tpu_strategy(): +def get_tpu_strategy(enable_packed_var=False): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) tpu_strategy_util.initialize_tpu_system(resolver) - return tpu_lib.TPUStrategy(resolver) + strategy = tpu_lib.TPUStrategy(resolver) + strategy._enable_packed_variable_in_eager_mode = enable_packed_var + return strategy -class TPUStrategyTest(test.TestCase): +# TPU tests which don't use TPUStrategy. +class TPUTest(test.TestCase): def test_multiple_initialize_system(self): resolver = get_tpu_cluster_resolver() @@ -82,177 +87,6 @@ class TPUStrategyTest(test.TestCase): tpu_strategy_util.initialize_tpu_system(resolver) self.assertRegex(str(mock_log.call_args), "already been initialized") - def test_sequential_experimental_runs(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - # Computation replicated to all cores. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=2) - strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Computation on the 1st core. 
- device_assignment2 = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - strategy2 = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - def computation(x): - return math_ops.square(x) - - @def_function.function - def train_step(): - outputs = strategy.experimental_local_results( - strategy.run(computation, args=([2., 2.],))) - outputs2 = strategy2.run( - computation, args=([outputs[0]],)) - return outputs2 - - self.assertAllEqual([[16., 16.]], train_step()) - - def test_device_switch_case(self): - strategy = get_tpu_strategy() - with strategy.scope(): - a = variables.Variable(1) - - inference_iteration = variables.Variable(-1) - - def inference_fn(x, i): - return a + x + i - - @def_function.function - def run_inference(x): - - def do_inference(device, inference_fn, i): - with ops.device(device): - return inference_fn(x, i) - - branch_fns = { - 0: (lambda: do_inference("/device:TPU:0", inference_fn, 0)), - 1: (lambda: do_inference("/device:TPU:1", inference_fn, 1)), - } - branch_index = inference_iteration.assign_add(1, use_locking=True) % 2 - return control_flow_ops.switch_case(branch_index, branch_fns) - - self.assertAllEqual(2., run_inference(1)) # Use TPU core 0. - self.assertAllEqual(3., run_inference(1)) # Use TPU core 1. - - def test_recover_from_compilation_failures(self): - # TODO(b/148150981): Stop skipping this test once recovery works - # for non-local TPU. - if FLAGS.tpu: - self.skipTest("Recovery fails for non-local TPU, see b/148150981") - - # Disable automatic outside compilation. - config.set_soft_device_placement(False) - strategy = get_tpu_strategy() - - @def_function.function - def compilation_failure_run(): - - def computation(): - return random_ops.random_gamma([10], [0.5, 1.5]) - - return strategy.run(computation) - - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "TPU compilation failed"): - compilation_failure_run() - - @def_function.function - def good_run(): - - def computation(): - return random_ops.random_normal([10]) - - return strategy.run(computation) - - good_run() - - def test_dynamic_shape_with_outside_compilation_failure(self): - # Enable automatic outside compilation. - config.set_soft_device_placement(True) - strategy = get_tpu_strategy() - dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( - 2, drop_remainder=False) - dataset = strategy.experimental_distribute_dataset(dataset) - iterator = iter(dataset) - - @def_function.function - def train_fn(iterator): - - def step_fn(inputs): - _, inputs = inputs - return math_ops.reduce_sum(inputs) - - return strategy.experimental_local_results( - strategy.run(step_fn, args=(next(iterator),))) - - with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): - logging.info(train_fn(iterator)) - - def test_computation_on_subset_cores(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - all_core_strategy = tpu_lib.TPUStrategy(resolver) - - with all_core_strategy.scope(): - v = variables.Variable(0.0, - aggregation=variables.VariableAggregation.MEAN) - - # Computation on the 1st core. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - first_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Computation on the 2nd core. 
- device_assignment2 = device_assignment_lib.DeviceAssignment( - topology, [[[0, 0, 0, 1]]]) - second_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - @def_function.function - def train_step(): - - def step_fn(): - return v + 1.0 - - all_core_strategy.run(step_fn) - r1 = first_core_strategy.run(step_fn) - r2 = second_core_strategy.run(step_fn) - return r1 + r2 - - train_step() - self.assertAllEqual(2., train_step()) - - def test_worker_devices_on_subset_cores(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - - # Strategy for the 1st core. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - first_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Strategy for the 2nd core. - device_assignment2 = device_assignment_lib.DeviceAssignment( - topology, [[[0, 0, 0, 1]]]) - second_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - self.assertLen(first_core_strategy.extended.worker_devices, 1) - self.assertEndsWith(first_core_strategy.extended.worker_devices[0], - "device:TPU:0") - - self.assertLen(second_core_strategy.extended.worker_devices, 1) - self.assertEndsWith(second_core_strategy.extended.worker_devices[0], - "device:TPU:1") - def test_tpu_tf_function_same_device(self): with ops.device("/device:TPU:0"): a = variables.Variable(1) @@ -288,8 +122,194 @@ class TPUStrategyTest(test.TestCase): result = bar() + 1 self.assertAllEqual(result, 2) - def test_control_output_in_while_body_fn(self): - strategy = get_tpu_strategy() + +@parameterized.named_parameters([("PackedVar", True), ("", False)]) +class TPUStrategyTest(test.TestCase, parameterized.TestCase): + + def test_sequential_experimental_runs(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + # Computation replicated to all cores. + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=2) + strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + strategy._enable_packed_variable_in_eager_mode = enable_packed_var + + # Computation on the 1st core. 
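+    # strategy2 below uses a one-replica device assignment, so it runs on the first core only.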
+ device_assignment2 = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + strategy2 = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + + def computation(x): + return math_ops.square(x) + + @def_function.function + def train_step(): + outputs = strategy.experimental_local_results( + strategy.run(computation, args=([2., 2.],))) + outputs2 = strategy2.run( + computation, args=([outputs[0]],)) + return outputs2 + + self.assertAllEqual([[16., 16.]], train_step()) + + def test_device_switch_case(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) + with strategy.scope(): + a = variables.Variable(1) + + inference_iteration = variables.Variable(-1) + + def inference_fn(x, i): + return a + x + i + + @def_function.function + def run_inference(x): + + def do_inference(device, inference_fn, i): + with ops.device(device): + return inference_fn(x, i) + + branch_fns = { + 0: (lambda: do_inference("/device:TPU:0", inference_fn, 0)), + 1: (lambda: do_inference("/device:TPU:1", inference_fn, 1)), + } + branch_index = inference_iteration.assign_add(1, use_locking=True) % 2 + return control_flow_ops.switch_case(branch_index, branch_fns) + + self.assertAllEqual(2., run_inference(1)) # Use TPU core 0. + self.assertAllEqual(3., run_inference(1)) # Use TPU core 1. + + def test_recover_from_compilation_failures(self, enable_packed_var): + # TODO(b/148150981): Stop skipping this test once recovery works + # for non-local TPU. + if FLAGS.tpu: + self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) + strategy = get_tpu_strategy(enable_packed_var) + + @def_function.function + def compilation_failure_run(): + + def computation(): + return random_ops.random_gamma([10], [0.5, 1.5]) + + return strategy.run(computation) + + with self.assertRaisesRegex(errors.InvalidArgumentError, + "TPU compilation failed"): + compilation_failure_run() + + @def_function.function + def good_run(): + + def computation(): + return random_ops.random_normal([10]) + + return strategy.run(computation) + + good_run() + + def test_dynamic_shape_with_outside_compilation_failure( + self, enable_packed_var): + # Enable automatic outside compilation. + config.set_soft_device_placement(True) + strategy = get_tpu_strategy(enable_packed_var) + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + + def test_computation_on_subset_cores(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + all_core_strategy = tpu_lib.TPUStrategy(resolver) + all_core_strategy._enable_packed_variable_in_eager_mode = enable_packed_var + + with all_core_strategy.scope(): + v = variables.Variable(0.0, + aggregation=variables.VariableAggregation.MEAN) + + # Computation on the 1st core. 
+ device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + first_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + first_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + # Computation on the 2nd core. + device_assignment2 = device_assignment_lib.DeviceAssignment( + topology, [[[0, 0, 0, 1]]]) + second_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + second_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + @def_function.function + def train_step(): + + def step_fn(): + return v + 1.0 + + all_core_strategy.run(step_fn) + r1 = first_core_strategy.run(step_fn) + r2 = second_core_strategy.run(step_fn) + return r1 + r2 + + train_step() + self.assertAllEqual(2., train_step()) + + def test_worker_devices_on_subset_cores(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + + # Strategy for the 1st core. + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + first_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + first_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + # Strategy for the 2nd core. + device_assignment2 = device_assignment_lib.DeviceAssignment( + topology, [[[0, 0, 0, 1]]]) + second_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + second_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + self.assertLen(first_core_strategy.extended.worker_devices, 1) + self.assertEndsWith(first_core_strategy.extended.worker_devices[0], + "device:TPU:0") + + self.assertLen(second_core_strategy.extended.worker_devices, 1) + self.assertEndsWith(second_core_strategy.extended.worker_devices[0], + "device:TPU:1") + + def test_control_output_in_while_body_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) with strategy.scope(): v = variables.Variable( @@ -307,8 +327,8 @@ class TPUStrategyTest(test.TestCase): train_step() self.assertEqual(2.0, v.numpy()) - def test_cluster_in_graph_and_while_body_fn(self): - strategy = get_tpu_strategy() + def test_cluster_in_graph_and_while_body_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) @def_function.function def train_step(): @@ -328,8 +348,8 @@ class TPUStrategyTest(test.TestCase): sum_val = train_step().numpy().astype(float) self.assertEqual(sum_val, strategy.num_replicas_in_sync * 10) - def test_two_clusters_with_same_fn(self): - strategy = get_tpu_strategy() + def test_two_clusters_with_same_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) @def_function.function def foo(x): @@ -342,8 +362,8 @@ class TPUStrategyTest(test.TestCase): bar(1) - def test_using_external_variable_inside_tf_function(self): - strategy = get_tpu_strategy() + def test_using_external_variable_inside_tf_function(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) dataset = dataset_ops.Dataset.range( strategy.num_replicas_in_sync * 2, output_type=dtypes.float32).batch(strategy.num_replicas_in_sync) @@ -364,8 +384,8 @@ class TPUStrategyTest(test.TestCase): strategy.experimental_local_results(train_step(next(input_iterator)))) # TODO(b/145574622): Remove this test once it is re-enabled in values_test.py. 
- def test_all_reduce_on_sync_on_read_variable(self): - strategy = get_tpu_strategy() + def test_all_reduce_on_sync_on_read_variable(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) dataset = dataset_ops.Dataset.range( strategy.num_replicas_in_sync, output_type=dtypes.float32).batch( strategy.num_replicas_in_sync, drop_remainder=True) @@ -404,8 +424,8 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual((0.,), w.read_value()) # TODO(b/140633529): Re-enable the test. - def disable_test_experimental_run_output_on_device(self): - strategy = get_tpu_strategy() + def disable_test_experimental_run_output_on_device(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) def computation(x): return math_ops.square(x) @@ -423,8 +443,8 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual("/job:localhost/replica:0/task:0/device:TPU:1", results[1].backing_device) - def test_composite_input(self): - strategy = get_tpu_strategy() + def test_composite_input(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) if strategy.num_replicas_in_sync != 2: self.skipTest("Test assumes two replicas.") @@ -463,8 +483,9 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual(result, [[[0.0, 1.0], [3.0, 8.0]], [[0.0, 1.0], [3.0, 8.0]]]) - def test_composite_input_dynamic_shapes_outside_compilation(self): - strategy = get_tpu_strategy() + def test_composite_input_dynamic_shapes_outside_compilation( + self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) if strategy.num_replicas_in_sync != 2: self.skipTest("Test assumes two replicas.") @@ -506,11 +527,11 @@ class TPUStrategyTest(test.TestCase): result = sparse_lookup(dataset) self.assertAllEqual(result, [[0.0, 2.0], [1.5, 5.0]]) - def test_per_device_tracing_of_mirrored_variables(self): + def test_per_device_tracing_of_mirrored_variables(self, enable_packed_var): # Define trace_count as a list to avoid python scoping error trace_count = [0] - strategy = get_tpu_strategy() + strategy = get_tpu_strategy(enable_packed_var) with strategy.scope(): variable = variables.Variable(0.0) @@ -527,7 +548,10 @@ class TPUStrategyTest(test.TestCase): with strategy.scope(): update_variable.get_concrete_function() - self.assertEqual(trace_count[0], len(strategy.extended.worker_devices)) + self.assertLen(strategy.extended.worker_devices, trace_count[0]) + + +class TPUStrategyDataPrefetchTest(test.TestCase): def test_prefetch_to_device_default(self): strategy = get_tpu_strategy() diff --git a/tensorflow/python/distribute/tpu_values.py b/tensorflow/python/distribute/tpu_values.py index 40ab058ac7c..33885531966 100644 --- a/tensorflow/python/distribute/tpu_values.py +++ b/tensorflow/python/distribute/tpu_values.py @@ -24,6 +24,7 @@ from __future__ import print_function import contextlib +from tensorflow.python.distribute import packed_distributed_variable as packed from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import tape @@ -46,15 +47,27 @@ def _maybe_enter_graph(tensor): yield +@contextlib.contextmanager +def _maybe_on_device(var): + # Add a device scope for packed variables. + if isinstance(var, packed.PackedVarAndDevice): + with ops.device(var.device): + yield + else: + yield + + def _make_raw_assign_fn(raw_assign_fn): # pylint: disable=missing-docstring def assign_fn(var, value, use_locking=False, name=None, read_value=True): # pylint: disable=missing-docstring del use_locking # Unused. 
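+    # For a packed variable, the assignment must also run under the component's
+    # device scope; `_maybe_on_device` adds that scope when needed.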
- with _maybe_enter_graph(var.handle): + handle = var.handle + with _maybe_enter_graph(handle), _maybe_on_device(var): op = raw_assign_fn( - var.handle, ops.convert_to_tensor(value, dtype=var.dtype), name=name) - + handle, + ops.convert_to_tensor(value, dtype=var.dtype), + name=name) with ops.control_dependencies([op]): return var._read_variable_op() if read_value else op # pylint: disable=protected-access @@ -97,23 +110,37 @@ class TPUVariableMixin(object): @property def handle(self): + """The handle by which this variable can be accessed.""" # If we're in a tpu.rewrite(), return the replicated handle. tpu_context = enclosing_tpu_context() if tpu_context is None or context.executing_eagerly(): return self._get_on_device_or_primary().handle else: - return tpu_context.get_replicated_var_handle(self._handle_id, - self._values, - self._is_mirrored()) + is_packed = self._packed_var is not None + val = self._values + if is_packed: + val = [self._packed_var] + + return tpu_context.get_replicated_var_handle(self._handle_id, val, + self._is_mirrored(), + is_packed) @property def device(self): return self.handle.device def _read_variable_op(self): + """Reads the value of this variable.""" if self.trainable: tape.variable_accessed(self) - return gen_resource_variable_ops.read_variable_op(self.handle, self.dtype) + + handle = self.handle + if getattr(handle, "is_packed", False): + # Add a device scope for a packed variable handle. + with ops.device(self._get_on_device_or_primary().device): + return gen_resource_variable_ops.read_variable_op(handle, self.dtype) + else: + return gen_resource_variable_ops.read_variable_op(handle, self.dtype) def read_value(self): if enclosing_tpu_context() is None: diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 60b2ea4fe31..37643e03b18 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -472,6 +472,12 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, # variable. self._var_policy = var_policy + @property + def _devices(self): + if self._packed_var is not None: + return tuple(d for d in self._packed_var.devices) + return tuple(v.device for v in self._values) + def is_initialized(self, name=None): """Identifies if all the component variables are initialized. @@ -482,6 +488,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, The op that evaluates to True or False depending on if all the component variables are initialized. 
""" + if self._packed_var is not None: + return self._packed_var.is_initialized() result = self._primary.is_initialized() # We iterate through the list of values except the last one to allow us to # name the final `logical_and` op the same name that is passed by the user @@ -552,6 +560,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, def aggregation(self): return self._aggregation + @property + def _packed_variable(self): + return self._packed_var + @property def handle(self): replica_id = values_util.get_current_replica_id_as_int() @@ -559,6 +571,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, raise ValueError("`handle` is not available outside the replica context" " or a `tf.distribute.Strategy.update()` call.") else: + if self._packed_var is not None: + return self._packed_var.handle return self._values[replica_id].handle def eval(self, session=None): @@ -607,6 +621,33 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, def _in_graph_mode(self): return self._primary._in_graph_mode # pylint: disable=protected-access + def _get_replica(self, replica_id): + """Returns the value on a device with the given replica_id.""" + if self._packed_var is not None: + return self._packed_var.on_device(self._devices[replica_id]) + return self._values[replica_id] + + def _get(self): + """Returns the value for the current device or raises a ValueError.""" + replica_id = values_util.get_current_replica_id_as_int() + if replica_id is None: + return self._get_cross_replica() + else: + return self._get_replica(replica_id) + + def _get_on_device_or_primary(self): + """Returns value in same replica or device if possible, else the _primary.""" + replica_id = values_util.get_current_replica_id_as_int() + if replica_id is None: + # Try to find a value on the current device. 
+ current_device = device_util.canonicalize(device_util.current()) + for i, value in enumerate(self._values): + if device_util.canonicalize(value.device) == current_device: + return self._get_replica(i) + return self._get_replica(0) + else: + return self._get_replica(replica_id) + def read_value(self): with ds_context.enter_or_assert_strategy(self._distribute_strategy): return array_ops.identity(self._get()) @@ -778,7 +819,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, if ds_context.in_cross_replica_context(): update_replica_id = distribute_lib.get_update_replica_id() if update_replica_id is not None: - return update_fn(self._values[update_replica_id], value, **kwargs) + replica_value = self._get_replica(update_replica_id) + return update_fn(replica_value, value, **kwargs) return self._update_cross_replica(update_fn, value, **kwargs) else: values_util.assert_replica_context(self.distribute_strategy) @@ -802,6 +844,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, obj_map[v] = new_obj resource_map[v.handle] = new_obj.handle obj_map[self] = new_obj + resource_map[self.handle] = new_obj.handle resource_map[self] = new_obj.handle return obj_map, resource_map @@ -835,6 +878,12 @@ class _MirroredSaveable(saveable_object_util.ResourceVariableSaveable): def restore(self, restored_tensors, restored_shapes): """Restore the same value into all variables.""" tensor, = restored_tensors + packed_var = self._mirrored_variable._packed_variable # pylint: disable=protected-access + if packed_var is not None: + return control_flow_ops.group( + tuple( + values_util.assign_on_device(d, packed_var, tensor) + for d in packed_var.devices)) return control_flow_ops.group( tuple( values_util.assign_on_device(v.device, v, tensor) @@ -1013,7 +1062,7 @@ class SyncOnReadVariable(DistributedVariable): def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self._primary + return self._get_replica(0) with ds_context.enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index 0cb4d6ddd2a..d0e3eec22a8 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -42,7 +42,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op -from tensorflow.python.framework import device from tensorflow.python.framework import dtypes from tensorflow.python.framework import indexed_slices from tensorflow.python.framework import ops @@ -234,11 +233,11 @@ class DistributedValuesTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, # TODO(b/137795644): support CentralStroageStrategy # strategy_combinations.central_storage_strategy_with_two_gpus, ], - mode=["eager"] - )) + mode=["eager"])) def testMakeDistributedValueDefaultDevicePlacement(self, distribution): if not tf2.enabled(): self.skipTest("Only V2 is supported.") @@ -259,11 +258,11 @@ class DistributedValuesTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, # 
TODO(b/137795644): support CentralStroageStrategy # strategy_combinations.central_storage_strategy_with_two_gpus, ], - mode=["eager"] - )) + mode=["eager"])) def testMakeDistributedValueExplicitDevicePlacement(self, distribution): if not tf2.enabled(): self.skipTest("Only V2 is supported.") @@ -384,6 +383,16 @@ def _make_mirrored(): return mirrored +def mirrored_and_tpu_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, + ], + mode=["graph", "eager"]) + + class RegroupAndSelectDeviceTest(test.TestCase, parameterized.TestCase): def _is_per_replica(self, result, expected, klass=values.PerReplica): @@ -563,6 +572,7 @@ class RegroupAndSelectDeviceTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], synchronization=[ @@ -708,29 +718,40 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): self.evaluate( distribution.experimental_local_results(distribution.run(assign))) - def testPackedVariable(self, distribution, synchronization, aggregation): + +@combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.tpu_strategy, + ], + mode=["eager"])) +class PackedDistributedVariableTest(test.TestCase, parameterized.TestCase): + + def testPackedVariable(self, distribution): with distribution.scope(): - v0 = variables_lib.Variable( - 0., synchronization=synchronization, aggregation=aggregation) - if not isinstance(v0, values.DistributedVariable): - self.skipTest("This test doesn't apply to non DistributedVariables") - - self.assertEqual(v0._packed_var, None) - - device_type = device.DeviceSpec.from_string(v0._devices[0]).device_type - for d in v0._devices: - if device.DeviceSpec.from_string(d).device_type != device_type: - self.skipTest("Packing variables on devices of different types " - "is not supported yet.") + v0 = variables_lib.Variable(0.) 
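+    # Packed variables are disabled by default, so v0 has no packed companion.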
+ self.assertIsNone(v0._packed_var) distribution._enable_packed_variable_in_eager_mode = True with distribution.scope(): - v1 = variables_lib.Variable( - 0., synchronization=synchronization, aggregation=aggregation) - if ops.executing_eagerly_outside_functions(): + v1 = variables_lib.Variable(0) self.assertIsInstance(v1._packed_var, packed.PackedDistributedVariable) - else: - self.assertEqual(v1._packed_var, None) + + devices = v1._devices + for i in range(1, len(devices)): + with distribute_lib.ReplicaContext(distribution, i): + v1.assign(i) + val = v1._get() + self.assertIsInstance(val, packed.PackedVarAndDevice) + self.assertEqual(val.device, devices[0]) + self.assertEqual(self.evaluate(val.read_value()), 0) + for i in range(0, len(devices)): + with distribute_lib.ReplicaContext(distribution, i): + val = v1._get() + self.assertIsInstance(val, packed.PackedVarAndDevice) + self.assertEqual(val.device, devices[i]) + self.assertEqual(self.evaluate(val.read_value()), i) class MirroredVariableTest(test.TestCase, parameterized.TestCase): @@ -920,6 +941,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testAssignValueInReplicaContextWithoutAggregation(self, distribution): @@ -943,6 +965,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testValueInReplicaContext(self, distribution): @@ -968,6 +991,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testAssignOutOfScope(self, distribution): @@ -1041,6 +1065,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testInitializedToSameValueInsideEagerRun(self, distribution): @@ -1066,6 +1091,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testAggregationOnlyFirstReplica(self, distribution): @@ -1093,6 +1119,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testInitScope(self, distribution): @@ -1143,13 +1170,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution.experimental_local_results(distribution.run(add))) self.assertAllEqual([2, 2], per_replica_results) - @combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy, - ], - mode=["graph", "eager"])) + 
@combinations.generate(mirrored_and_tpu_strategy_combinations()) def testAssignAdd(self, distribution): with distribution.scope(): v = variable_scope.variable( @@ -1456,15 +1477,6 @@ class SyncOnReadVariablePropertiesTest(test.TestCase): self.assertEqual(2., self.evaluate(add1(replica_local))) -def mirrored_and_tpu_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy, - ], - mode=["graph", "eager"]) - - # TODO(b/144432582): Add variable aggregation type to combinations to simplify # tests. def strategy_and_run_tf_function_combinations(): @@ -1478,6 +1490,7 @@ def strategy_and_run_tf_function_combinations(): experimental_run_tf_function=[True, False]) + combinations.combine( distribution=[ strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"], experimental_run_tf_function=[True]) diff --git a/tensorflow/python/distribute/values_util.py b/tensorflow/python/distribute/values_util.py index ddb0d2d0401..5909bdd229e 100644 --- a/tensorflow/python/distribute/values_util.py +++ b/tensorflow/python/distribute/values_util.py @@ -61,8 +61,14 @@ def on_write_assign_sub(var, value, use_locking=False, name=None, def assign_on_each_device(var, assign_func, value, read_value): - update = control_flow_ops.group( - tuple(assign_func(v.device, v, value) for v in var._values)) # pylint: disable=protected-access + """Update the variable on each replica with the given assign_func and value.""" + if var._packed_variable is not None: # pylint: disable=protected-access + update = control_flow_ops.group( + tuple( + assign_func(d, var._packed_variable, value) for d in var._devices)) # pylint: disable=protected-access + else: + update = control_flow_ops.group( + tuple(assign_func(v.device, v, value) for v in var._values)) # pylint: disable=protected-access if not read_value: return update with ops.control_dependencies([update] if update else []): @@ -104,7 +110,7 @@ def on_read_assign_cross_replica(var, value, read_value=True): # TODO(anjs): Should this be over all the replicas in sync since we # call `reduce` on the variable during read? if var.aggregation == vs.VariableAggregation.SUM: - tensor = math_ops.cast(tensor / len(var._values), var.dtype) # pylint: disable=protected-access + tensor = math_ops.cast(tensor / len(var._devices), var.dtype) # pylint: disable=protected-access return assign_on_each_device(var, assign_on_device, tensor, read_value) diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index ce3aaa8a058..6f5f0bc26c2 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -298,7 +298,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): self._pivot = pivot self._replicated_vars = {} - def get_replicated_var_handle(self, name, vars_, is_mirrored=False): + def get_replicated_var_handle(self, name, vars_, is_mirrored=False, + is_packed=False): """Returns a variable handle for replicated TPU variable 'var'. This is a method used by an experimental replicated variable implementation @@ -309,6 +310,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): vars_: The replicated TPU variables. is_mirrored: Whether the variables are mirrored, which guarantees the values in each replica are always the same. + is_packed: Whether the replicated variables are packed into one variable. Returns: The handle of the TPU replicated input node. 
@@ -320,7 +322,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): if handle is not None: return handle - if device_assignment is not None: + if device_assignment is not None and not is_packed: # Find a variable copy for each replica in the device assignment. # Note that the order of devices for replicas for the variable and the # device assignment might not match. @@ -356,7 +358,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): graph._set_control_flow_context(self.outer_context) handle = tpu_ops.tpu_replicated_input([v.handle for v in replicated_vars], name=name + "/handle", - is_mirrored_variable=is_mirrored) + is_mirrored_variable=is_mirrored, + is_packed=is_packed) graph._set_control_flow_context(saved_context) # pylint: enable=protected-access self._replicated_vars[name] = handle From 62082d40720b56974436dd17625c247a5fce2a6b Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 18 Jun 2020 20:16:19 -0700 Subject: [PATCH 0588/1390] Add build flags for objc libraries PiperOrigin-RevId: 317235962 Change-Id: I976ccd1ce3db49be3acac44f60b2dc44ed25d767 --- tensorflow/lite/experimental/objc/BUILD.apple | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index ff7e8fa58e9..09d4547813a 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -97,7 +97,7 @@ objc_library( "//tensorflow/lite:testdata/add.bin", "//tensorflow/lite:testdata/add_quantized.bin", ], - tags = TFL_DEFAULT_TAGS, + tags = TFL_DEFAULT_TAGS + ["builder_default_ios_x86_64"], deps = [ ":TensorFlowLite", ], @@ -135,7 +135,10 @@ objc_library( "apis", ], module_name = "TestApp", - tags = TFL_DEFAULT_TAGS + ["manual"], + tags = TFL_DEFAULT_TAGS + [ + "manual", + "builder_default_ios_x86_64", + ], deps = [ ":TensorFlowLite", ], From 85ad8031f60536361de71dd689c9d88848fefed6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 18 Jun 2020 20:27:03 -0700 Subject: [PATCH 0589/1390] Expand dtype support for Neg PiperOrigin-RevId: 317237033 Change-Id: I59c5e45d469f7bf704976b66bc122aaac3982b5e --- .../mlir/tensorflow/ir/tf_generated_ops.td | 4 +-- tensorflow/core/kernels/BUILD | 3 ++- .../core/kernels/cwise_op_gpu_neg.cu.cc | 4 +-- .../{cwise_op_neg.cc => cwise_op_neg_1.cc} | 6 ++--- tensorflow/core/kernels/cwise_op_neg_2.cc | 26 +++++++++++++++++++ tensorflow/core/ops/math_ops.cc | 12 ++++----- .../kernel_tests/cwise_ops_unary_test.py | 6 +++++ 7 files changed, 46 insertions(+), 15 deletions(-) rename tensorflow/core/kernels/{cwise_op_neg.cc => cwise_op_neg_1.cc} (87%) create mode 100644 tensorflow/core/kernels/cwise_op_neg_2.cc diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index dcd083fc398..3b1f3eec699 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -6059,11 +6059,11 @@ I.e., \\(y = -x\\). 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index ffe2a035591..279dff92c58 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -6802,7 +6802,8 @@ filegroup( "cwise_op_minimum.cc", "cwise_op_mul_1.cc", "cwise_op_mul_2.cc", - "cwise_op_neg.cc", + "cwise_op_neg_1.cc", + "cwise_op_neg_2.cc", "cwise_op_pow.cc", "cwise_op_real.cc", "cwise_op_reciprocal.cc", diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc index ea1ca623560..4f7bb9b2075 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { namespace functor { -DEFINE_UNARY7(neg, Eigen::half, float, double, int32, int64, complex64, - complex128); +DEFINE_UNARY4(neg, int8, int16, int32, int64); +DEFINE_UNARY6(neg, Eigen::half, float, double, bfloat16, complex64, complex128); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg_1.cc similarity index 87% rename from tensorflow/core/kernels/cwise_op_neg.cc rename to tensorflow/core/kernels/cwise_op_neg_1.cc index f52cf6c8b91..18a7c61be90 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg_1.cc @@ -16,8 +16,7 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER8(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32, - complex64, int64, complex128, bfloat16); +REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8, int16, int32, int64); #ifdef TENSORFLOW_USE_SYCL REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64); @@ -30,8 +29,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg") #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, - complex64, complex128); +REGISTER3(UnaryOp, GPU, "Neg", functor::neg, int8, int16, int64); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel diff --git a/tensorflow/core/kernels/cwise_op_neg_2.cc b/tensorflow/core/kernels/cwise_op_neg_2.cc new file mode 100644 index 00000000000..5ea78ad665c --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_neg_2.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER6(UnaryOp, CPU, "Neg", functor::neg, Eigen::half, float, double, + bfloat16, complex64, complex128); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +REGISTER6(UnaryOp, GPU, "Neg", functor::neg, Eigen::half, float, double, + bfloat16, complex64, complex128); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index b81bb9d3afc..2a70f420260 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -201,12 +201,12 @@ REGISTER_OP("ComplexAbs") .SetShapeFn(shape_inference::UnchangedShape); // Declares cwise unary operations signature: 't -> 't -#define UNARY() \ - Input("x: T") \ - .Output("y: T") \ - .Attr( \ - "T: {bfloat16, half, float, double, int32, int64, complex64, " \ - "complex128}") \ +#define UNARY() \ + Input("x: T") \ + .Output("y: T") \ + .Attr( \ + "T: {bfloat16, half, float, double, int8, int16, int32, int64, " \ + "complex64, complex128}") \ .SetShapeFn(shape_inference::UnchangedShape) #define UNARY_REAL() \ diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py index f4beaabc29a..df848a653d4 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py @@ -389,16 +389,22 @@ class UnaryOpTest(test.TestCase): 2).reshape(1, 3, 2).astype(dtypes_lib.bfloat16.as_numpy_dtype) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt8Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt16Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int16) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt32Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int32) From 2a05589bd4f5e3042d1baf539e564d7ab9bd6287 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 18 Jun 2020 20:48:49 -0700 Subject: [PATCH 0590/1390] Add inference instruction for iOS PiperOrigin-RevId: 317239235 Change-Id: I55bd7e43bc286f34024ccfc27db61d28304a651d --- tensorflow/lite/g3doc/guide/inference.md | 425 +++++++++++++++-------- 1 file changed, 287 insertions(+), 138 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/inference.md b/tensorflow/lite/g3doc/guide/inference.md index 5f3fba98cff..6e47d6d5190 100644 --- a/tensorflow/lite/g3doc/guide/inference.md +++ b/tensorflow/lite/g3doc/guide/inference.md @@ -7,9 +7,9 @@ inference with a TensorFlow Lite model, you must run it through an The interpreter uses a static graph ordering and a custom (less-dynamic) memory allocator to ensure minimal load, initialization, and execution latency. -This page describes how to access to the TensorFlow Lite interpreter and -perform an inference using C++, Java, and Python, plus links to other resources -for each [supported platform](#supported-platforms). 
+This page describes how to access to the TensorFlow Lite interpreter and perform +an inference using C++, Java, and Python, plus links to other resources for each +[supported platform](#supported-platforms). [TOC] @@ -17,31 +17,31 @@ for each [supported platform](#supported-platforms). TensorFlow Lite inference typically follows the following steps: -1. **Loading a model** +1. **Loading a model** - You must load the `.tflite` model into memory, which contains the model's - execution graph. + You must load the `.tflite` model into memory, which contains the model's + execution graph. -1. **Transforming data** +1. **Transforming data** - Raw input data for the model generally does not match the input data format - expected by the model. For example, you might need to resize an image or - change the image format to be compatible with the model. + Raw input data for the model generally does not match the input data format + expected by the model. For example, you might need to resize an image or + change the image format to be compatible with the model. -1. **Running inference** +1. **Running inference** - This step involves using the TensorFlow Lite API to execute the model. It - involves a few steps such as building the interpreter, and allocating - tensors, as described in the following sections. + This step involves using the TensorFlow Lite API to execute the model. It + involves a few steps such as building the interpreter, and allocating + tensors, as described in the following sections. -1. **Interpreting output** +1. **Interpreting output** - When you receive results from the model inference, you must interpret the - tensors in a meaningful way that's useful in your application. + When you receive results from the model inference, you must interpret the + tensors in a meaningful way that's useful in your application. - For example, a model might return only a list of probabilities. It's up to - you to map the probabilities to relevant categories and present it to your - end-user. + For example, a model might return only a list of probabilities. It's up to + you to map the probabilities to relevant categories and present it to your + end-user. ## Supported platforms @@ -54,8 +54,8 @@ should be no surprise that the APIs try to avoid unnecessary copies at the expense of convenience. Similarly, consistency with TensorFlow APIs was not an explicit goal and some variance between languages is to be expected. -Across all libraries, the TensorFlow Lite API enables you to load models, -feed inputs, and retrieve inference outputs. +Across all libraries, the TensorFlow Lite API enables you to load models, feed +inputs, and retrieve inference outputs. ### Android @@ -64,8 +64,8 @@ APIs. The Java APIs provide convenience and can be used directly within your Android Activity classes. The C++ APIs offer more flexibility and speed, but may require writing JNI wrappers to move data between Java and C++ layers. -See below for details about using C++ and Java, or -follow the [Android quickstart](android.md) for a tutorial and example code. +See below for details about using C++ and Java, or follow the +[Android quickstart](android.md) for a tutorial and example code. #### TensorFlow Lite Android wrapper code generator @@ -86,103 +86,36 @@ On iOS, TensorFlow Lite is available with native iOS libraries written in [Swift](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) and [Objective-C](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc). 
+You can also use +[C API](https://www.tensorflow.org/code/tensorflow/lite/c/c_api.h) +directly in Objective-C codes. -This page doesn't include a discussion for about these languages, so you should -refer to the [iOS quickstart](ios.md) for a tutorial and example code. +See below for details about using Swift, Objective-C and C API, or follow the +[iOS quickstart](ios.md) for a tutorial and example code. ### Linux On Linux platforms (including [Raspberry Pi](build_rpi.md)), you can run -inferences using TensorFlow Lite APIs available in C++ and Python, as shown -in the following sections. +inferences using TensorFlow Lite APIs available in C++ and Python, as shown in +the following sections. +## Running a model -## Load and run a model in C++ +Running a TensorFlow Lite model involves a few simple steps: -Running a TensorFlow Lite model with C++ involves a few simple steps: - - 1. Load the model into memory as a `FlatBufferModel`. - 2. Build an `Interpreter` based on an existing `FlatBufferModel`. - 3. Set input tensor values. (Optionally resize input tensors if the - predefined sizes are not desired.) - 4. Invoke inference. - 5. Read output tensor values. - -The [`FlatBufferModel`]( -https://www.tensorflow.org/lite/api_docs/cc/class/tflite/flat-buffer-model.html) -class encapsulates a TensorFlow Lite model and you can -build it in a couple of different ways, depending on where the model is stored: - -```c++ -class FlatBufferModel { -  // Build a model based on a file. Return a nullptr in case of failure. -  static std::unique_ptr BuildFromFile( -      const char* filename, -      ErrorReporter* error_reporter); - -  // Build a model based on a pre-loaded flatbuffer. The caller retains -  // ownership of the buffer and should keep it alive until the returned object -  // is destroyed. Return a nullptr in case of failure. -  static std::unique_ptr BuildFromBuffer( -      const char* buffer, -      size_t buffer_size, -      ErrorReporter* error_reporter); -}; -``` - -Note: If TensorFlow Lite detects the presence of the [Android NNAPI]( -https://developer.android.com/ndk/guides/neuralnetworks), it will -automatically try to use shared memory to store the `FlatBufferModel`. - -Now that you have the model as a `FlatBufferModel` object, you can execute it -with an [`Interpreter`]( -https://www.tensorflow.org/lite/api_docs/cc/class/tflite/interpreter.html). -A single `FlatBufferModel` can be used -simultaneously by more than one `Interpreter`. - -Caution: The `FlatBufferModel` object must remain valid until -all instances of `Interpreter` using it have been destroyed. - -The important parts of the `Interpreter` API are shown in the -code snippet below. It should be noted that: - - * Tensors are represented by integers, in order to avoid string comparisons - (and any fixed dependency on string libraries). - * An interpreter must not be accessed from concurrent threads. - * Memory allocation for input and output tensors must be triggered - by calling `AllocateTensors()` right after resizing tensors. - -The simplest usage of TensorFlow Lite with C++ looks like this: - -```c++ -// Load the model -std::unique_ptr model = - tflite::FlatBufferModel::BuildFromFile(filename); - -// Build the interpreter -tflite::ops::builtin::BuiltinOpResolver resolver; -std::unique_ptr interpreter; -tflite::InterpreterBuilder(*model, resolver)(&interpreter); - -// Resize input tensors, if desired. -interpreter->AllocateTensors(); - -float* input = interpreter->typed_input_tensor(0); -// Fill `input`. 
- -interpreter->Invoke(); - -float* output = interpreter->typed_output_tensor(0); -``` - -For more example code, see [`minimal.cc`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/minimal/minimal.cc) -and [`label_image.cc`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/label_image/label_image.cc). +1. Load the model into memory. +2. Build an `Interpreter` based on an existing model. +3. Set input tensor values. (Optionally resize input tensors if the predefined + sizes are not desired.) +4. Invoke inference. +5. Read output tensor values. +Following sections describe how these steps can be done in each language. ## Load and run a model in Java +*Platform: Android* + The Java API for running an inference with TensorFlow Lite is primarily designed for use with Android, so it's available as an Android library dependency: `org.tensorflow:tensorflow-lite`. @@ -203,12 +136,12 @@ public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer); ``` In both cases, you must provide a valid TensorFlow Lite model or the API throws -`IllegalArgumentException`. If you use `MappedByteBuffer` to -initialize an `Interpreter`, it must remain unchanged for the whole lifetime -of the `Interpreter`. +`IllegalArgumentException`. If you use `MappedByteBuffer` to initialize an +`Interpreter`, it must remain unchanged for the whole lifetime of the +`Interpreter`. -To then run an inference with the model, simply call `Interpreter.run()`. -For example: +To then run an inference with the model, simply call `Interpreter.run()`. For +example: ```java try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) { @@ -228,9 +161,9 @@ In this case, each entry in `inputs` corresponds to an input tensor and output data. In both cases, the tensor indices should correspond to the values you gave to -the [TensorFlow Lite Converter](../convert/) when you created the model. -Be aware that the order of tensors in `input` must match the -order given to the TensorFlow Lite Converter. +the [TensorFlow Lite Converter](../convert/) when you created the model. Be +aware that the order of tensors in `input` must match the order given to the +TensorFlow Lite Converter. The `Interpreter` class also provides convenient functions for you to get the index of any model input or output using an operation name: @@ -250,8 +183,8 @@ resources must be released after use by: interpreter.close(); ``` -For an example project with Java, see the [Android image classification sample]( -https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android). +For an example project with Java, see the +[Android image classification sample](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android). ### Supported data types (in Java) @@ -295,13 +228,231 @@ have dynamic outputs, where the shape of output tensors can vary depending on the input. There's no straightforward way of handling this with the existing Java inference API, but planned extensions will make this possible. +## Load and run a model in Swift + +*Platform: iOS* + +The +[Swift API](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) +is available in `TensorFlowLiteSwift` Pod from Cocoapods. + +First, you need to import `TensorFlowLite` module. + +```swift +import TensorFlowLite +``` + +```swift +// Getting model path +guard + let modelPath = Bundle.main.path(forResource: "model", ofType: "tflite") +else { + // Error handling... 
+} + +do { + // Initialize an interpreter with the model. + let interpreter = try Interpreter(modelPath: modelPath) + + // Allocate memory for the model's input `Tensor`s. + try interpreter.allocateTensors() + + let inputData: Data // Should be initialized + + // input data preparation... + + // Copy the input data to the input `Tensor`. + try self.interpreter.copy(inputData, toInputAt: 0) + + // Run inference by invoking the `Interpreter`. + try self.interpreter.invoke() + + // Get the output `Tensor` + let outputTensor = try self.interpreter.output(at: 0) + + // Copy output to `Data` to process the inference results. + let outputSize = outputTensor.shape.dimensions.reduce(1, {x, y in x * y}) + let outputData = + UnsafeMutableBufferPointer.allocate(capacity: outputSize) + outputTensor.data.copyBytes(to: outputData) + + if (error != nil) { /* Error handling... */ } +} catch error { + // Error handling... +} +``` + +## Load and run a model in Objective-C + +*Platform: iOS* + +The +[Objective-C API](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc) +is available in `TensorFlowLiteObjC` Pod from Cocoapods. + +First, you need to import `TensorFlowLite` module. + +```objc +@import TensorFlowLite; +``` + +```objc +NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"model" + ofType:@"tflite"]; +NSError *error; + +// Initialize an interpreter with the model. +TFLInterpreter *interpreter = [[TFLInterpreter alloc] initWithModelPath:modelPath + error:&error]; +if (error != nil) { /* Error handling... */ } + +// Allocate memory for the model's input `TFLTensor`s. +[interpreter allocateTensorsWithError:&error]; +if (error != nil) { /* Error handling... */ } + +NSMutableData *inputData; // Should be initialized +// input data preparation... + +// Copy the input data to the input `TFLTensor`. +[interpreter copyData:inputData toInputTensorAtIndex:0 error:&error]; +if (error != nil) { /* Error handling... */ } + +// Run inference by invoking the `TFLInterpreter`. +[interpreter invokeWithError:&error]; +if (error != nil) { /* Error handling... */ } + +// Get the output `TFLTensor` +TFLTensor *outputTensor = [interpreter outputTensorAtIndex:0 error:&error]; +if (error != nil) { /* Error handling... */ } + +// Copy output to `NSData` to process the inference results. +NSData *outputData = [outputTensor dataWithError:&error]; +if (error != nil) { /* Error handling... */ } +``` + +### Using C API in Objective-C code + +Currently Objective-C API does not support delegates. In order to use delegates +with Objective-C code, you need to directly call underlying +[C API](https://www.tensorflow.org/code/tensorflow/lite/c/c_api.h). + +```c +#include "tensorflow/lite/c/c_api.h" +``` + +```c +TfLiteModel* model = TfLiteModelCreateFromFile([modelPath UTF8String]); +TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + +// Create the interpreter. +TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options); + +// Allocate tensors and populate the input tensor data. +TfLiteInterpreterAllocateTensors(interpreter); +TfLiteTensor* input_tensor = + TfLiteInterpreterGetInputTensor(interpreter, 0); +TfLiteTensorCopyFromBuffer(input_tensor, input.data(), + input.size() * sizeof(float)); + +// Execute inference. +TfLiteInterpreterInvoke(interpreter); + +// Extract the output tensor data. 
+const TfLiteTensor* output_tensor = +// TfLiteInterpreterGetOutputTensor(interpreter, 0); +TfLiteTensorCopyToBuffer(output_tensor, output.data(), + output.size() * sizeof(float)); + +// Dispose of the model and interpreter objects. +TfLiteInterpreterDelete(interpreter); +TfLiteInterpreterOptionsDelete(options); +TfLiteModelDelete(model); +``` + +## Load and run a model in C++ + +*Platforms: Android and Linux* + +In C++, the model is stored in +[`FlatBufferModel`](https://www.tensorflow.org/lite/api_docs/cc/class/tflite/flat-buffer-model.html) +class. It encapsulates a TensorFlow Lite model and you can build it in a couple +of different ways, depending on where the model is stored: + +```c++ +class FlatBufferModel { +  // Build a model based on a file. Return a nullptr in case of failure. +  static std::unique_ptr BuildFromFile( +      const char* filename, +      ErrorReporter* error_reporter); + +  // Build a model based on a pre-loaded flatbuffer. The caller retains +  // ownership of the buffer and should keep it alive until the returned object +  // is destroyed. Return a nullptr in case of failure. +  static std::unique_ptr BuildFromBuffer( +      const char* buffer, +      size_t buffer_size, +      ErrorReporter* error_reporter); +}; +``` + +Note: If TensorFlow Lite detects the presence of the +[Android NNAPI](https://developer.android.com/ndk/guides/neuralnetworks), it +will automatically try to use shared memory to store the `FlatBufferModel`. + +Now that you have the model as a `FlatBufferModel` object, you can execute it +with an +[`Interpreter`](https://www.tensorflow.org/lite/api_docs/cc/class/tflite/interpreter.html). +A single `FlatBufferModel` can be used simultaneously by more than one +`Interpreter`. + +Caution: The `FlatBufferModel` object must remain valid until all instances of +`Interpreter` using it have been destroyed. + +The important parts of the `Interpreter` API are shown in the code snippet +below. It should be noted that: + +* Tensors are represented by integers, in order to avoid string comparisons + (and any fixed dependency on string libraries). +* An interpreter must not be accessed from concurrent threads. +* Memory allocation for input and output tensors must be triggered by calling + `AllocateTensors()` right after resizing tensors. + +The simplest usage of TensorFlow Lite with C++ looks like this: + +```c++ +// Load the model +std::unique_ptr model = + tflite::FlatBufferModel::BuildFromFile(filename); + +// Build the interpreter +tflite::ops::builtin::BuiltinOpResolver resolver; +std::unique_ptr interpreter; +tflite::InterpreterBuilder(*model, resolver)(&interpreter); + +// Resize input tensors, if desired. +interpreter->AllocateTensors(); + +float* input = interpreter->typed_input_tensor(0); +// Fill `input`. + +interpreter->Invoke(); + +float* output = interpreter->typed_output_tensor(0); +``` + +For more example code, see +[`minimal.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/minimal/minimal.cc) +and +[`label_image.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/label_image/label_image.cc). ## Load and run a model in Python -The Python API for running an inference is provided in the `tf.lite` -module. From which, you mostly need only [`tf.lite.Interpreter`]( -https://www.tensorflow.org/api_docs/python/tf/lite/Interpreter) to load -a model and run an inference. +*Platform: Linux* + +The Python API for running an inference is provided in the `tf.lite` module. 
+From which, you mostly need only +[`tf.lite.Interpreter`](https://www.tensorflow.org/api_docs/python/tf/lite/Interpreter) +to load a model and run an inference. The following example shows how to use the Python interpreter to load a `.tflite` file and run inference with random input data: @@ -358,13 +509,12 @@ interpreter.allocate_tensors() # Continue to get tensors and so forth, as shown above... ``` -For more Python sample code, see [`label_image.py`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/python/label_image.py). +For more Python sample code, see +[`label_image.py`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/python/label_image.py). Tip: Run `help(tf.lite.Interpreter)` in the Python terminal to get detailed documentation about the interpreter. - ## Write a custom operator All TensorFlow Lite operators (both custom and builtin) are defined using a @@ -379,10 +529,10 @@ typedef struct { } TfLiteRegistration; ``` -Refer to `context.h` for details on `TfLiteContext` and `TfLiteNode`. The -former provides error reporting facilities and access to global objects, -including all the tensors. The latter allows implementations to access their -inputs and outputs. +Refer to `context.h` for details on `TfLiteContext` and `TfLiteNode`. The former +provides error reporting facilities and access to global objects, including all +the tensors. The latter allows implementations to access their inputs and +outputs. When the interpreter loads a model, it calls `init()` once for each node in the graph. A given `init()` will be called more than once if the op is used multiple @@ -403,9 +553,9 @@ implementations can access their state using `node->user_data`. Finally, each time inference runs, the interpreter traverses the graph calling `invoke()`, and here too the state is available as `node->user_data`. -Custom ops can be implemented in exactly the same way as builtin ops, by -defined those four functions and a global registration function that usually -looks like this: +Custom ops can be implemented in exactly the same way as builtin ops, by defined +those four functions and a global registration function that usually looks like +this: ```c++ namespace tflite { @@ -461,8 +611,7 @@ You can optionally register custom ops (before you pass the resolver to the resolver.AddOp("MY_CUSTOM_OP", Register_MY_CUSTOM_OP()); ``` -If the set of builtin ops is deemed to be too large, a new `OpResolver` could -be code-generated based on a given subset of ops, possibly only the ones -contained in a given model. This is the equivalent of TensorFlow's selective -registration (and a simple version of it is available in the `tools` -directory). +If the set of builtin ops is deemed to be too large, a new `OpResolver` could be +code-generated based on a given subset of ops, possibly only the ones contained +in a given model. This is the equivalent of TensorFlow's selective registration +(and a simple version of it is available in the `tools` directory). From 397494a2313aa51fe0b87b4e51d3a2349e4f8ecc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 20:51:42 -0700 Subject: [PATCH 0591/1390] Hoisting unconditional converts from conditional branch computations. 
PiperOrigin-RevId: 317239618 Change-Id: If3b16ff4f2bbcf38ee1ca51f5e8b187c58ab8e91 --- .../xla/service/conditional_code_motion.cc | 285 ++++++++++++++++-- .../xla/service/conditional_code_motion.h | 15 +- .../service/conditional_code_motion_test.cc | 101 ++++++- 3 files changed, 369 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index eecdcc851e9..6db4c3eb6d4 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -106,7 +106,6 @@ class BranchVisitor { boundaries_.emplace_back(operand, i, inst); continue; } - worklist_.push_back(operand); visited_.insert(operand); } @@ -197,6 +196,7 @@ bool WorthHoisting(HloInstruction* instruction) { case HloOpcode::kMultiply: case HloOpcode::kDivide: case HloOpcode::kTuple: + case HloOpcode::kSqrt: case HloOpcode::kGetTupleElement: return true; default: @@ -206,10 +206,11 @@ bool WorthHoisting(HloInstruction* instruction) { // Compare if the instructions to be visited at each branches are identical. bool InstructionWithinBranchIdentical( - const std::vector& instructions, bool is_layout_senstive) { + const std::vector& instructions, + bool is_layout_sensitive) { // Identical includes the shape of each operands are equal. auto eq_operand = [&](const HloInstruction* a, const HloInstruction* b) { - bool eq_operands = is_layout_senstive + bool eq_operands = is_layout_sensitive ? ShapeUtil::Equal(a->shape(), b->shape()) : ShapeUtil::Compatible(a->shape(), b->shape()); return eq_operands; @@ -233,7 +234,7 @@ bool InstructionWithinBranchIdentical( auto old_channel_id = instruction->channel_id(); instruction->set_channel_id(instructions[0]->channel_id()); bool eq_instructions = instructions[0]->Identical( - *instruction, eq_operand, eq_computations, is_layout_senstive); + *instruction, eq_operand, eq_computations, is_layout_sensitive); instruction->set_channel_id(old_channel_id); return eq_instructions; }); @@ -243,7 +244,7 @@ bool InstructionWithinBranchIdentical( [&](HloInstruction* instruction) { return instructions[0]->Identical( *instruction, eq_operand, eq_computations, - is_layout_senstive); + is_layout_sensitive); }); } @@ -354,12 +355,228 @@ Status RemoveInstructionFromComputation( return Status::OK(); } +// Identify converts to be hoisted/rematerialized out of the branch +// computations. +absl::flat_hash_set FindSpecialConverts(HloInstruction* old_root, + int branch_count, + HloInstruction* conditional, + bool is_layout_sensitive) { + absl::flat_hash_set kspecial_convert; + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + if (old_root->operand(operand_num)->opcode() != HloOpcode::kConvert) { + continue; + } + bool replica = true; + HloInstruction* kspecial_convert_candidate = + old_root->mutable_operand(operand_num); + // Check whether an identical candidate appears in other branches + for (int others = 1; others < branch_count; ++others) { + HloInstruction* others_root = + conditional->branch_computation(others)->root_instruction(); + bool eq_shape = + is_layout_sensitive + ? ShapeUtil::Equal(others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()) + : ShapeUtil::Compatible( + others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()); + if ((others_root->operand(operand_num)->opcode() == + HloOpcode::kConvert) && + eq_shape) { + // Nothing to be done. 
+ } else { + replica = false; + break; + } + } + if (replica) { + kspecial_convert.insert(operand_num); + } + } + return kspecial_convert; +} + +// Restructuring the conditional instruction as follows: +// i.e., %result = conditional() becomes +// x = conditional() +// y.{0..n} = gte(x, {0..n}) +// z = tuple(y.0, y.1, ...y.n) +// Doing so ensures that we can accommodate the possible shape-change of the +// conditional when the instructions are hoisted. +Status RestructureConditionalInstruction(HloComputation* computation, + HloInstruction* conditional) { + HloInstruction* old_root = computation->root_instruction(); + std::vector new_operands; + int cur_index = 0; + for (; cur_index < ShapeUtil::TupleElementCount(conditional->shape()); + ++cur_index) { + new_operands.push_back( + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(conditional->shape(), cur_index), + conditional, cur_index))); + } + HloInstruction* new_tuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + if (old_root == conditional) { + computation->set_root_instruction(new_tuple); + } else { + std::vector new_tuple_users; + for (auto conditional_user : conditional->users()) { + auto is_new_gte = absl::c_find_if( + new_operands, + [&](HloInstruction* instr) { return instr == conditional_user; }); + if (is_new_gte == new_operands.end()) { + new_tuple_users.push_back(conditional_user); + } + } + for (auto new_tuple_user : new_tuple_users) { + TF_RETURN_IF_ERROR( + conditional->ReplaceUseWith(new_tuple_user, new_tuple)); + } + } + VLOG(2) << "computation after root restructure:\n" << computation->ToString(); + return Status::OK(); +} + +StatusOr ConvertSpecialMove(HloInstruction* conditional, + bool is_layout_sensitive) { + int branch_count = conditional->branch_count(); + if (branch_count <= 0) { + return false; + } + + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + if (old_root->opcode() != HloOpcode::kTuple) { + return false; + } else { + VLOG(2) << "BEFORE :" << conditional->parent()->parent()->ToString(); + // Identify the gte using `index'. + auto find_gte = [](const HloInstruction* conditional_result, + int64 index) -> HloInstruction* { + for (HloInstruction* instr : conditional_result->users()) { + if (instr->opcode() != HloOpcode::kGetTupleElement) { + return nullptr; + } + if (instr->tuple_index() == index) { + return instr; + } + } + return nullptr; + }; + + // Captures tuple indices refering to converts to be rematerialized/hoisted. + absl::flat_hash_set kspecial_convert = FindSpecialConverts( + old_root, branch_count, conditional, is_layout_sensitive); + + // Exit if we cannot find any converts to be hoisted. 
+ if (kspecial_convert.empty()) { + return false; + } + + TF_RETURN_IF_ERROR( + RestructureConditionalInstruction(conditional->parent(), conditional)); + + for (int branch = 0; branch < branch_count; branch++) { + old_root = conditional->branch_computation(branch)->root_instruction(); + absl::flat_hash_map map_inst_to_tuple_index; + std::vector new_operands(old_root->operand_count()); + std::unordered_set to_hoist_set; + + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + map_inst_to_tuple_index[old_root->mutable_operand(operand_num)] = + operand_num; + } + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + HloInstruction* hoist = old_root->mutable_operand(operand_num); + if (!kspecial_convert.contains(operand_num)) { + new_operands[operand_num] = old_root->mutable_operand(operand_num); + continue; + } + + to_hoist_set.insert(hoist); + int64 new_tuple_count = old_root->operand_count(); + + // Replace the hoisted instr in the tuple with the operand/operands. + // We will replace at least one of the operands of the hoist at the + // tuple place; the rest will be added at the end. + bool inplace = true; + CHECK(!hoist->operands().empty()); + for (HloInstruction* prod : hoist->operands()) { + if (inplace) { + map_inst_to_tuple_index[prod] = map_inst_to_tuple_index[hoist]; + new_operands[map_inst_to_tuple_index[hoist]] = prod; + inplace = false; + } else { + map_inst_to_tuple_index[prod] = new_tuple_count++; + new_operands.push_back(prod); + } + } + } + + // Create the new root instruction. + HloComputation* cur_branch = conditional->branch_computation(branch); + HloInstruction* new_branch_root = + cur_branch->AddInstruction(HloInstruction::CreateTuple(new_operands)); + // The shape can vary since the operands to convert are now + // being returned through the branches' root. + cur_branch->set_root_instruction(new_branch_root, true /*new shape*/); + TF_CHECK_OK(cur_branch->RemoveInstruction(old_root)); + + // Only one of the branches needs to change the conditional->parent(). + if (branch != 0) { + continue; + } + HloComputation* conditional_parent = conditional->parent(); + HloInstruction* newconditional = + conditional_parent->AddInstruction(HloInstruction::CreateConditional( + cur_branch->root_instruction()->shape(), + conditional->mutable_operand(0), + absl::MakeSpan(conditional->branch_computations()), + absl::MakeSpan(conditional->operands()).subspan(1))); + // Ensure that all the users of conditional refer to the new one. + TF_RETURN_IF_ERROR( + conditional->ReplaceAllUsesWithDifferentShape(newconditional)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(conditional)); + conditional = newconditional; + // Add the hoisted instructions in the parent. + for (HloInstruction* hoist : to_hoist_set) { + VLOG(2) << "Hoisting instruction:" << hoist->ToString(); + int64 hoist_index = map_inst_to_tuple_index[hoist]; + // Find out the gte that captured the hoisted instr result. 
+ HloInstruction* gte_hoist = find_gte(conditional, hoist_index); + CHECK(gte_hoist != nullptr); + std::vector new_operands; + for (HloInstruction* op : hoist->operands()) { + HloInstruction* gte = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement( + op->shape(), conditional, map_inst_to_tuple_index[op])); + new_operands.push_back(gte); + } + HloInstruction* hoisted = conditional_parent->AddInstruction( + hoist->CloneWithNewOperands(hoist->shape(), new_operands)); + VLOG(2) << "Hoisted instruction in parent:" << hoisted->ToString(); + TF_RETURN_IF_ERROR(gte_hoist->ReplaceAllUsesWith(hoisted)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(gte_hoist)); + } + // No need to explicitly delete a hoisted instruction since if its dead + // then the subsequent DCE will remove it. + } + } + VLOG(2) << "AFTER :" << conditional->parent()->parent()->ToString(); + return true; +} + // Hoist identical ops out of the conditional. The definition of identical // are the shape of the operands are identical and their properties are // identical. Will start from the root instruction of each branch and get // the identical ops to hoist. StatusOr MergeIdenticalElements(HloInstruction* conditional, bool is_layout_sensitive) { + VLOG(1) << " visiting conditional:" << conditional->ToString(); int branch_count = conditional->branch_count(); if (branch_count <= 0) { return false; @@ -399,7 +616,7 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, } } - if (visitors[0].HoistInstructionSize() <= 1) { + if (visitors[0].HoistInstructionSize() < 1) { return false; } @@ -442,7 +659,6 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, RemoveInstructionFromComputation(visitors[i].instructions_to_hoist(), conditional->branch_computation(i))); } - return true; } @@ -451,26 +667,55 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, StatusOr ConditionalCodeMotion::Run(HloModule* module) { bool changed = false; - // Gather all the conditional ops in our module. We do this ahead of time so - // we don't have to worry about mutating the lists of computations or - // instructions as we iterate. 
- std::vector conditional_ops; - for (auto* comp : module->MakeComputationPostOrder()) { - for (auto* instr : comp->MakeInstructionPostOrder()) { - if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); + if (pursue_full_conditional_code_motion_) { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } } } + + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool result, + MergeIdenticalElements(conditional_op, is_layout_sensitive_)); + changed |= result; + } + + if (changed) { + HloPassPipeline subpipeline("after_conditional_code_motion"); + subpipeline.AddPass(); + subpipeline.AddPass(); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); + changed |= cleanup_changed; + } } - for (HloInstruction* conditional_op : conditional_ops) { - TF_ASSIGN_OR_RETURN(bool result, MergeIdenticalElements( - conditional_op, is_layout_sensitive_)); - changed |= result; + // handling convert rematerialization/hoisting + { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } + } + } + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool convert_result, + ConvertSpecialMove(conditional_op, is_layout_sensitive_)); + changed |= convert_result; + } } if (changed) { - HloPassPipeline subpipeline("after_conditional_code_motion"); + HloPassPipeline subpipeline( + "after_conditional_code_motion_after_convert_hoisting"); + subpipeline.AddPass(); subpipeline.AddPass(); subpipeline.AddPass(); TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h index 1197a8b3620..95f02833e15 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.h +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -23,7 +23,11 @@ limitations under the License. namespace xla { -// HLO pass that moves identical ops out of conditional. +// ConditionalCodeMotion specializes in hoisting/rematerializing +// unconditional converts in the default mode. +// When pursue_full_conditional_code_motion_ is set to true, the +// full HLO pass moves identical ops out of a conditional in addition to moving +// converts. // - The definition of identical are the shape of the operands are identical // and their properties are identical. // - Currently, only some types of instructions is supported. @@ -35,13 +39,18 @@ class ConditionalCodeMotion : public HloModulePass { public: // If is_layout_sensitive is true, then the hoist process preserves layout // during identical comparison. Otherwise, layout is ignored. 
- explicit ConditionalCodeMotion(bool is_layout_sensitive = true) - : is_layout_sensitive_(is_layout_sensitive) {} + explicit ConditionalCodeMotion( + bool is_layout_sensitive = true, + bool pursue_full_conditional_code_motion = false) + : is_layout_sensitive_(is_layout_sensitive), + pursue_full_conditional_code_motion_( + pursue_full_conditional_code_motion) {} absl::string_view name() const override { return "conditional-code-motion"; } StatusOr Run(HloModule* module) override; private: const bool is_layout_sensitive_; + const bool pursue_full_conditional_code_motion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index 4a52303a42a..38b2b515fa0 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -38,7 +38,86 @@ namespace { using ConditionalCodeMotionTest = HloTestBase; namespace op = xla::testing::opcode_matchers; -TEST_F(ConditionalCodeMotionTest, DoNotMoveConvertOut) { +TEST_F(ConditionalCodeMotionTest, MoveSubsetTupleOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.2894, %reshape.8493) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.3604, %add) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + get-first-index.2 = f32[2,512,364]{2,1,0} get-tuple-element(conditional), index=1 + ROOT result = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(get-first-index, get-first-index.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(), op::GetTupleElement()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOutConditionalRoot) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = 
f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + ROOT conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( HloModule RemoveDotOpOut @@ -65,12 +144,16 @@ ENTRY main { arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 - ROOT result = (bf16[2,512,364]{2,1,0}) tuple(get-first-index) + add.1 = bf16[2,512,364]{2,1,0} add(bf16[2,512,364]{2,1,0} get-first-index, bf16[2,512,364]{2,1,0} get-first-index) + ROOT result = (bf16[2,512,364]{2,1,0}) tuple(add.1) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; - ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Add(op::Convert(), op::Convert())))); } TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { @@ -123,7 +206,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -181,7 +264,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); @@ -245,7 +328,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -317,7 +400,7 @@ ENTRY main { )"; auto module = 
ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); } @@ -390,7 +473,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); From 0b0eef4031fa2674a2c5d32aa7570a82c3def6a8 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Thu, 18 Jun 2020 20:59:28 -0700 Subject: [PATCH 0592/1390] Move enabling xnnpack delegate to AllocateTensors to allow other delegates to be applied first. PiperOrigin-RevId: 317240424 Change-Id: I89b616f891f65f7cff6beedbf5c2a372f7456592 --- tensorflow/lite/interpreter.cc | 15 +++++++++++++-- tensorflow/lite/interpreter.h | 15 +++++++++++---- tensorflow/lite/interpreter_builder.cc | 17 ++++++----------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index cae2ca7dde0..b49aa5031bf 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -86,8 +86,9 @@ TfLiteQuantization GetQuantizationFromLegacy( } // namespace Interpreter::Interpreter(ErrorReporter* error_reporter) - : error_reporter_(error_reporter ? error_reporter - : DefaultErrorReporter()) { + : error_reporter_(error_reporter ? error_reporter : DefaultErrorReporter()), + lazy_delegate_provider_( + TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {})) { // TODO(b/128420794): Include the TFLite runtime version in the log. // Prod logging is useful for mobile platforms where scraping console logs is // critical for debugging. @@ -175,6 +176,16 @@ TfLiteStatus Interpreter::SetVariables(std::vector variables) { } TfLiteStatus Interpreter::AllocateTensors() { + // Apply the default delegate that TFLite will enable at this point to allow + // other user-level delegates to be applied first. + if (lazy_delegate_provider_) { + // The execution will fall back to default implementation if the XNNPACK + // delegate fails to be applied. Therefore, we ignore the return status + // here and let it fall through the rest of the code. + ModifyGraphWithDelegate(std::move(lazy_delegate_provider_)); + lazy_delegate_provider_.reset(); + } + return primary_subgraph().AllocateTensors(); } diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 59cab6add6d..41377c4ce1f 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -347,10 +347,12 @@ class Interpreter { /// WARNING: Experimental interface, subject to change TfLiteStatus ReleaseNonPersistentMemory(); - /// Update allocations for all tensors. This will redim dependent tensors - /// using the input tensor dimensionality as given. This is relatively - /// expensive. If you know that your sizes are not changing, you need not call - /// this. Returns status of success or failure. + // Update allocations for all tensors. This will redim dependent tensors + // using the input tensor dimensionality as given. This is relatively + // expensive. This *must be* called after the interpreter has been created + // and before running inference (and accessing tensor buffers), and *must be* + // called again if (and only if) an input tensor is resized. Returns status of + // success or failure. TfLiteStatus AllocateTensors(); /// Invoke the interpreter (run the whole graph in dependency order). 
@@ -594,6 +596,11 @@ class Interpreter { // A map of resources. Owned by interpreter and shared by multiple subgraphs. resource::ResourceMap resources_; + + // Indicating a delegate that the TFLite interpreter will apply by default. + // A nullptr value means there's no delegate to be applied by default or the + // delegate has been applied and doesn't need to be applied again. + TfLiteDelegatePtr lazy_delegate_provider_; }; } // namespace impl diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index d73b298e595..4b491d41881 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -545,17 +545,7 @@ TfLiteStatus InterpreterBuilder::ParseTensors( TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter, int num_threads) { - // First, apply XNNPACK delegate if applicable. - if (num_fp32_tensors_ > 0) { - // The execution will fall back to default implementation if the XNNPACK - // delegate fails to be applied. Therefore, we ignore the return status - // here and let it fall through the rest of the code. - if (auto xnnpack_delegate = MaybeCreateXNNPACKDelegate(num_threads)) { - interpreter->ModifyGraphWithDelegate(std::move(xnnpack_delegate)); - } - } - - // Secondly, apply Flex delegate if applicable. + // Apply Flex delegate if applicable. if (has_flex_op_) { if (auto flex_delegate = AcquireFlexDelegate()) { return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate)); @@ -672,6 +662,11 @@ TfLiteStatus InterpreterBuilder::operator()( modified_subgraph->SetVariables(std::move(variables)); } + if (num_fp32_tensors_ > 0) { + (*interpreter)->lazy_delegate_provider_ = + MaybeCreateXNNPACKDelegate(num_threads); + } + if (ApplyDelegates(interpreter->get(), num_threads) != kTfLiteOk) return cleanup_and_error(); From b5bb616121f1805c5ff5391daf00c86b6bcad1ae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 21:06:35 -0700 Subject: [PATCH 0593/1390] *** Reason for rollback *** CL316577397 prevents H2D prefetch overlapping for TF2 MLPerf Resnet model on V100x8 More details in b/159372996 With the rollback, the prefetching overlapping is back, and the training speed also recovered. *** Original change description *** Add DT_BOOL support to GPU variable ops This is a follow-on to PR #38848 & PR #39172 and resolves remaining ask in Issue #35994. The original PR tried to add many variable ops on the GPU including DT_BOOL. However, this caused testCondModifyBoolPred to fail and thus the DT_BOOL type was removed. The reason for the test failure is once DT_BOOL variables are supported on the GPU, we need to ensure the switch ops are also updated to not have ho... 
*** PiperOrigin-RevId: 317241338 Change-Id: Id7b7d79622e0537ccb677f081b487014ac4d2395 --- tensorflow/core/kernels/control_flow_ops.cc | 10 +++++----- tensorflow/core/kernels/variable_ops.cc | 3 ++- .../debug/lib/debug_graph_reconstruction_test.py | 6 +++--- tensorflow/python/ops/control_flow_ops_test.py | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 435de3c5954..c8e83b6f672 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -111,17 +111,15 @@ REGISTER_GPU_SWITCH(uint64); TF_CALL_variant(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_REF_SWITCH); -TF_CALL_bool(REGISTER_GPU_SWITCH); -TF_CALL_bool(REGISTER_GPU_REF_SWITCH); #undef REGISTER_CPU_SWITCH #undef REGISTER_CPU_REF_SWITCH #undef REGISTER_GPU_SWITCH #undef REGISTER_GPU_REF_SWITCH -// Special GPU kernels for int32, string & resource handles. Requiring all -// inputs and outputs to be in host memory. -// TODO(b/25387198): Also enable int32 in device memory. +// Special GPU kernels for int32 and string. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. #define REGISTER_GPU_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Switch") \ .Device(DEVICE_GPU) \ @@ -151,6 +149,8 @@ TF_CALL_bool(REGISTER_GPU_REF_SWITCH); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_REF_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(bool); +REGISTER_GPU_HOST_REF_KERNEL(bool); REGISTER_GPU_HOST_KERNEL(tstring); REGISTER_GPU_HOST_REF_KERNEL(tstring); REGISTER_GPU_HOST_KERNEL(ResourceHandle); diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index ccd33e8c75a..6f5e0b94eca 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -252,7 +252,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); TF_CALL_int64(REGISTER_GPU_KERNELS); TF_CALL_uint32(REGISTER_GPU_KERNELS); -TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py index b3baa6e7bc2..fb722efab4e 100644 --- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py +++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py @@ -73,9 +73,9 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase): for attr_key in new_node.attr: if attr_key == "parallel_iterations": new_node.attr[attr_key].i = 1 - elif new_node.op == "Switch" or new_node.op == "Identity": - # We don't check the inputs to Switch or Identity ops as their inputs - # may be Send/Recv nodes. + elif new_node.op == "Switch": + # We don't check the inputs to Switch ops as their inputs may be + # Send/Recv nodes. 
del new_node.input[:] return output_graph_def diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index 3ca9bda82f2..9254695d988 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -396,10 +396,10 @@ class CondTest(test_util.TensorFlowTestCase): fn2=lambda: math_ops.add(y, 23)) self.assertEquals(self.evaluate(z), 24) - @test_util.run_v1_only("Exercises Ref variables") + @test_util.run_deprecated_v1 def testCondModifyBoolPred(self): - # We want to use the GPU here because we want to ensure that we can update - # a boolean ref variable on the GPU. + # This test in particular used to fail only when running in GPU, hence + # use_gpu=True. with test_util.use_gpu(): bool_var = variable_scope.get_variable( "bool_var", dtype=dtypes.bool, initializer=True) From cfbdd27fe3f2b904609e1551490a01640ae4fcac Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 21:17:54 -0700 Subject: [PATCH 0594/1390] Update ops-related pbtxt files. PiperOrigin-RevId: 317242544 Change-Id: I36000fdb2d595b5006ea111105ece5ca6f537732 --- .../core/ops/compat/ops_history_v2/Acos.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Asin.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Atan.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Inv.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Neg.pbtxt | 29 +++++++++++++++++++ .../compat/ops_history_v2/Reciprocal.pbtxt | 29 +++++++++++++++++++ .../ops/compat/ops_history_v2/Round.pbtxt | 29 +++++++++++++++++++ .../ops/compat/ops_history_v2/Square.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Tan.pbtxt | 29 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 18 ++++++++++++ 10 files changed, 279 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt index 3ed45186f6e..417dbfc7e7d 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Acos" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt index 7df768f7c66..c799ff99169 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Asin" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt index 86f0628ab53..4a80c7a751e 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + 
name: "Atan" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt index ca208664617..0c191790030 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt @@ -168,3 +168,32 @@ op { } } } +op { + name: "Inv" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt index 77bb4a5872d..864d0257fe4 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Neg" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt index 5ea1abe4c9c..7e03554871a 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Reciprocal" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt index 4f59b21afd5..c5685dc6143 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Round" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt index 4d07faf4fd0..6af75b3ddc1 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Square" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: 
"T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt index 7dc7f84fd38..80e0b1e22c4 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Tan" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 1f1cf7444fb..dbd91c91b65 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -216,6 +216,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -2333,6 +2335,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -2646,6 +2650,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -19442,6 +19448,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -25498,6 +25506,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -35191,6 +35201,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -40686,6 +40698,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -48071,6 +48085,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -50832,6 +50848,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 From b7caba2c42285a8e1cb875bec1664d5b0e6c65e9 Mon Sep 17 00:00:00 2001 From: Ashwin Murthy Date: Thu, 18 Jun 2020 22:10:41 -0700 Subject: [PATCH 0595/1390] Update RNN conversion tflite g3doc This uses the content from the blog post/dogfood announcement email PiperOrigin-RevId: 317248288 Change-Id: I210c64bd54c70aa5b68742d59d6d36fa154e856c --- tensorflow/lite/g3doc/convert/rnn.md | 240 +++++++++++++++++++-------- 1 file changed, 167 insertions(+), 73 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/rnn.md b/tensorflow/lite/g3doc/convert/rnn.md index 52bc287c151..734992c0904 100644 --- a/tensorflow/lite/g3doc/convert/rnn.md +++ b/tensorflow/lite/g3doc/convert/rnn.md @@ -1,99 +1,193 @@ -# Convert RNN models +# TensorFlow RNN conversion to TensorFlow Lite -The TensorFlow Lite interpreter currently implements a subset of TensorFlow -operations, meaning some model architectures cannot immediately be converted due -to missing operations. +## Overview -Some RNN-based architectures are affected by this. 
The following document -outlines the current state of play and provides strategies for converting RNN -models. +TensorFlow Lite supports converting TensorFlow RNN models to TensorFlow Lite’s +fused LSTM operators. Fused operators exist to maximize the performance of their +underlying kernel implementations, as well as provide a higher level interface +to define complex transformations like quantizatization. -## Currently supported +Since there are many variants of RNN APIs in TensorFlow, our approach has been +two fold: -Currently, RNN models using -[`tf.compat.v1.nn.static_rnn`](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) -can be converted successfully as long as no `sequence_length` is specified. +1. Provide **native support for standard TensorFlow RNN APIs** like Keras LSTM. + This is the recommended option. +1. Provide an **interface** **into the conversion infrastructure for** + **user-defined** **RNN implementations** to plug in and get converted to + TensorFlow Lite. We provide a couple of out of box examples of such + conversion using lingvo’s + [LSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123) + and + [LayerNormalizedLSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L519) + RNN interfaces. -The following `tf.compat.v1.nn.rnn_cell` operations work with -`tf.compat.v1.nn.static_rnn`: +## Converter API -* [tf.compat.v1.nn.rnn_cell.LSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/LSTMCell) -* [tf.compat.v1.nn.rnn_cell.RNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/RNNCell) -* [tf.compat.v1.nn.rnn_cell.GRUCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/GRUCell) -* [tf.compat.v1.nn.rnn_cell.BasicLSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/BasicLSTMCell) -* [tf.compat.v1.nn.rnn_cell.BasicRNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/BasicRNNCell) +Currently this feature is available through the +[tf-nightly](https://pypi.org/project/tf-nightly/) pip or from head. This will +be available in the TensorFlow 2.3 release. -In addition, TensorFlow Lite provides some experimental drop-in replacements for -RNN operations that enable dynamic RNN architectures with TensorFlow Lite. +This conversion functionality is available when converting to TensorFlow Lite +via a SavedModel or from the Keras model directly. See example usages. -Drop-in replacements are available for the following: +### From saved model -* [tf.compat.v1.nn.dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) -* [tf.compat.v1.nn.bidirectional_dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) -* [tf.compat.v1.nn.rnn_cell.RNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/RNNCell) -* [tf.compat.v1.nn.rnn_cell.LSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/LSTMCell) +``` +# build a saved model. Here concrete_function is the exported function +# corresponding to the TensorFlow model containing one or more +# Keras LSTM layers. +saved_model, saved_model_dir = build_saved_model_lstm(...) +saved_model.save(saved_model_dir, save_format="tf", signatures=concrete_func) -## Not currently supported +# Convert the model. 
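+# (Added note: `from_saved_model` below takes the directory containing the
+# exported SavedModel; `convert()` returns the serialized model as flatbuffer
+# bytes, which can then be written out to a `.tflite` file.)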
+converter = TFLiteConverter.from_saved_model(saved_model_dir) +tflite_model = converter.convert() +``` -TensorFlow Lite does not currently support -[Control Flow](https://www.tensorflow.org/api_docs/cc/group/control-flow-ops) -operations. This means that, unless one of the conversion strategies discussed -in the next section are employed, models built with the following TensorFlow -functions will not convert successfully: +### From Keras model -* [tf.compat.v1.nn.static_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) - where a `sequence_length` is specified -* [tf.compat.v1.nn.dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) -* [tf.compat.v1.nn.bidirectional_dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) +``` +# build a Keras model +keras_model = build_keras_lstm(...) -Note: TensorFlow Lite plans to implement all required Control Flow operations by -the end of 2019. At this point, all RNN architectures will convert successfully. +# Convert the model. +converter = TFLiteConverter.from_keras_model(keras_model) +tflite_model = converter.convert() -## Conversion strategies +``` -To convert an RNN model that uses the functions specified above, you will have -to modify its architecture and retrain it. The following strategies can be used. +## Example -### 1. Refactoring +Keras LSTM to TensorFlow Lite +[Colab](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/experimental_new_converter/Keras_LSTM_fusion_Codelab.ipynb) +illustrates the end to end usage with the TensorFlow Lite interpreter. -The simplest approach, if possible, is to refactor the model architecture to use -[tf.compat.v1.nn.static_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) -without `sequence_length`. +## TensorFlow RNNs APIs supported -### 2. Drop-in replacements that use op hints and fused ops +### Keras LSTM conversion (recommended) -TensorFlow Lite provides the some experimental drop-in replacements for RNN -operations that enable dynamic RNN architectures with TensorFlow Lite. Using -[OpHints](https://www.tensorflow.org/lite/guide/ops_custom#converting_tensorflow_models_to_convert_graphs), -they run normally during training, but are substituted with special fused ops -when run by the Lite interpreter. +We support out-of-the-box conversion of Keras LSTM to TensorFlow Lite. For +details on how this works please refer to the +[Keras LSTM interface](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/experimental_new_converter/Keras_LSTM_fusion_Codelab.ipynb) +and to the conversion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L627). 
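+
+As a concrete illustration, the following minimal sketch builds a toy model
+with a single Keras LSTM layer and converts it. The layer size, input shape and
+fixed batch size are arbitrary placeholder values, not requirements of the
+converter:
+
+```
+import tensorflow as tf
+
+# A toy model: 10 timesteps of 8 features feeding a 16-unit LSTM layer.
+inputs = tf.keras.Input(shape=(10, 8), batch_size=1)
+outputs = tf.keras.layers.LSTM(units=16)(inputs)
+keras_model = tf.keras.Model(inputs, outputs)
+
+# Convert; the LSTM layer is expected to be lowered to TensorFlow Lite's
+# fused UnidirectionalSequenceLSTM operator.
+converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+tflite_model = converter.convert()
+```
+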
-The following drop-in replacements are available: +Also important is to highlight the TensorFlow Lite’s LSTM contract with respect +to the Keras operation definition: -* [tf.compat.v1.lite.experimental.nn.dynamic_rnn](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn.py#L41) - * replacement for tf.nn.dynamic_rnn -* [tf.compat.v1.lite.experimental.nn.bidirectional_dynamic_rnn](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn.py#L279) - * replacement for tf.nn.bidirectional_dynamic_rnn -* [tf.compat.v1.lite.experimental.nn.TfLiteRNNCell](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn_cell.py#L39) - * replacement for tf.nn.rnn_cell.RNNCell -* [tf.compat.v1.lite.experimental.nn.TfLiteLSTMCell](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn_cell.py#L159) - * replacement for tf.nn.rnn_cell.LSTMCell +1. The dimension 0 of the input tensor is the batch size. +1. The dimension 0 of the recurrent\_weight tensor is the number of outputs. +1. The **weight** and **recurrent\_kernel** tensors are transposed. +1. The transposed weight, transposed recurrent\_kernel and bias tensors are + split into 4 equal sized tensors along the dimension 0. These correspond to + **input gate, forget gate, cell, and output gate**. -Note: These replacements must be used together. For example, if you are using -`tf.compat.v1.lite.experimental.nn.dynamic_rnn`, you must combine it with -`tf.compat.v1.lite.experimental.nn.TfLiteRNNCell` instead of using -`tf.compat.v1.nn.rnn_cell.RNNCell`. +#### Keras LSTM Variants -Instead of -[tf.compat.v1.nn.rnn_cell.MultiRNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/MultiRNNCell), -you should use -[tf.compat.v1.keras.layers.StackedRNNCells](https://www.tensorflow.org/api_docs/python/tf/keras/layers/StackedRNNCells). +##### Time major -For a tutorial on using these replacements, see -[TensorFlow Lite LSTM ops API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/g3doc/README.md). +Users may choose time-major or no time-major. Keras LSTM adds a time-major +attribute in the function def attributes. For Unidirectional sequence LSTM, we +can simply map to unidirecional\_sequence\_lstm's +[time major attribute](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/ir/tfl_ops.td#L3508). -For a Colab demonstrating these classes, refer to -[TensorFlowLite_LSTM_Keras_Tutorial](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/TensorFlowLite_LSTM_Keras_Tutorial.ipynb). +##### BiDirectional LSTM -Note: There is no replacement available for -[tf.compat.v1.nn.rnn_cell.GRUCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/GRUCell). +Bidirectional LSTM can be implemented with two Keras LSTM layers, one for +forward and one for backward, see examples +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/layers/wrappers.py#L381). +Once we see the go\_backward attribute, we recognize it as backward LSTM, then +we group forward & backward LSTM together. **This is future work.** Currently, +this creates two UnidirectionalSequenceLSTM operators in the TensorFlow Lite +model. + +### User-defined LSTM conversion examples + +TensorFlow Lite also provides a way to convert user defined LSTM +implementations. 
Here we use Lingvo’s LSTM as an example of how that can be +implemented. For details please refer to the +[lingvo.LSTMCellSimple interface](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L230) +and the conversion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123). +We also provide an example for another of Lingvo’s LSTM definitions in +[lingvo.LayerNormalizedLSTMCellSimple interface](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L1179) +and its convertion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L130). + +## “Bring your own TensorFlow RNN” to TensorFlow Lite + +If a user's RNN interface is different from the standard supported ones, there +are a couple of options: + +**Option 1:** Write adapter code in TensorFlow python to adapt the RNN interface +to the Keras RNN interface. This means a tf.function with +[tf\_implements annotation](https://github.com/tensorflow/community/pull/113) on +the generated RNN interface’s function that is identical to the one generated by +the Keras LSTM layer. After this, the same conversion API used for Keras LSTM +will work. + +**Option 2:** If the above is not possible (e.g. the Keras LSTM is missing some +functionality that is currently exposed by TensorFlow Lite’s fused LSTM op like +layer normalization), then extend the TensorFlow Lite converter by writing +custom conversion code and plug it into the prepare-composite-functions +MLIR-pass +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L108). +The function’s interface should be treated like an API contract and should +contain the arguments needed to convert to fused TensorFlow Lite LSTM +operators - i.e. input, bias, weights, projection, layer normalization, etc. It +is preferable for the tensors passed as arguments to this function to have known +rank (i.e. RankedTensorType in MLIR). This makes it much easier to write +conversion code that can assume these tensors as RankedTensorType and helps +transform them to ranked tensors corresponding to the fused TensorFlow Lite +operator’s operands. + +A complete example of such conversion flow is Lingvo’s LSTMCellSimple to +TensorFlow Lite conversion. + +The LSTMCellSimple in Lingvo is defined +[here](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L230). +Models trained with this LSTM cell can be converted to TensorFlow Lite as +follows: + +1. Wrap all uses of LSTMCellSimple in a tf.function with a tf\_implements + annotation that is labelled as such (e.g. lingvo.LSTMCellSimple would be a + good annotation name here). Make sure the tf.function that is generated + matches the interface of the function expected in the conversion code. This + is a contract between the model author adding the annotation and the + conversion code. +1. Extend the prepare-composite-functions pass to plug in a custom composite op + to TensorFlow Lite fused LSTM op conversion. See + [LSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123) + conversion code. + + The conversion contract: + +1. **Weight** and **projection** tensors are transposed. + +1. 
The **{input, recurrent}** to **{cell, input gate, forget gate, output + gate}** are extracted by slicing the transposed weight tensor. + +1. The **{bias}** to **{cell, input gate, forget gate, output gate}** are + extracted by slicing the bias tensor. + +1. The **projection** is extracted by slicing the transposed projection tensor. + +1. Similar conversion is written for + [LayerNormalizedLSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L519). + +1. The rest of the TensorFlow Lite conversion infrastructure, including all the + [MLIR passes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc#L58) + defined as well as the final export to TensorFlow Lite flatbuffer can be + reused. + +## Known issues/limitations + +1. Currently there is support only for converting stateless Keras LSTM (default + behavior in Keras). Stateful Keras LSTM conversion is future work. +1. It is still possible to model a stateful Keras LSTM layer using the + underlying stateless Keras LSTM layer and managing the state explicitly in + the user program. Such a TensorFlow program can still be converted to + TensorFlow Lite using the feature being described here. +1. Bidirectional LSTM is currently modelled as two UnidirectionalSequenceLSTM + operators in TensorFlow Lite. This will be replaced with a single + BidirectionalSequenceLSTM op. From 158d4be42d7aea11a395d2f79483ac93289e1bb8 Mon Sep 17 00:00:00 2001 From: Xinyi Wang Date: Thu, 18 Jun 2020 22:15:44 -0700 Subject: [PATCH 0596/1390] Add get_next_as_optional method for a distributed iterator The function is called on a distributed iterator and returns an `Optional` that contains the next value, the PerReplica input, from Distributed iterator or no value if this `iterator` has reached the end of the sequence. PiperOrigin-RevId: 317248910 Change-Id: Ide217da1aff1d62f8d0d8f43423be2d859d933d3 --- .../custom_training_loop_input_test.py | 49 ++++++++++- .../python/distribute/distribute_lib.py | 4 + tensorflow/python/distribute/input_lib.py | 60 +++++++++++++ .../python/distribute/input_lib_test.py | 86 +++++++++++++------ ...low.distribute.-distributed-iterator.pbtxt | 4 + 5 files changed, 177 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/distribute/custom_training_loop_input_test.py b/tensorflow/python/distribute/custom_training_loop_input_test.py index e4f782810dd..5660b5839ce 100644 --- a/tensorflow/python/distribute/custom_training_loop_input_test.py +++ b/tensorflow/python/distribute/custom_training_loop_input_test.py @@ -30,6 +30,7 @@ from tensorflow.python.distribute import strategy_combinations from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -136,8 +137,52 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, @combinations.generate( combinations.combine( - distribution=strategy_combinations.tpu_strategies, - mode=["eager"])) + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def testGetNextAsOptional(self, distribution): + data = [5., 6., 7., 8.] 
+ dataset = get_dataset_from_tensor_slices(data).batch(2) + dist_dataset = distribution.experimental_distribute_dataset(dataset) + iterator = iter(dist_dataset) + + def train_step(data): + return math_ops.square(data) + + @def_function.function + def run(iterator): + return distribution.experimental_local_results( + distribution.run( + train_step, args=(iterator.get_next_as_optional().get_value(),))) + + self.assert_equal_flattened([[25., 36.]], [run(iterator)]) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def testGetNextAsOptionalExampleUsage(self, distribution): + global_batch_size = 2 + steps_per_loop = 6 + dataset = dataset_ops.Dataset.range( + 8, output_type=dtypes.int32).batch(global_batch_size) + distributed_iterator = iter( + distribution.experimental_distribute_dataset(dataset)) + + @def_function.function + def train_fn(distributed_iterator): + + def step_fn(x): + return x + + for _ in math_ops.range(steps_per_loop): + optional_data = distributed_iterator.get_next_as_optional() + if not optional_data.has_value(): + break + distribution.run(step_fn, args=(optional_data.get_value(),)) + + train_fn(distributed_iterator) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.tpu_strategies, mode=["eager"])) def testFullEagerTPU(self, distribution): dataset = get_dataset_from_tensor_slices([5., 6., 7., 8.]).batch(2) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index b6a89463426..ec0b911ebe0 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -200,6 +200,7 @@ import six from tensorflow.python.autograph.core import ag_ctx as autograph_ctx from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import iterator_ops from tensorflow.python.distribute import collective_util from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribution_strategy_context @@ -2879,6 +2880,9 @@ class _DefaultDistributionExtended(StrategyExtendedV1): def get_next(self): return self._iterator.get_next() + def get_next_as_optional(self): + return iterator_ops.get_next_as_optional(self._iterator) + @deprecated(None, "Use the iterator's `initializer` property instead.") def initialize(self): """Initialize underlying iterators. diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index ff468af7f87..e4a362a92c6 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -29,6 +29,7 @@ from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import distribute from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.data.ops import optional_ops from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribute_utils from tensorflow.python.distribute import distribution_strategy_context @@ -235,6 +236,40 @@ class DistributedIteratorInterface(collections.Iterator, raise NotImplementedError( "DistributedIterator.element_spec() must be implemented in descendants") + def get_next_as_optional(self): + """Returns a `tf.experimental.Optional` that contains the next value for all replicas. 
+ + If the `tf.distribute.DistributedIterator` has reached the end of the + sequence, the returned `tf.experimental.Optional` will have no value. + + Example usage: + + >>> strategy = tf.distribute.MirroredStrategy() + >>> global_batch_size = 2 + >>> steps_per_loop = 2 + >>> dataset = tf.data.Dataset.range(10).batch(global_batch_size) + >>> distributed_iterator = iter( + ... strategy.experimental_distribute_dataset(dataset)) + >>> def step_fn(x): + ... return x + >>> @tf.function + ... def train_fn(distributed_iterator): + ... for _ in tf.range(steps_per_loop): + ... optional_data = distributed_iterator.get_next_as_optional() + ... if not optional_data.has_value(): + ... break + ... tf.print(strategy.run(step_fn, args=(optional_data.get_value(),))) + >>> train_fn(distributed_iterator) + ... # ([0 1],) + ... # ([2 3],) + + Returns: + An `tf.experimental.Optional` object representing the next value from the + `tf.distribute.DistributedIterator` (if it has one) or no value. + """ + raise NotImplementedError( + "get_next_as_optional() not implemented in descendants") + @tf_export("distribute.DistributedDataset", v1=[]) class DistributedDatasetInterface(collections.Iterable, @@ -622,6 +657,31 @@ class DistributedIteratorBase(DistributedIteratorInterface): def __iter__(self): return self + def get_next_as_optional(self): + global_has_value, replicas = _get_next_as_optional(self, self._strategy) + + def return_none(): + return optional_ops.Optional.empty(self._element_spec) + + def return_value(replicas): + """Wraps the inputs for replicas in an `tf.experimental.Optional`.""" + results = [] + for i, worker in enumerate(self._input_workers.worker_devices): + with ops.device(worker): + devices = self._input_workers.compute_devices_for_worker(i) + for j, device in enumerate(devices): + with ops.device(device): + result = replicas[i][j] + results.append(result) + replicas = results + + return optional_ops.Optional.from_value( + distribute_utils.regroup(replicas)) + + return control_flow_ops.cond(global_has_value, + lambda: return_value(replicas), + lambda: return_none()) # pylint: disable=unnecessary-lambda + def get_next(self, name=None): """Returns the next input from the iterator for all replicas.""" if not self._enable_get_next_as_optional: diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index ff4436c4c8c..7f02d0121d0 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -185,38 +185,76 @@ class DistributedIteratorTestBase(test.TestCase): if not ops.executing_eagerly_outside_functions(): evaluate(control_flow_ops.group(iterator.initializer)) - for expected_value in expected_values: - next_element = iterator.get_next() - computed_value = evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) - self.assertEqual(len(expected_value), len(computed_value)) - for i in range(len(expected_value)): - self.assertAllEqual(expected_value[i], computed_value[i]) + def test_get_next(iterator): + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) - with self.assertRaises(errors.OutOfRangeError): - next_element = iterator.get_next() - evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) + self.assertEqual(len(expected_value), len(computed_value)) + for i in 
range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) - # After re-initializing the iterator, should be able to iterate again. - if not ops.executing_eagerly_outside_functions(): - evaluate(control_flow_ops.group(iterator.initializer)) + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) + + # After re-initializing the iterator, should be able to iterate again. + if not ops.executing_eagerly_outside_functions(): + evaluate(control_flow_ops.group(iterator.initializer)) + else: + if api_type == "wrap_into_iterator": + self.skipTest("unsupported test combination") + else: + iterator = iter(dataset) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) + self.assertEqual(len(expected_value), len(computed_value)) + for i in range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) + + def test_get_next_as_optional(iterator): + for expected_value in expected_values: + next_element = iterator.get_next_as_optional() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element.get_value()) + for r in range(len(devices)) + ]) + + self.assertEqual(len(expected_value), len(computed_value)) + for i in range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) + + next_element = iterator.get_next_as_optional() + self.assertFalse(self.evaluate(next_element.has_value())) + with self.assertRaises(errors.InvalidArgumentError): + evaluate([ + distribute_utils.select_replica(r, next_element.get_value()) + for r in range(len(devices)) + ]) + + test_get_next(iterator) + + # re-initializing the iterator + if not tf2.enabled(): + self.skipTest("Not testing get_next_as_optional in TF1") else: if api_type == "wrap_into_iterator": self.skipTest("unsupported test combination") else: iterator = iter(dataset) - for expected_value in expected_values: - next_element = iterator.get_next() - computed_value = evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) - self.assertEqual(len(expected_value), len(computed_value)) - for i in range(len(expected_value)): - self.assertAllEqual(expected_value[i], computed_value[i]) + test_get_next_as_optional(iterator) if iteration_type == "for_loop" and context.executing_eagerly(): actual_values = [] diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt index f712d9058b9..47899cc4188 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt @@ -13,4 +13,8 @@ tf_class { name: "get_next" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_next_as_optional" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } } From 539e9cb3a22793aad5d2df885e016f43b81a6a9f Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Thu, 18 Jun 2020 22:33:44 -0700 Subject: [PATCH 0597/1390] Update quantization docs to use TFLiteConverter.from_saved_model() API instead of .from_keras_model() API PiperOrigin-RevId: 317251205 Change-Id: Ia8166decfa76327e3fd44871b194ffcae0f049f8 
--- .../lite/g3doc/performance/post_training_quantization.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index ac584dd4c1c..dcf251e6d3d 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -34,7 +34,7 @@ weights from floating point to integer, which has 8-bits of precision:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 tflite_quant_model = converter.convert()
 
@@ -68,7 +68,7 @@ the following steps:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
@@ -96,7 +96,7 @@ the following steps:
 
 
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
@@ -120,7 +120,7 @@ quantization of weights, use the following steps:
 
 
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 converter.target_spec.supported_types = [tf.float16]
 tflite_quant_model = converter.convert()

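Taken together, the hunks above amount to the following minimal sketch of post-training integer quantization with the `from_saved_model()` API. The SavedModel path, input shape, and calibration-step count are illustrative assumptions, and random data stands in for a real representative dataset.

```python
import numpy as np
import tensorflow as tf

saved_model_dir = "/tmp/my_saved_model"   # assumed: an already-exported SavedModel
num_calibration_steps = 100               # assumed calibration budget

def representative_dataset_gen():
  for _ in range(num_calibration_steps):
    # Stand-in calibration samples; replace with data from the real input pipeline.
    yield [np.random.rand(1, 224, 224, 3).astype(np.float32)]

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
tflite_quant_model = converter.convert()

with open("model_quant.tflite", "wb") as f:
  f.write(tflite_quant_model)
```
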
From 8e654afea4adba36b94b0f7a3d33a23e788612e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Thu, 18 Jun 2020 23:35:25 -0700
Subject: [PATCH 0598/1390] tf.numpy: Improve ndarray.__getitem__ to match
 numpy semantics.

PiperOrigin-RevId: 317256717
Change-Id: Ie89b81689f96242e3e9b01568e13937b80aaffc7
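
As a reference for the semantics this change targets, here is a small NumPy-only sketch of the indexing patterns (boolean masks, advanced integer indices, ellipsis/newaxis, strided slices) that `ndarray.__getitem__` is being aligned with; the array contents are purely illustrative.

```python
import numpy as np

x = np.arange(24).reshape(2, 3, 4)

x[x > 10]                 # boolean mask        -> 1-D array of the matching elements
x[[0, 1], :, [1, 2]]      # advanced indexing   -> shape (2, 3)
x[0, ..., np.newaxis, 1]  # ellipsis + newaxis  -> shape (3, 1)
x[::-1, 1:, ::2]          # strided basic slice -> shape (2, 2, 2)
```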
---
 .../python/ops/numpy_ops/np_array_ops.py      | 261 ++++++++++++++++--
 tensorflow/python/ops/numpy_ops/np_arrays.py  | 137 ---------
 2 files changed, 245 insertions(+), 153 deletions(-)

diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py
index 906e53c556d..47236d45561 100644
--- a/tensorflow/python/ops/numpy_ops/np_array_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py
@@ -20,12 +20,15 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import numbers
+from typing import Sequence
 import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -164,9 +167,11 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):  #
 @np_utils.np_doc_only(np.array)
 def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-outer-name
   """Since Tensors are immutable, a copy is made only if val is placed on a
+
   different device than the current one. Even if `copy` is False, a new Tensor
   may need to be built to satisfy `dtype` and `ndim`. This is used only if `val`
-  is an ndarray or a Tensor."""  # pylint:disable=g-docstring-missing-newline
+  is an ndarray or a Tensor.
+  """  # pylint:disable=g-docstring-missing-newline
   if dtype:
     dtype = np_utils.result_type(dtype)
   if isinstance(val, np_arrays.ndarray):
@@ -215,6 +220,8 @@ def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-out
   result_t = np_utils.cond(
       np_utils.greater(ndmin, ndims), true_fn, lambda: result_t)
   return np_arrays.tensor_to_ndarray(result_t)
+
+
 # pylint: enable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args
 
 
@@ -1446,14 +1453,13 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
   # broadcast.
   arr_shape_original = array_ops.shape(arr)
   indices_shape_original = array_ops.shape(indices)
-  arr_shape = array_ops.tensor_scatter_update(
-      arr_shape_original, [[axis]], [1])
-  indices_shape = array_ops.tensor_scatter_update(
-      indices_shape_original, [[axis]], [1])
-  broadcasted_shape = array_ops.broadcast_dynamic_shape(
-      arr_shape, indices_shape)
-  arr_shape = array_ops.tensor_scatter_update(
-      broadcasted_shape, [[axis]], [arr_shape_original[axis]])
+  arr_shape = array_ops.tensor_scatter_update(arr_shape_original, [[axis]], [1])
+  indices_shape = array_ops.tensor_scatter_update(indices_shape_original,
+                                                  [[axis]], [1])
+  broadcasted_shape = array_ops.broadcast_dynamic_shape(arr_shape,
+                                                        indices_shape)
+  arr_shape = array_ops.tensor_scatter_update(broadcasted_shape, [[axis]],
+                                              [arr_shape_original[axis]])
   indices_shape = array_ops.tensor_scatter_update(
       broadcasted_shape, [[axis]], [indices_shape_original[axis]])
   arr = array_ops.broadcast_to(arr, arr_shape)
@@ -1468,10 +1474,10 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
   swapaxes_ = lambda t: swapaxes(np_utils.tensor_to_ndarray(t), axis, -1).data
 
   dont_move_axis_to_end = math_ops.equal(axis, rank - 1)
-  arr = np_utils.cond(
-      dont_move_axis_to_end, lambda: arr, lambda: swapaxes_(arr))
-  indices = np_utils.cond(
-      dont_move_axis_to_end, lambda: indices, lambda: swapaxes_(indices))
+  arr = np_utils.cond(dont_move_axis_to_end, lambda: arr,
+                      lambda: swapaxes_(arr))
+  indices = np_utils.cond(dont_move_axis_to_end, lambda: indices,
+                          lambda: swapaxes_(indices))
 
   arr_shape = array_ops.shape(arr)
   arr = array_ops.reshape(arr, [-1, arr_shape[-1]])
@@ -1481,8 +1487,231 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
 
   result = array_ops.gather(arr, indices, batch_dims=1)
   result = array_ops.reshape(result, indices_shape)
-  result = np_utils.cond(
-      dont_move_axis_to_end, lambda: result, lambda: swapaxes_(result))
+  result = np_utils.cond(dont_move_axis_to_end, lambda: result,
+                         lambda: swapaxes_(result))
   result.set_shape(possible_result_shape)
 
-  return  np_utils.tensor_to_ndarray(result)
+  return np_utils.tensor_to_ndarray(result)
+
+
+_SLICE_ERORR = (
+    'only integers, slices (`:`), ellipsis (`...`), '
+    'numpy.newaxis (`None`) and integer or boolean arrays are valid indices')
+
+
+def _as_index(idx, need_scalar=True):
+  """Helper function to parse idx as an index.
+
+  Args:
+    idx: index
+    need_scalar: If idx needs to be a scalar value.
+
+  Returns:
+    A pair, (indx, bool). First one is the parsed index and can be a tensor,
+    or scalar integer / Dimension. Second one is True if rank is known to be 0.
+
+  Raises:
+    IndexError: For incorrect indices.
+  """
+  if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)):
+    return idx, True
+  data = asarray(idx).data
+  if data.dtype == dtypes.bool:
+    if data.shape.ndims != 1:
+      # TODO(agarwal): handle higher rank boolean masks.
+      raise NotImplementedError('Need rank 1 for bool index %s' % idx)
+    data = array_ops.where_v2(data)
+    data = array_ops.reshape(data, [-1])
+  if need_scalar and data.shape.rank not in (None, 0):
+    raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  np_dtype = data.dtype.as_numpy_dtype
+  if not np.issubdtype(np_dtype, np.integer):
+    raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  if data.dtype not in (dtypes.int64, dtypes.int32):
+    # TF slicing can only handle int32/int64. So we need to cast.
+    promoted_dtype = np.promote_types(np.int32, np_dtype)
+    if promoted_dtype == np.int32:
+      data = math_ops.cast(data, dtypes.int32)
+    elif promoted_dtype == np.int64:
+      data = math_ops.cast(data, dtypes.int64)
+    else:
+      raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  return data, data.shape.rank == 0
+
+
+def _slice_helper(tensor, slice_spec):
+  """Helper function for __getitem__."""
+  begin, end, strides = [], [], []
+  new_axis_mask, shrink_axis_mask = 0, 0
+  begin_mask, end_mask = 0, 0
+  ellipsis_mask = 0
+  advanced_indices = []
+  shrink_indices = []
+  for index, s in enumerate(slice_spec):
+    if isinstance(s, slice):
+      if s.start is not None:
+        begin.append(_as_index(s.start)[0])
+      else:
+        begin.append(0)
+        begin_mask |= (1 << index)
+      if s.stop is not None:
+        end.append(_as_index(s.stop)[0])
+      else:
+        end.append(0)
+        end_mask |= (1 << index)
+      if s.step is not None:
+        strides.append(_as_index(s.step)[0])
+      else:
+        strides.append(1)
+    elif s is Ellipsis:
+      begin.append(0)
+      end.append(0)
+      strides.append(1)
+      ellipsis_mask |= (1 << index)
+    elif s is array_ops.newaxis:
+      begin.append(0)
+      end.append(0)
+      strides.append(1)
+      new_axis_mask |= (1 << index)
+    else:
+      s, is_scalar = _as_index(s, False)
+      if is_scalar:
+        begin.append(s)
+        end.append(s + 1)
+        strides.append(1)
+        shrink_axis_mask |= (1 << index)
+        shrink_indices.append(index)
+      else:
+        begin.append(0)
+        end.append(0)
+        strides.append(1)
+        begin_mask |= (1 << index)
+        end_mask |= (1 << index)
+        advanced_indices.append((index, s, ellipsis_mask != 0))
+
+  # stack possibly involves no tensors, so we must use op_scope correct graph.
+  with ops.name_scope(
+      None,
+      'strided_slice', [tensor] + begin + end + strides,
+      skip_on_eager=False) as name:
+    if begin:
+      packed_begin, packed_end, packed_strides = (array_ops.stack(begin),
+                                                  array_ops.stack(end),
+                                                  array_ops.stack(strides))
+      if (packed_begin.dtype == dtypes.int64 or
+          packed_end.dtype == dtypes.int64 or
+          packed_strides.dtype == dtypes.int64):
+        if packed_begin.dtype != dtypes.int64:
+          packed_begin = math_ops.cast(packed_begin, dtypes.int64)
+        if packed_end.dtype != dtypes.int64:
+          packed_end = math_ops.cast(packed_end, dtypes.int64)
+        if packed_strides.dtype != dtypes.int64:
+          packed_strides = math_ops.cast(packed_strides, dtypes.int64)
+    else:
+      var_empty = constant_op.constant([], dtype=dtypes.int32)
+      packed_begin = packed_end = packed_strides = var_empty
+    # TODO(agarwal): set_shape on tensor to set rank.
+    tensor = array_ops.strided_slice(
+        tensor,
+        packed_begin,
+        packed_end,
+        packed_strides,
+        begin_mask=begin_mask,
+        end_mask=end_mask,
+        shrink_axis_mask=shrink_axis_mask,
+        new_axis_mask=new_axis_mask,
+        ellipsis_mask=ellipsis_mask,
+        name=name)
+    if not advanced_indices:
+      return tensor
+    advanced_indices_map = {}
+    for index, data, had_ellipsis in advanced_indices:
+      if had_ellipsis:
+        num_shrink = len([x for x in shrink_indices if x > index])
+        dim = index - len(slice_spec) + num_shrink
+      else:
+        num_shrink = len([x for x in shrink_indices if x < index])
+        dim = index - num_shrink
+      advanced_indices_map[dim] = data
+    dims = sorted(advanced_indices_map.keys())
+    dims_contiguous = True
+    if len(dims) > 1:
+      if dims[0] < 0 and dims[-1] >= 0:  # not all same sign
+        dims_contiguous = False
+      else:
+        for i in range(len(dims) - 1):
+          if dims[i] + 1 != dims[i + 1]:
+            dims_contiguous = False
+            break
+    indices = [advanced_indices_map[x] for x in dims]
+    indices = [x.data for x in _promote_dtype(*indices)]
+    indices = np_utils.tf_broadcast(*indices)
+    stacked_indices = array_ops.stack(indices, axis=-1)
+    if not dims_contiguous:
+      tensor = moveaxis(tensor, dims, range(len(dims))).data
+      tensor_shape_prefix = array_ops.shape(
+          tensor, out_type=stacked_indices.dtype)[:len(dims)]
+      stacked_indices = array_ops.where_v2(
+          stacked_indices < 0, stacked_indices + tensor_shape_prefix,
+          stacked_indices)
+      return array_ops.gather_nd(tensor, stacked_indices)
+    # Note that gather_nd does not support gathering from inside the array.
+    # To avoid shuffling data back and forth, we transform the indices and
+    # do a gather instead.
+    rank = np_utils._maybe_static(array_ops.rank(tensor))  # pylint: disable=protected-access
+    dims = [(x + rank if x < 0 else x) for x in dims]
+    shape_tensor = array_ops.shape(tensor, out_type=stacked_indices.dtype)
+    dim_sizes = array_ops.gather(shape_tensor, dims)
+    if len(dims) == 1:
+      stacked_indices = indices[0]
+    stacked_indices = array_ops.where_v2(stacked_indices < 0,
+                                         stacked_indices + dim_sizes,
+                                         stacked_indices)
+    axis = dims[0]
+    if len(dims) > 1:
+      index_scaling = math_ops.cumprod(
+          dim_sizes, reverse=True, exclusive=True)
+      stacked_indices = math_ops.tensordot(
+          stacked_indices, index_scaling, axes=1)
+      flat_shape = array_ops.concat(
+          [shape_tensor[:axis], [-1], shape_tensor[axis + len(dims):]],
+          axis=0)
+      tensor = array_ops.reshape(tensor, flat_shape)
+
+    return array_ops.gather(tensor, stacked_indices, axis=axis)
+
+
+def _as_spec_tuple(slice_spec):
+  """Convert slice_spec to tuple."""
+  if isinstance(slice_spec,
+                Sequence) and not isinstance(slice_spec, np.ndarray):
+    is_index = True
+    for s in slice_spec:
+      if s is None or s is Ellipsis or isinstance(s, (Sequence, slice)):
+        is_index = False
+        break
+      elif isinstance(s, (np_arrays.ndarray, np.ndarray)) and s.ndim != 0:
+        is_index = False
+        break
+    if not is_index:
+      return tuple(slice_spec)
+  return (slice_spec,)
+
+
+def _getitem(self, slice_spec):
+  """Implementation of ndarray.__getitem__."""
+  if (isinstance(slice_spec, bool) or (isinstance(slice_spec, ops.Tensor) and
+                                       slice_spec.dtype == dtypes.bool) or
+      (isinstance(slice_spec, (np.ndarray, np_arrays.ndarray)) and
+       slice_spec.dtype == np.bool)):
+    return np_utils.tensor_to_ndarray(
+        array_ops.boolean_mask(tensor=self.data, mask=slice_spec))
+
+  if not isinstance(slice_spec, tuple):
+    slice_spec = _as_spec_tuple(slice_spec)
+
+  result_t = _slice_helper(self.data, slice_spec)
+  return np_utils.tensor_to_ndarray(result_t)
+
+
+setattr(np_arrays.ndarray, '__getitem__', _getitem)
diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py
index 8bec8a469a2..88bf4e7499a 100644
--- a/tensorflow/python/ops/numpy_ops/np_arrays.py
+++ b/tensorflow/python/ops/numpy_ops/np_arrays.py
@@ -20,138 +20,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numbers
 import numpy as np
 import six
 
 from tensorflow.python.framework import composite_tensor
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import type_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.numpy_ops import np_dtypes
-from tensorflow.python.util import nest
-
-
-_SLICE_TYPE_ERROR = (
-    'Only integers, slices (`:`), ellipsis (`...`), '
-    'tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid '
-    'indices')
-
-_SUPPORTED_SLICE_DTYPES = (dtypes.int32, dtypes.int32_ref, dtypes.int64,
-                           dtypes.int64_ref)
-
-
-def _check_index(idx):
-  """Check if a given value is a valid index into a tensor."""
-  if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)):
-    return
-
-  # Optimistic check. Assumptions:
-  # * any object with a dtype is supported
-  # * any object with a dtype has a sizeable shape attribute.
-  dtype = getattr(idx, 'dtype', None)
-  if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
-      idx.shape and len(idx.shape) == 1):
-    # TODO(slebedev): IndexError seems more appropriate here, but it
-    # will break `_slice_helper` contract.
-    raise TypeError(_SLICE_TYPE_ERROR + ', got {!r}'.format(idx))
-
-
-def _is_undefined_dimension(d):
-  return isinstance(d, tensor_shape.Dimension) and d.value is None
-
-
-def _slice_helper(tensor, slice_spec, var=None):
-  """Copied from array_ops._slice_helper, will be merged back later."""
-  if isinstance(slice_spec, bool) or \
-  (isinstance(slice_spec, ops.Tensor) and slice_spec.dtype == dtypes.bool) or \
-  (isinstance(slice_spec, np.ndarray) and slice_spec.dtype == bool):
-    return array_ops.boolean_mask(tensor=tensor, mask=slice_spec)
-
-  if not isinstance(slice_spec, (list, tuple)):
-    slice_spec = [slice_spec]
-
-  begin, end, strides = [], [], []
-  index = 0
-
-  new_axis_mask, shrink_axis_mask = 0, 0
-  begin_mask, end_mask = 0, 0
-  ellipsis_mask = 0
-  for s in slice_spec:
-    if isinstance(s, slice):
-      if s.start is not None and not _is_undefined_dimension(s.start):
-        _check_index(s.start)
-        begin.append(s.start)
-      else:
-        begin.append(0)
-        begin_mask |= (1 << index)
-      if s.stop is not None and not _is_undefined_dimension(s.stop):
-        _check_index(s.stop)
-        end.append(s.stop)
-      else:
-        end.append(0)
-        end_mask |= (1 << index)
-      if s.step is not None and not _is_undefined_dimension(s.step):
-        _check_index(s.step)
-        strides.append(s.step)
-      else:
-        strides.append(1)
-    elif s is Ellipsis:
-      begin.append(0)
-      end.append(0)
-      strides.append(1)
-      ellipsis_mask |= (1 << index)
-    elif s is array_ops.newaxis:
-      begin.append(0)
-      end.append(0)
-      strides.append(1)
-      new_axis_mask |= (1 << index)
-    else:
-      _check_index(s)
-      begin.append(s)
-      end.append(s + 1)
-      strides.append(1)
-      shrink_axis_mask |= (1 << index)
-    index += 1
-
-  # stack possibly involves no tensors, so we must use op_scope correct graph.
-  with ops.name_scope(
-      None,
-      'strided_slice', [tensor] + begin + end + strides,
-      skip_on_eager=False) as name:
-    if begin:
-      packed_begin, packed_end, packed_strides = (array_ops.stack(begin),
-                                                  array_ops.stack(end),
-                                                  array_ops.stack(strides))
-      if (packed_begin.dtype == dtypes.int64 or
-          packed_end.dtype == dtypes.int64 or
-          packed_strides.dtype == dtypes.int64):
-        if packed_begin.dtype != dtypes.int64:
-          packed_begin = math_ops.cast(packed_begin, dtypes.int64)
-        if packed_end.dtype != dtypes.int64:
-          packed_end = math_ops.cast(packed_end, dtypes.int64)
-        if packed_strides.dtype != dtypes.int64:
-          packed_strides = math_ops.cast(packed_strides, dtypes.int64)
-    else:
-      var_empty = constant_op.constant([], dtype=dtypes.int32)
-      packed_begin = packed_end = packed_strides = var_empty
-    return array_ops.strided_slice(
-        tensor,
-        packed_begin,
-        packed_end,
-        packed_strides,
-        begin_mask=begin_mask,
-        end_mask=end_mask,
-        shrink_axis_mask=shrink_axis_mask,
-        new_axis_mask=new_axis_mask,
-        ellipsis_mask=ellipsis_mask,
-        var=var,
-        name=name)
 
 
 def convert_to_tensor(value, dtype=None, dtype_hint=None):
@@ -361,22 +240,6 @@ class ndarray(composite_tensor.CompositeTensor):  # pylint: disable=invalid-name
   def __bool__(self):
     return self.__nonzero__()
 
-  def __getitem__(self, slice_spec):
-    # TODO(srbs): Need to support better indexing.
-    def _gettensor(x):
-      if isinstance(x, ndarray):
-        x = x.data
-      if isinstance(x, ops.Tensor) and x.dtype not in (
-          dtypes.int32, dtypes.int64):
-        # Currently _slice_helper will only work with int32/int64 tensors, but
-        # type inference by numpy can create {u,}int{8,16}, so just cast.
-        x = math_ops.cast(x, dtypes.int32)
-      return x
-    slice_spec = nest.map_structure(_gettensor, slice_spec)
-
-    result_t = _slice_helper(self.data, slice_spec)
-    return tensor_to_ndarray(result_t)
-
   def __iter__(self):
     if not isinstance(self.data, ops.EagerTensor):
       raise TypeError('Iteration over symbolic tensor is not allowed')

From e972c5572634efd188696038e9241b75cdcd69bc Mon Sep 17 00:00:00 2001
From: Gaurav Jain 
Date: Fri, 19 Jun 2020 00:07:20 -0700
Subject: [PATCH 0599/1390] Add uint32 & uint64 to TF_CALL_INTEGRAL_TYPES

Both uint32 & uint64 had been omitted from TF_CALL_INTEGRAL_TYPES due to
concerns about binary-size bloat. In practice, the size increase is only
around 2MB. Further, this fixes #39649, since we are no longer
inadvertently using the XLA_CPU device to perform tf.reduce_mean.
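
A minimal sketch of the user-visible effect, assuming a build that includes this change: with uint32 covered by the standard integral-type registrations, the reduction below runs on the regular CPU kernel instead of falling back to XLA_CPU.

```python
import tensorflow as tf

x = tf.constant([2, 4, 6], dtype=tf.uint32)
y = tf.reduce_mean(x)  # previously hit the XLA_CPU fallback described in #39649
print(y)               # tf.Tensor(4, shape=(), dtype=uint32)
```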

PiperOrigin-RevId: 317259372
Change-Id: Iacf75eaedce198fbef4bd9fd59b6fefa584cbf34
---
 tensorflow/core/framework/register_types.h    | 21 +++++---------
 tensorflow/core/framework/types.cc            |  5 ----
 tensorflow/core/kernels/BUILD                 |  2 ++
 tensorflow/core/kernels/concat_lib_cpu.cc     |  2 --
 tensorflow/core/kernels/concat_op.cc          |  2 --
 tensorflow/core/kernels/constant_op.cc        |  1 -
 tensorflow/core/kernels/control_flow_ops.cc   |  5 ----
 .../core/kernels/data/dataset_test_base.cc    |  2 --
 tensorflow/core/kernels/dense_update_ops.cc   |  1 -
 .../core/kernels/dynamic_partition_op.cc      |  2 --
 tensorflow/core/kernels/fill_functor.cc       |  5 +++-
 tensorflow/core/kernels/gather_op.cc          |  2 --
 tensorflow/core/kernels/identity_op.cc        |  1 -
 tensorflow/core/kernels/ragged_gather_op.cc   |  2 --
 .../kernels/ragged_tensor_from_variant_op.cc  |  2 --
 .../kernels/ragged_tensor_to_tensor_op.cc     |  2 --
 .../kernels/ragged_tensor_to_variant_op.cc    |  2 --
 .../core/kernels/resource_variable_ops.cc     |  1 -
 tensorflow/core/kernels/split_lib_cpu.cc      |  1 -
 tensorflow/core/kernels/split_op.cc           |  1 -
 tensorflow/core/kernels/strided_slice_op.cc   |  2 --
 .../core/kernels/strided_slice_op_impl.h      |  2 --
 tensorflow/core/kernels/topk_op.cc            |  2 --
 .../core/kernels/topk_op_gpu_uint32.cu.cc     | 28 +++++++++++++++++++
 .../core/kernels/topk_op_gpu_uint64.cu.cc     | 28 +++++++++++++++++++
 tensorflow/core/util/batch_util.cc            |  8 ------
 .../core/util/saved_tensor_slice_util.h       |  2 ++
 27 files changed, 71 insertions(+), 63 deletions(-)
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc

diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index bc3e5e1743b..0cf6536e8c2 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -153,16 +153,9 @@ limitations under the License.
 #endif  // defined(IS_MOBILE_PLATFORM)  - end of TF_CALL_type defines
 
 // Defines for sets of types.
-
-// TODO(b/111604096): Add uint32 and uint64 to TF_CALL_INTEGRAL_TYPES.
-//
-// The uint32 and uint64 types were introduced in 10/2017 to be used via XLA and
-// thus were not included in TF_CALL_INTEGRAL_TYPES. Including them in
-// TF_CALL_INTEGRAL_TYPES should only happen after evaluating the effect on the
-// TF binary size and performance.
-#define TF_CALL_INTEGRAL_TYPES(m)                                      \
-  TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \
-      TF_CALL_uint8(m) TF_CALL_int8(m)
+#define TF_CALL_INTEGRAL_TYPES(m)                                       \
+  TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_int32(m) \
+      TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_FLOAT_TYPES(m) \
   TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
@@ -174,10 +167,10 @@ limitations under the License.
 #define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \
   TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
 
-#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                              \
-  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)   \
-      TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \
-          TF_CALL_int8(m)
+#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                                \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)     \
+      TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_uint16(m) \
+          TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m)
 
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index 97eaec98ffe..d6455e012d0 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -238,11 +238,6 @@ int DataTypeSize(DataType dt) {
     TF_CALL_qint16(CASE);
     TF_CALL_quint16(CASE);
 
-    // uint32 and uint64 aren't included in TF_CALL_POD_TYPES because we
-    // don't want to define kernels for them at this stage to avoid binary
-    // bloat.
-    TF_CALL_uint32(CASE);
-    TF_CALL_uint64(CASE);
     default:
       return 0;
   }
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 279dff92c58..97f974c6af4 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4900,7 +4900,9 @@ tf_kernel_library(
         "topk_op_gpu_double.cu.cc",
         "topk_op_gpu_float.cu.cc",
         "topk_op_gpu_half.cu.cc",
+        "topk_op_gpu_uint64.cu.cc",
         "topk_op_gpu_int64.cu.cc",
+        "topk_op_gpu_uint32.cu.cc",
         "topk_op_gpu_int32.cu.cc",
         "topk_op_gpu_int16.cu.cc",
         "topk_op_gpu_uint16.cu.cc",
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index da73d3d2c56..1dec589d3ff 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -116,8 +116,6 @@ REGISTER(qint8)
 REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
-REGISTER(uint32)
-REGISTER(uint64)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
     !defined(__ANDROID_TYPES_FULL__)
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index be3e9a67c5f..d3f3a04f33b 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -208,8 +208,6 @@ REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(quint16);
 REGISTER_CONCAT(qint16);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(uint32);
-REGISTER_CONCAT(uint64);
 
 #undef REGISTER_CONCAT
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 4bcbc076446..dc178d17d49 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -211,7 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
 // the conversion from uint8 to quint8.
 REGISTER_KERNEL(CPU, quint8);
 REGISTER_KERNEL(CPU, quint16);
-REGISTER_KERNEL(CPU, uint32);
 #undef REGISTER_CPU_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index c8e83b6f672..accb2c59540 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -101,16 +101,12 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH);
-REGISTER_CPU_SWITCH(uint64);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH);
-REGISTER_GPU_SWITCH(uint64);
 TF_CALL_variant(REGISTER_GPU_SWITCH);
-TF_CALL_uint32(REGISTER_GPU_SWITCH);
-TF_CALL_uint32(REGISTER_GPU_REF_SWITCH);
 
 #undef REGISTER_CPU_SWITCH
 #undef REGISTER_CPU_REF_SWITCH
@@ -311,7 +307,6 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
-REGISTER_GPU_KERNEL(uint64);
 TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
index b91ab9b733c..e41e35be1e9 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.cc
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -220,8 +220,6 @@ Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
     break;
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_tstring(CASE);
-    TF_CALL_uint32(CASE);
-    TF_CALL_uint64(CASE);
     // TODO(feihugis): figure out how to support variant tensors.
 #undef CASE
     default:
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 55e4cd7606a..71235fca143 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -98,7 +98,6 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 // uint32 not included in ALL_TYPES
-TF_CALL_uint32(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 // quint16 not included in QUANTIZIED_TYPES
 TF_CALL_quint16(REGISTER_KERNELS);
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 90ed71dccce..95af19c4c48 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -164,8 +164,6 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       DynamicPartitionOp)
 
 TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION);
-// For partitioning fingerprints.
-TF_CALL_uint64(REGISTER_DYNAMIC_PARTITION);
 #undef REGISTER_DYNAMIC_PARTITION
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 10dd3df1915..174a4e45a79 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -45,6 +45,8 @@ DEFINE_SETZERO_CPU(Eigen::half);
 DEFINE_SETZERO_CPU(bfloat16);
 DEFINE_SETZERO_CPU(float);
 DEFINE_SETZERO_CPU(double);
+DEFINE_SETZERO_CPU(uint32);
+DEFINE_SETZERO_CPU(uint64);
 DEFINE_SETZERO_CPU(uint8);
 DEFINE_SETZERO_CPU(int8);
 DEFINE_SETZERO_CPU(uint16);
@@ -96,6 +98,8 @@ DEFINE_SETONE_CPU(Eigen::half);
 DEFINE_SETONE_CPU(bfloat16);
 DEFINE_SETONE_CPU(float);
 DEFINE_SETONE_CPU(double);
+DEFINE_SETONE_CPU(uint32);
+DEFINE_SETONE_CPU(uint64);
 DEFINE_SETONE_CPU(uint8);
 DEFINE_SETONE_CPU(int8);
 DEFINE_SETONE_CPU(uint16);
@@ -137,7 +141,6 @@ struct FillFunctor {
 TF_CALL_ALL_TYPES(DEFINE_FILL_CPU);
 DEFINE_FILL_CPU(quint8);
 DEFINE_FILL_CPU(quint16);
-DEFINE_FILL_CPU(uint32);
 #undef DEFINE_FILL_CPU
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 6d493a5f2ea..948567e019a 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -211,8 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
-TF_CALL_uint32(REGISTER_GATHER_CPU);
-TF_CALL_uint64(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index fd94df9a768..daa8a1ddb25 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -122,7 +122,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(Variant);
-TF_CALL_uint32(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
index 88c0d1ebd69..3bf82cba050 100644
--- a/tensorflow/core/kernels/ragged_gather_op.cc
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -296,8 +296,6 @@ TF_CALL_tstring(REGISTER_CPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
 TF_CALL_quint16(REGISTER_CPU_KERNEL);
 TF_CALL_qint16(REGISTER_CPU_KERNEL);
-TF_CALL_uint32(REGISTER_CPU_KERNEL);
-TF_CALL_uint64(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
 #undef REGISTER_CPU_KERNEL_WITH_INDEX_TYPE
 
diff --git a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
index f83bcb38c6c..ad0712e6fd0 100644
--- a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
@@ -308,8 +308,6 @@ TF_CALL_tstring(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 TF_CALL_quint16(REGISTER_KERNELS);
 TF_CALL_qint16(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
-TF_CALL_uint64(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 #undef REGISTER_KERNELS_WITH_SPLIT_TYPE
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
index d729c43f25a..9ae5d7ffbdc 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
@@ -561,8 +561,6 @@ TF_CALL_string(REGISTER_CPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
 TF_CALL_quint16(REGISTER_CPU_KERNEL);
 TF_CALL_qint16(REGISTER_CPU_KERNEL);
-TF_CALL_uint32(REGISTER_CPU_KERNEL);
-TF_CALL_uint64(REGISTER_CPU_KERNEL);
 
 #undef REGISTER_CPU_KERNEL
 
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
index 7a5ae1c6240..64c372b005e 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
@@ -213,8 +213,6 @@ TF_CALL_tstring(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 TF_CALL_quint16(REGISTER_KERNELS);
 TF_CALL_qint16(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
-TF_CALL_uint64(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 #undef REGISTER_KERNELS_WITH_SPLIT_TYPE
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 0fc1d53749f..79a64cb9219 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -512,7 +512,6 @@ class AssignVariableOp : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 0cb0a94d498..a3060e4e90d 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -43,7 +43,6 @@ void Split::operator()(
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
 DEFINE_CPU_KERNELS(quint8)
-DEFINE_CPU_KERNELS(uint64)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index f09740c6198..08575f01f67 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -404,7 +404,6 @@ class SplitOpSYCL : public SplitOpBase {
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
 REGISTER_SPLIT(quint8);
-REGISTER_SPLIT(uint64);
 
 #undef REGISTER_SPLIT
 
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index ccc1984bb98..b4099213303 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -440,8 +440,6 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
-TF_CALL_uint32(REGISTER_STRIDED_SLICE);
-TF_CALL_uint64(REGISTER_STRIDED_SLICE);
 
 #undef REGISTER_STRIDED_SLICE
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 1ae959b7b3f..5ce1d773e33 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -287,8 +287,6 @@ TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N_GPU);
 #endif  // END GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
-TF_CALL_uint32(DECLARE_FOR_N_CPU);
-TF_CALL_uint64(DECLARE_FOR_N_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index c555b42f005..50325b7bcfe 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -258,7 +258,6 @@ namespace functor {
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 TF_CALL_INTEGRAL_TYPES(DECLARE_GPU_SPEC);
-TF_CALL_uint32(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
@@ -276,7 +275,6 @@ TF_CALL_uint32(DECLARE_GPU_SPEC);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS);
 TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS)
 #undef REGISTER_KERNELS
 
 #endif  // end GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
new file mode 100644
index 00000000000..16e2e0e9420
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GpuDevice, uint32>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc
new file mode 100644
index 00000000000..895247a63a2
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GpuDevice, uint64>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/util/batch_util.cc b/tensorflow/core/util/batch_util.cc
index b88c365ced0..e03188b04da 100644
--- a/tensorflow/core/util/batch_util.cc
+++ b/tensorflow/core/util/batch_util.cc
@@ -182,8 +182,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
   switch (element.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
@@ -207,8 +205,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   switch (parent.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
@@ -280,8 +276,6 @@ Status CopyContiguousSlices(const Tensor& src, int64 src_offset,
   switch (src.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyContiguousSlices unhandled data type: ",
@@ -308,8 +302,6 @@ Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) {
   switch (parent->dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented(
diff --git a/tensorflow/core/util/saved_tensor_slice_util.h b/tensorflow/core/util/saved_tensor_slice_util.h
index 09b9235b711..1f9768f5163 100644
--- a/tensorflow/core/util/saved_tensor_slice_util.h
+++ b/tensorflow/core/util/saved_tensor_slice_util.h
@@ -116,7 +116,9 @@ TENSOR_PROTO_EXTRACT_TYPE(double, double, double);
 TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex64, scomplex, float);
 TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex128, dcomplex, double);
 TENSOR_PROTO_EXTRACT_TYPE(int32, int, int32);
+TENSOR_PROTO_EXTRACT_TYPE(uint32, uint32, uint32);
 TENSOR_PROTO_EXTRACT_TYPE(int64, int64, protobuf_int64);
+TENSOR_PROTO_EXTRACT_TYPE(uint64, uint64, protobuf_uint64);
 TENSOR_PROTO_EXTRACT_TYPE(uint16, int, int32);
 TENSOR_PROTO_EXTRACT_TYPE(uint8, int, int32);
 TENSOR_PROTO_EXTRACT_TYPE(int8, int, int32);

From 9f20b156bc7862fb621756fd5d6744255b1f3735 Mon Sep 17 00:00:00 2001
From: George Karpenkov 
Date: Fri, 19 Jun 2020 01:13:01 -0700
Subject: [PATCH 0600/1390] [XLA:GPU] [NFC] Clarify the precondition for the
 fast reduction emitter

PiperOrigin-RevId: 317266013
Change-Id: I384acac279f0db53f195d5b43318c38c87a1739c
---
 tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index b97aa3651c6..01bcf456f75 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -226,6 +226,11 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) {
       dims_to_keep.push_back(dim);
     }
   }
+
+  // We support fast codegen for three cases:
+  // 1) Row reduction: (K, R)
+  // 2) Column reduction: (K, R, K)
+  // 3) "Batched" row reduction: (R, K, R)
   if (!LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
                                             dims_to_keep) &&
       !LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),

From 051d1b70f5f0636316e2651630f0ade554f192c0 Mon Sep 17 00:00:00 2001
From: Stefano Galarraga 
Date: Fri, 19 Jun 2020 01:21:24 -0700
Subject: [PATCH 0601/1390] Fix NNAPI delegation error on models with MAX/MIN
 operations with scalar quantized operators

PiperOrigin-RevId: 317266736
Change-Id: Ieed8a77685d4ca0d51389b5976addf0de167cfcf
---
 tensorflow/lite/delegates/nnapi/nnapi_delegate.cc |  2 ++
 tensorflow/lite/kernels/maximum_minimum_test.cc   | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index a3a3f9fda4d..1c35ee370c2 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -160,6 +160,8 @@ bool IsScalarInputSupported(int builtin_code) {
     case kTfLiteBuiltinLess:
     case kTfLiteBuiltinLessEqual:
     case kTfLiteBuiltinPow:
+    case kTfLiteBuiltinMaximum:
+    case kTfLiteBuiltinMinimum:
       return true;
     default:
       return false;
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index 2c036e369bd..803fe91c460 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -190,6 +190,17 @@ TEST(MaximumOpTest, Int32WithBroadcastTest_ScalarY) {
                      data1, data2, {1, 0, -1, -2, 2, 2}, /*is_constant=*/true);
 }
 
+TEST(MaximumOpTest, Int8WithBroadcastTest_ScalarY) {
+  std::initializer_list<int8_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int8_t> data2 = {2};
+  TestModel<int8_t>(BuiltinOperator_MAXIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {}}, {TensorType_INT8, {3, 1, 2}}, data1,
+                    data2, {2, 2, 2, 2, 3, 11}, /*is_constant=*/true);
+  TestModel<int8_t>(BuiltinOperator_MINIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {}}, {TensorType_INT8, {3, 1, 2}}, data1,
+                    data2, {1, 0, -1, -2, 2, 2}, /*is_constant=*/true);
+}
+
 TEST(MaxMinOpTest, Int8Test8D) {
   std::initializer_list<int8_t> data1 = {1, 0, 2, 11, 2, 23};
   std::initializer_list<int8_t> data2 = {0, 0, 1, 12, 123, 1};

From e51b17f4582183a216d3a47450117c5e8cdd387d Mon Sep 17 00:00:00 2001
From: Adrian Kuegel 
Date: Fri, 19 Jun 2020 01:49:14 -0700
Subject: [PATCH 0602/1390] Add a small test to cover the mlir generated Tanh
 GPU kernel.

This test is a first step towards being able to ensure that we don't
accidentally break the kernel generation.

PiperOrigin-RevId: 317269120
Change-Id: Iad6bdd7ab7e9fb819a478c947ba6294a191f1099
---
 tensorflow/core/kernels/BUILD                 | 19 +++++
 .../mlir_generated_op_gpu_tanh_test.cc        | 85 +++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 97f974c6af4..0b7a092033b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4168,6 +4168,25 @@ tf_kernel_library(
     ]),
 )
 
+tf_cuda_cc_test(
+    name = "mlir_generated_op_gpu_tanh_test",
+    size = "small",
+    srcs = if_mlir_generated_gpu_kernels_enabled(["mlir_generated_op_gpu_tanh_test.cc"]),
+    tags = tf_cuda_tests_tags() + ["no_rocm"],
+    deps = [
+        ":cwise_op",
+        ":ops_testutil",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/common_runtime:device",
+        "//tensorflow/core/common_runtime:device_factory",
+    ],
+)
+
 tf_kernel_library(
     name = "nextafter_op",
     prefix = "nextafter_op",
diff --git a/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc b/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc
new file mode 100644
index 00000000000..39c1d709b1e
--- /dev/null
+++ b/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc
@@ -0,0 +1,85 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class MlirGeneratedOpGpuTanhTest : public OpsTestBase {
+ protected:
+  void SetUp() override {
+    std::unique_ptr<tensorflow::Device> device_gpu(
+        tensorflow::DeviceFactory::NewDevice("GPU", {},
+                                             "/job:a/replica:0/task:0"));
+    SetDevice(tensorflow::DEVICE_GPU, std::move(device_gpu));
+  }
+  template <typename T, typename RT = T>
+  void RunTanhOp(std::initializer_list<T> input) {
+    TensorShape shape({2, 7});
+    TF_ASSERT_OK(NodeDefBuilder("tanh_op", "Tanh")
+                     .Input(FakeInput(DataTypeToEnum<T>::v()))
+                     .Attr("T", DataTypeToEnum<T>::v())
+                     .Finalize(node_def()));
+
+    TF_ASSERT_OK(InitOp());
+    AddInputFromArray<T>(shape, input);
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected_tensor(allocator(), DataTypeToEnum<T>::value, shape);
+    std::vector<T> expected;
+    expected.reserve(input.size());
+    for (const T& inp : input) {
+      expected.push_back(static_cast<T>(std::tanh(static_cast<RT>(inp))));
+    }
+    test::FillValues<T>(&expected_tensor, expected);
+    test::ExpectClose(expected_tensor, *GetOutput(0));
+  }
+};
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhFloat) {
+  RunTanhOp<float>({-18.0f, -9.0f, -1e-6f, -0.0f, 0.0f, 1e-6, 0.1f, 0.2f, 0.3f,
+                    0.5f, 0.7f, 0.9f, 9.0f, 18.0f});
+}
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhDouble) {
+  RunTanhOp<double>({-18.0, -9.0, -1e-6, -0.0, 0.0, 1e-6, 0.1, 0.2, 0.3, 0.5,
+                     0.7, 0.9, 9.0, 18.0});
+}
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhHalf) {
+  RunTanhOp<Eigen::half, float>(
+      {static_cast<Eigen::half>(-18.0), static_cast<Eigen::half>(-9.0),
+       static_cast<Eigen::half>(-1e-6), static_cast<Eigen::half>(-0.0),
+       static_cast<Eigen::half>(0.0), static_cast<Eigen::half>(1e-6),
+       static_cast<Eigen::half>(0.1), static_cast<Eigen::half>(0.2),
+       static_cast<Eigen::half>(0.3), static_cast<Eigen::half>(0.5),
+       static_cast<Eigen::half>(0.7), static_cast<Eigen::half>(0.9),
+       static_cast<Eigen::half>(9.0), static_cast<Eigen::half>(18.0)});
+}
+
+}  // namespace
+}  // end namespace tensorflow

From 772433a2a2120d0aefc6c3628c6254d5a1aaf19d Mon Sep 17 00:00:00 2001
From: YoungSeok Yoon 
Date: Fri, 19 Jun 2020 01:58:58 -0700
Subject: [PATCH 0603/1390] Add flag for using optimized TFLite CPU kernels on
 iOS

This adds new experimental flags to the interpreter options of the TFLite Obj-C
and Swift APIs, which can be used for opting in to a set of highly optimized
floating point kernels provided via the XNNPACK delegate. The flags can be used
as follows.

Obj-C:

    TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
    options.useXNNPACK = YES;
    NSError *error;
    TFLInterpreter *interpreter =
        [[TFLInterpreter alloc] initWithModelPath:@"model/path"
                                          options:options
                                            error:&error];

Swift:

    var options = InterpreterOptions()
    options.isXNNPackEnabled = true
    var interpreter = try Interpreter(modelPath: "model/path", options: options)

PiperOrigin-RevId: 317270012
Change-Id: I82aae43c3de13ab08af3c70513e2a458e807b0f1
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  4 ++
 tensorflow/lite/experimental/ios/BUILD.apple  | 18 +++++
 tensorflow/lite/experimental/objc/BUILD.apple |  1 +
 .../objc/TensorFlowLiteObjC-nightly.podspec   |  1 +
 .../objc/TensorFlowLiteObjC.podspec           |  1 +
 .../objc/TensorFlowLiteObjC.podspec.template  |  1 +
 .../objc/apis/TFLInterpreterOptions.h         | 21 ++++++
 .../objc/sources/TFLInterpreter.mm            | 15 +++++
 .../objc/tests/TFLInterpreterOptionsTests.m   |  9 +++
 .../swift/Sources/Interpreter.swift           | 67 ++++++++++++++++---
 .../swift/Tests/InterpreterTests.swift        | 62 +++++++++++------
 11 files changed, 171 insertions(+), 29 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 97e6aea2a6b..eaf7d8f6f03 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -14,6 +14,10 @@ EMSCRIPTEN_LINKOPTS = [
     "-s TOTAL_MEMORY=134217728",
 ]
 
+exports_files([
+    "xnnpack_delegate.h",
+])
+
 cc_library(
     name = "xnnpack_delegate",
     srcs = ["xnnpack_delegate.cc"],
diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple
index 1a85b604f9b..7a40ca7b8e7 100644
--- a/tensorflow/lite/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/experimental/ios/BUILD.apple
@@ -18,10 +18,26 @@ sh_binary(
     ],
 )
 
+# When the static framework is built with bazel, all header files are moved
+# to the "Headers" directory with no header path prefixes. This auxiliary rule
+# is used for stripping the path prefix to the "common.h" file included by the
+# "xnnpack_delegate.h" header.
+genrule(
+    name = "strip_xnnpack_include_hdr",
+    srcs = ["//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h"],
+    outs = ["xnnpack_delegate.h"],
+    cmd = """
+    sed 's|#include ".*common.h"|#include "common.h"|'\
+    "$(location //tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h)"\
+    > "$@"
+    """,
+)
+
 # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework
 tflite_ios_static_framework(
     name = "TensorFlowLiteC_framework",
     hdrs = [
+        ":xnnpack_delegate.h",
         "//tensorflow/lite/c:c_api.h",
         "//tensorflow/lite/c:common.h",
     ],
@@ -105,6 +121,7 @@ cc_library(
     hdrs = [
         "//tensorflow/lite/c:c_api.h",
         "//tensorflow/lite/c:common.h",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h",
     ],
     tags = [
         "nobuilder",
@@ -112,6 +129,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/c:c_api",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
     ],
 )
 
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
index 09d4547813a..d26d90c46a1 100644
--- a/tensorflow/lite/experimental/objc/BUILD.apple
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -64,6 +64,7 @@ objc_library(
     visibility = ios_visibility_whitelist(),
     deps = [
         "//tensorflow/lite/c:c_api",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
index e039fb57114..eed0f087f44 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', "~> #{s.version}"
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
index c673cfad759..5817619a58f 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', "#{s.version}"
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
index fc9e10e4a2c..4ab5753e016 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly'
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
index 6461fbf0178..d7dbb2bd970 100644
--- a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
@@ -25,6 +25,27 @@ NS_ASSUME_NONNULL_BEGIN
  */
 @property(nonatomic) NSUInteger numberOfThreads;
 
+/**
+ * Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK).
+ *
+ * Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided via the
+ * XNNPACK delegate. Currently, this is restricted to a subset of floating point operations.
+ * Eventually, we plan to enable this by default, as it can provide significant performance benefits
+ * for many classes of floating point models. See
+ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+ * for more details.
+ *
+ * Things to keep in mind when enabling this flag:
+ *
+ *     * Startup time and resize time may increase.
+ *     * Baseline memory consumption may increase.
+ *     * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+ *     * Quantized models will not see any benefit.
+ *
+ * WARNING: This is an experimental interface that is subject to change.
+ */
+@property(nonatomic) BOOL useXNNPACK;
+
 /**
  * Initializes a new instance of `TFLInterpreterOptions`.
  *
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
index 94031ee5428..34dd119885d 100644
--- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -23,6 +23,7 @@
 #import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
 
 #include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -45,6 +46,9 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 /** TfLiteInterpreter backed by C API. */
 @property(nonatomic, nullable) TfLiteInterpreter *interpreter;
 
+/** TfLiteDelegate backed by C API. */
+@property(nonatomic, nullable) TfLiteDelegate *xnnpack_delegate;
+
 @end
 
 @implementation TFLInterpreter
@@ -53,6 +57,7 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 
 - (void)dealloc {
   TfLiteInterpreterDelete(_interpreter);
+  TfLiteXNNPackDelegateDelete(_xnnpack_delegate);
 }
 
 #pragma mark - Public
@@ -104,6 +109,16 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
       }
       TfLiteInterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
 
+      if (options.useXNNPACK) {
+        TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
+        if (options.numberOfThreads > 0) {
+          xnnpack_options.num_threads = (int32_t)options.numberOfThreads;
+        }
+
+        _xnnpack_delegate = TfLiteXNNPackDelegateCreate(&xnnpack_options);
+        TfLiteInterpreterOptionsAddDelegate(cOptions, _xnnpack_delegate);
+      }
+
       _interpreter = TfLiteInterpreterCreate(model, cOptions);
       if (_interpreter == nullptr) {
         [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
index 00b800d6af9..286cba98b49 100644
--- a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -32,6 +32,7 @@ NS_ASSUME_NONNULL_BEGIN
   TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
   XCTAssertNotNil(options);
   XCTAssertEqual(options.numberOfThreads, 0);
+  XCTAssertFalse(options.useXNNPACK);
 }
 
 - (void)testSetNumberOfThread {
@@ -44,6 +45,14 @@ NS_ASSUME_NONNULL_BEGIN
   XCTAssertEqual(options.numberOfThreads, 3);
 }
 
+- (void)testUseXNNPACK {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.useXNNPACK = YES;
+  XCTAssertTrue(options.useXNNPACK);
+  options.useXNNPACK = NO;
+  XCTAssertFalse(options.useXNNPACK);
+}
+
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
index b83c36c4e1d..3567822208d 100644
--- a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -39,6 +39,9 @@ public final class Interpreter {
   /// The underlying `TfLiteInterpreter` C pointer.
   private var cInterpreter: CInterpreter?
 
+  /// The underlying `TfLiteDelegate` C pointer for XNNPACK delegate.
+  private var cXNNPackDelegate: Delegate.CDelegate?
+
   /// Creates a new instance with the given values.
   ///
   /// - Parameters:
@@ -78,6 +81,14 @@ public final class Interpreter {
       )
     }
     delegates?.forEach { TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, $0.cDelegate) }
+
+    // Configure the XNNPack delegate after the other delegates explicitly added by the user.
+    options.map {
+      if $0.isXNNPackEnabled {
+        configureXNNPack(options: $0, cInterpreterOptions: cInterpreterOptions)
+      }
+    }
+
     guard let cInterpreter = TfLiteInterpreterCreate(model.cModel, cInterpreterOptions) else {
       throw InterpreterError.failedToCreateInterpreter
     }
@@ -86,6 +97,7 @@ public final class Interpreter {
 
   deinit {
     TfLiteInterpreterDelete(cInterpreter)
+    TfLiteXNNPackDelegateDelete(cXNNPackDelegate)
   }
 
   /// Invokes the interpreter to perform inference from the loaded graph.
@@ -201,12 +213,13 @@ public final class Interpreter {
     guard case 0...maxIndex = index else {
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
-    guard TfLiteInterpreterResizeInputTensor(
-      cInterpreter,
-      Int32(index),
-      shape.int32Dimensions,
-      Int32(shape.rank)
-    ) == kTfLiteOk
+    guard
+      TfLiteInterpreterResizeInputTensor(
+        cInterpreter,
+        Int32(index),
+        shape.int32Dimensions,
+        Int32(shape.rank)
+      ) == kTfLiteOk
     else {
       throw InterpreterError.failedToResizeInputTensor(index: index)
     }
@@ -236,11 +249,11 @@ public final class Interpreter {
     }
 
     #if swift(>=5.0)
-    let status = data.withUnsafeBytes {
-      TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
-    }
+      let status = data.withUnsafeBytes {
+        TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
+      }
     #else
-    let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
+      let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
     #endif  // swift(>=5.0)
     guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
     return try input(at: index)
@@ -256,6 +269,18 @@ public final class Interpreter {
       throw InterpreterError.failedToAllocateTensors
     }
   }
+
+  // MARK: - Private
+
+  private func configureXNNPack(options: Options, cInterpreterOptions: OpaquePointer) {
+    var cXNNPackOptions = TfLiteXNNPackDelegateOptionsDefault()
+    if let threadCount = options.threadCount, threadCount > 0 {
+      cXNNPackOptions.num_threads = Int32(threadCount)
+    }
+
+    cXNNPackDelegate = TfLiteXNNPackDelegateCreate(&cXNNPackOptions)
+    TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, cXNNPackDelegate)
+  }
 }
 
 extension Interpreter {
@@ -265,6 +290,28 @@ extension Interpreter {
     /// indicating that the `Interpreter` will decide the number of threads to use.
     public var threadCount: Int? = nil
 
+    /// Indicates whether an optimized set of floating point CPU kernels, provided by XNNPACK, is
+    /// enabled.
+    ///
+    /// - Experiment:
+    /// Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided
+    /// via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
+    /// operations. Eventually, we plan to enable this by default, as it can provide significant
+    /// performance benefits for many classes of floating point models. See
+    /// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+    /// for more details.
+    ///
+    /// - Important:
+    /// Things to keep in mind when enabling this flag:
+    ///
+    ///     * Startup time and resize time may increase.
+    ///     * Baseline memory consumption may increase.
+    ///     * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+    ///     * Quantized models will not see any benefit.
+    ///
+    /// - Warning: This is an experimental interface that is subject to change.
+    public var isXNNPackEnabled: Bool = false
+
     /// Creates a new instance with the default values.
     public init() {}
   }
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
index 8d0140279af..67d8120df4d 100644
--- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -142,10 +142,12 @@ class InterpreterTests: XCTestCase {
   }
 
   func testResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
-    XCTAssertThrowsError(try interpreter.resizeInput(
-      at: AddModel.invalidIndex,
-      to: [2, 2, 3]
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.resizeInput(
+        at: AddModel.invalidIndex,
+        to: [2, 2, 3]
+      )
+    ) { error in
       let maxIndex = AddModel.inputTensorCount - 1
       self.assertEqualErrors(
         actual: error,
@@ -162,10 +164,12 @@ class InterpreterTests: XCTestCase {
   }
 
   func testCopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
-    XCTAssertThrowsError(try interpreter.copy(
-      AddModel.inputData,
-      toInputAt: AddModel.invalidIndex
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.copy(
+        AddModel.inputData,
+        toInputAt: AddModel.invalidIndex
+      )
+    ) { error in
       let maxIndex = AddModel.inputTensorCount - 1
       self.assertEqualErrors(
         actual: error,
@@ -178,10 +182,12 @@ class InterpreterTests: XCTestCase {
     try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
     try interpreter.allocateTensors()
     let invalidData = Data(count: AddModel.dataCount - 1)
-    XCTAssertThrowsError(try interpreter.copy(
-      invalidData,
-      toInputAt: AddModel.validIndex
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.copy(
+        invalidData,
+        toInputAt: AddModel.validIndex
+      )
+    ) { error in
       self.assertEqualErrors(
         actual: error,
         expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
@@ -223,12 +229,20 @@ class InterpreterOptionsTests: XCTestCase {
   func testInitWithDefaultValues() {
     let options = Interpreter.Options()
     XCTAssertNil(options.threadCount)
+    XCTAssertFalse(options.isXNNPackEnabled)
   }
 
   func testInitWithCustomValues() {
     var options = Interpreter.Options()
+
     options.threadCount = 2
     XCTAssertEqual(options.threadCount, 2)
+
+    options.isXNNPackEnabled = false
+    XCTAssertFalse(options.isXNNPackEnabled)
+
+    options.isXNNPackEnabled = true
+    XCTAssertTrue(options.isXNNPackEnabled)
   }
 
   func testEquatable() {
@@ -242,6 +256,15 @@ class InterpreterOptionsTests: XCTestCase {
 
     options2.threadCount = 3
     XCTAssertNotEqual(options1, options2)
+
+    options2.threadCount = 2
+    XCTAssertEqual(options1, options2)
+
+    options2.isXNNPackEnabled = true
+    XCTAssertNotEqual(options1, options2)
+
+    options1.isXNNPackEnabled = true
+    XCTAssertEqual(options1, options2)
   }
 }
 
@@ -326,14 +349,15 @@ extension Array {
   init?(unsafeData: Data) {
     guard unsafeData.count % MemoryLayout.stride == 0 else { return nil }
     #if swift(>=5.0)
-    self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
+      self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
     #else
-    self = unsafeData.withUnsafeBytes {
-      .init(UnsafeBufferPointer(
-        start: $0,
-        count: unsafeData.count / MemoryLayout.stride
-      ))
-    }
+      self = unsafeData.withUnsafeBytes {
+        .init(
+          UnsafeBufferPointer(
+            start: $0,
+            count: unsafeData.count / MemoryLayout.stride
+          ))
+      }
     #endif  // swift(>=5.0)
   }
 }

From 4b0a6f818fa8e3c38fd0cf68d9a647f82cf6c93a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:01:50 -0700
Subject: [PATCH 0604/1390] Update GraphDef version to 437.

PiperOrigin-RevId: 317270285
Change-Id: Ib8d1e6dbb565c01d2bdf0304a03be1c1eebbde41
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 546d86e58fa..9a79fc1eddf 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 436  // Updated: 2020/6/18
+#define TF_GRAPH_DEF_VERSION 437  // Updated: 2020/6/19
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From d41d28120e6aac1efbad27523a78cd254434dc4e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:02:01 -0700
Subject: [PATCH 0605/1390] compat: Update forward compatibility horizon to
 2020-06-19

PiperOrigin-RevId: 317270310
Change-Id: Idc8188172496af9f2494c580cdab27558b16e4a8
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 32545ac8463..22988d26cfc 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 18)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 19)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 9d1ec55aed0a4d9baf7302974fefe08546bfad25 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:04:36 -0700
Subject: [PATCH 0606/1390] Integrate LLVM at
 https://github.com/llvm/llvm-project/commit/7f0d7f326316

PiperOrigin-RevId: 317270655
Change-Id: Ic80ab697da45212c8d58bcda989e5ee0a330b565
---
 .../mlir/xla/transforms/hlo_legalize_to_lhlo.cc     | 13 +++++++------
 .../compiler/mlir/xla/transforms/legalize_tf.cc     |  4 ++--
 .../mlir/xla/transforms/lhlo_legalize_to_gpu.cc     |  2 +-
 .../xla/transforms/lhlo_legalize_to_llvm_pass.cc    |  2 +-
 .../transforms/lhlo_legalize_to_parallel_loops.cc   |  2 +-
 .../mlir/xla/transforms/xla_legalize_to_linalg.cc   |  4 ++--
 .../xla/service/mlir_gpu/kernel_lowering.cc         |  2 +-
 tensorflow/workspace.bzl                            |  4 ++--
 8 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index 1cfe0c12e20..a11b08e0ea6 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -389,10 +389,13 @@ struct HloLegalizeToLhlo
     target.addLegalOp();
     target.addLegalOp();
     target.addIllegalDialect();
+
+    BufferAssignmentTypeConverter converter;
     target.addDynamicallyLegalOp([&](FuncOp op) {
       auto inputs = op.getType().getInputs();
-      return std::all_of(inputs.begin(), inputs.end(),
-                         [](Type input) { return input.isa(); });
+      return llvm::all_of(inputs,
+                          [](Type input) { return input.isa(); }) &&
+             converter.isLegal(&op.getBody());
     });
     target.addDynamicallyLegalOp([&](mlir::ReturnOp returnOp) {
       return std::all_of(returnOp.operand_type_begin(),
@@ -401,8 +404,7 @@ struct HloLegalizeToLhlo
     });
 
     auto module = getOperation();
-    BufferAssignmentTypeConverter converter;
-    module.walk([&](FuncOp func) {
+    module.walk([&](FuncOp func) -> WalkResult {
       BufferAssignmentPlacer bufferAssignment(func);
       OwningRewritePatternList patterns;
       populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment,
@@ -418,8 +420,7 @@ struct HloLegalizeToLhlo
             /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment,
                                                   &converter, &patterns);
       }
-      return WalkResult(
-          applyPartialConversion(func, target, patterns, &converter));
+      return applyPartialConversion(func, target, patterns);
     });
   }
 
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
index b7cad554043..1788cd1b270 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
@@ -5238,8 +5238,8 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion,
     // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp.
     target.addLegalOp();
     DenseSet nonlegalized_ops;
-    LogicalResult result = applyPartialConversion(
-        op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops);
+    LogicalResult result =
+        applyPartialConversion(op, target, patterns, &nonlegalized_ops);
     // In order to enforce that the conversion result is fully converted,
     // fail if there are any nonlegalized ops in the set.
     if (failed(result) || !nonlegalized_ops.empty()) {
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
index f0eb3cc1a0f..c23b8b49268 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
@@ -177,7 +177,7 @@ struct LhloLegalizeToGpu : public PassWrapper {
     target.addIllegalOp();
     auto func = getFunction();
     patterns.insert(func.getContext());
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
index 9b809049290..63265c4a7e7 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
@@ -43,7 +43,7 @@ class TestLhloToLLVMPass
     target.addLegalOp();
     target.addIllegalDialect();
 
-    if (failed(applyFullConversion(m, target, patterns, &converter))) {
+    if (failed(applyFullConversion(m, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
index b3112d49103..65962c5b7a5 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
@@ -711,7 +711,7 @@ struct LhloLegalizeToParallelLoops
     target.addIllegalOp();
 
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
index ad78a01100b..8a2f8ce7d04 100644
--- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
@@ -867,7 +867,7 @@ struct LhloLegalizeToLinalg
 
     auto func = getFunction();
     populateLHLOToLinalgConversionPattern(func.getContext(), &patterns);
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
@@ -882,7 +882,7 @@ struct HloLegalizeToLinalg
 
     auto func = getFunction();
     xla_hlo::populateHLOToLinalgConversionPattern(func.getContext(), &patterns);
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
index 9d5b52df010..ecd1308be4b 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
@@ -552,7 +552,7 @@ class LowerToNVVMPass
     // TODO(csigg): Remove once we support replacing non-root ops.
     target.addLegalOp<::mlir::gpu::GPUModuleOp, ::mlir::gpu::ModuleEndOp,
                       ::mlir::gpu::YieldOp>();
-    if (failed(mlir::applyFullConversion(m, target, patterns, &converter))) {
+    if (failed(mlir::applyFullConversion(m, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 52c573628ac..27eca0ee54f 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     )
 
     # Check out LLVM and MLIR from llvm-project.
-    LLVM_COMMIT = "92d8ad02e92fed3884169ba5d98056fe4fa5660d"
-    LLVM_SHA256 = "a4995ace7ddaef0c49293dc65771f58ef1fea96ebe1f39aa0a2d6d75d07f6cc7"
+    LLVM_COMMIT = "7f0d7f32631648acf48bc23047635ab5e2058a1a"
+    LLVM_SHA256 = "2f1dbae231b3b8f9c67d6a4f578c8ce29f3aa2831313b34c40ff2edb4014476a"
     LLVM_URLS = [
         "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),
         "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),

From c662daf4891a1e6efe64797615c3bd2bebedc5f5 Mon Sep 17 00:00:00 2001
From: Smit Hinsu 
Date: Fri, 19 Jun 2020 02:22:43 -0700
Subject: [PATCH 0607/1390] Override CustomCall in MlirHloBuilder

Also, enable the MLIR bridge for the image ops compiler test. ResizeBilinear op
lowering uses CustomCall in the case of TPU lowerings.

PiperOrigin-RevId: 317272443
Change-Id: I134c828cdc76552a0cbfdeb7c65532aa986314e2
---
 .../compiler/mlir/xla/ir/mlir_hlo_builder.cc  | 16 ++++++++++++
 .../compiler/mlir/xla/ir/mlir_hlo_builder.h   |  6 +++++
 .../xla/transforms/legalize_tf_with_tf2xla.cc |  8 ++++++
 tensorflow/compiler/tests/BUILD               |  1 +
 tensorflow/compiler/xla/client/xla_builder.cc | 26 ++++++++++++++-----
 tensorflow/compiler/xla/client/xla_builder.h  |  8 ++++++
 6 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
index 21b1ac5f0ea..3c11d8e590d 100644
--- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
+++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
@@ -132,6 +132,22 @@ StatusOr MlirHloBuilder::FftInternal(
   return MakeXlaOp(op);
 }
 
+StatusOr<XlaOp> MlirHloBuilder::CustomCallInternal(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
+  if (operand_shapes_with_layout.has_value())
+    return Unimplemented(
+        "CustomCall doesn't support operands shapes with layout");
+  TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType<mlir::RankedTensorType>(
+                                         shape, builder_));
+  auto op = builder_.create<mlir::xla_hlo::CustomCallOp>(
+      loc_, ty, GetValues(operands), builder_.getStringAttr(call_target_name),
+      /*has_side_effect=*/builder_.getBoolAttr(false),
+      builder_.getStringAttr(opaque));
+  return MakeXlaOp(op);
+}
+
 StatusOr<XlaOp> MlirHloBuilder::ReduceInternal(
     const Shape& shape, absl::Span<const XlaOp> all_operands,
     const XlaComputation& computation,
diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
index 4b28c32db99..4d7d93af7a7 100644
--- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
+++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
@@ -124,6 +124,12 @@ class MlirHloBuilder : public XlaBuilder {
                               FftType fft_type,
                               absl::Span<const int64> fft_length) override;
 
+  StatusOr<XlaOp> CustomCallInternal(const string& call_target_name,
+                                     absl::Span<const XlaOp> operands,
+                                     const Shape& shape, const string& opaque,
+                                     absl::optional<absl::Span<const Shape>>
+                                         operand_shapes_with_layout) override;
+
   StatusOr<XlaOp> ReduceInternal(
       const Shape& shape, absl::Span<const XlaOp> all_operands,
       const XlaComputation& computation,
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
index ef79c8868bb..8f96f4d1305 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
@@ -88,6 +88,9 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
@@ -127,6 +130,7 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
@@ -157,10 +161,14 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b574622efce..034ec82de10 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -770,6 +770,7 @@ tf_xla_py_test(
     size = "small",
     timeout = "long",
     srcs = ["image_ops_test.py"],
+    enable_mlir_bridge = True,
     python_version = "PY3",
     shard_count = 10,
     tags = [
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index c7b6a7f9491..03ae23ea18b 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -1564,16 +1564,12 @@ XlaOp XlaBuilder::CustomCall(
     const Shape& shape, const string& opaque,
     absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    HloInstructionProto instr;
     if (absl::StartsWith(call_target_name, "$")) {
       return InvalidArgument(
           "Invalid custom_call_target \"%s\": Call targets that start with '$' "
           "are reserved for internal use.",
           call_target_name);
     }
-    *instr.mutable_shape() = shape.ToProto();
-    instr.set_custom_call_target(call_target_name);
-    instr.set_backend_config(opaque);
     if (operand_shapes_with_layout.has_value()) {
       if (!LayoutUtil::HasLayout(shape)) {
         return InvalidArgument(
@@ -1586,7 +1582,6 @@ XlaOp XlaBuilder::CustomCall(
             "with constrained layout; given %d shapes, expected %d",
             operand_shapes_with_layout->size(), operands.size());
       }
-      instr.set_constrain_layout(true);
       int64 operand_num = 0;
       for (const Shape& operand_shape : *operand_shapes_with_layout) {
         if (!LayoutUtil::HasLayout(operand_shape)) {
@@ -1595,14 +1590,31 @@ XlaOp XlaBuilder::CustomCall(
               "constrained layout.",
               operand_num);
         }
-        *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
         ++operand_num;
       }
     }
-    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+    return CustomCallInternal(call_target_name, operands, shape, opaque,
+                              operand_shapes_with_layout);
   });
 }
 
+StatusOr<XlaOp> XlaBuilder::CustomCallInternal(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape.ToProto();
+  instr.set_custom_call_target(call_target_name);
+  instr.set_backend_config(opaque);
+  if (operand_shapes_with_layout.has_value()) {
+    instr.set_constrain_layout(true);
+    for (const Shape& operand_shape : *operand_shapes_with_layout) {
+      *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
+    }
+  }
+  return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+}
+
 XlaOp XlaBuilder::CustomCall(
     const string& call_target_name, absl::Span<const XlaOp> operands,
     const XlaComputation& computation, const Shape& shape, const string& opaque,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index b8af180b83e..3fc26747468 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -527,6 +527,14 @@ class XlaBuilder {
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
+  // Internal version of CustomCall without computation that doesn't do op
+  // specific error handling and expects arguments to be legal. CustomCall
+  // method above calls this method after error handling.
+  virtual StatusOr<XlaOp> CustomCallInternal(
+      const string& call_target_name, absl::Span<const XlaOp> operands,
+      const Shape& shape_with_layout, const string& opaque,
+      absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
+
   XlaOp CustomCall(
       const string& call_target_name, absl::Span<const XlaOp> operands,
       const XlaComputation& computation, const Shape& shape_with_layout,

From 8ad5bc80e71921c3c2530d93d3856ba59e524c60 Mon Sep 17 00:00:00 2001
From: Lukas Geiger 
Date: Fri, 19 Jun 2020 11:50:24 +0200
Subject: [PATCH 0608/1390] Remove unnecessary assert

---
 .../mixed_precision/experimental/autocast_variable_test.py      | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
index 2fa7c103258..9036109af96 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
@@ -372,7 +372,6 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
         # Variable should be increased, despite it appearing to be the same
         # float16 value.
         self.evaluate(x.assign(1. + small_tensor))
-        self.assertEqual(1. + small_val, self.evaluate(x._variable))
         self.assertEqual(1., self.evaluate(x.value()))
       self.assertEqual(1. + small_val, self.evaluate(x))
 
@@ -380,7 +379,6 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
       with ops.get_default_graph()._enable_auto_casting_variables(
           dtypes.float16):
         self.evaluate(x.assign_add(small_tensor))
-        self.assertEqual(1. + small_val, self.evaluate(x._variable))
         self.assertEqual(1., self.evaluate(x.value()))
       self.assertEqual(1. + small_val, self.evaluate(x))
 

From dc8d42922b9ff89e717f130515c968186ec4504c Mon Sep 17 00:00:00 2001
From: Lukas Geiger 
Date: Fri, 19 Jun 2020 12:33:01 +0200
Subject: [PATCH 0609/1390] Remove unnecessary control_dependencies

---
 .../mixed_precision/experimental/autocast_variable.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
index ca6420f0c0b..b60100c7b48 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -190,25 +190,20 @@ class AutoCastVariable(variables.Variable, core.Tensor):
 
   def _apply_assign_update(
       self, update_fn, value, use_locking=None, name=None, read_value=True):
-    if not read_value:
-      return update_fn(value, use_locking, name, read_value)
-
     if context.executing_eagerly() or ops.inside_function():
       assign_op = update_fn(value, use_locking, name, False)
-      with ops.control_dependencies([assign_op]):
-        return self
+      return self if read_value else assign_op
 
     # Fallback to wrapping the returned variable in graph mode if possible
     assign_var = update_fn(value, use_locking, name, read_value)
-    if resource_variable_ops.is_resource_variable(assign_var):
+    if read_value and resource_variable_ops.is_resource_variable(assign_var):
       return create_autocast_variable(assign_var)
     return assign_var
 
   def _apply_update(self, update_fn, *args, **kwargs):
     update_var = update_fn(*args, **kwargs)
     if context.executing_eagerly() or ops.inside_function():
-      with ops.control_dependencies([update_var]):
-        return self
+      return self
 
     # Fallback to wrapping the returned variable in graph mode if possible
     if resource_variable_ops.is_resource_variable(update_var):

From b58e6000457f26c7a53a9b945642fbe2baddbf20 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev 
Date: Fri, 19 Jun 2020 04:25:24 -0700
Subject: [PATCH 0610/1390] [XLA][MLIR] Enable xla_hlo.ReshapeOp ->
 xla_lhlo.ReshapeOp conversion.

PiperOrigin-RevId: 317284676
Change-Id: Ia845183efcfabe77f6eb66d8c56dcbfc82653982
---
 tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 +
 tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index a11b08e0ea6..446f2aae833 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -464,6 +464,7 @@ void populateHLOToLHLOConversionPattern(
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
+      HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
index 4b9397795a1..8d5f27474a5 100644
--- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
+++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
@@ -61,6 +61,7 @@ MAP_HLO_TO_LHLO(MulOp);
 MAP_HLO_TO_LHLO(NegOp);
 MAP_HLO_TO_LHLO(RealOp);
 MAP_HLO_TO_LHLO(ReduceOp);
+MAP_HLO_TO_LHLO(ReshapeOp);
 MAP_HLO_TO_LHLO(RemOp);
 MAP_HLO_TO_LHLO(RsqrtOp);
 MAP_HLO_TO_LHLO(SelectOp);

From 7b1a726ec3ab2d831167d32b9b820bcf14aba6f7 Mon Sep 17 00:00:00 2001
From: Devi Sandeep Endluri 
Date: Fri, 19 Jun 2020 06:23:39 -0500
Subject: [PATCH 0611/1390] Ensure there are test samples for imdb dataset,
 when maxlen is low

With the current imdb.load_data(), the following results are seen
for different values of maxlen.

	load_data                (len(x_train), len(x_test))
------------------------------------------------------------
imdb.load_data(maxlen=50)    -->    (1035, 0)
imdb.load_data(maxlen=100)   -->    (5736, 0)
imdb.load_data(maxlen=200)   -->    (25000, 3913)
imdb.load_data()             -->    (25000, 25000)

Analysis: We can observe that when maxlen is low, the number
of test samples can be 0. This is because the train and test data are
concatenated first, then the samples with length > maxlen are removed,
and the first 25,000 of what remains are treated as training data.

Fix: Filter each split first to remove the sequences with length > maxlen,
and only then concatenate the splits for further processing (see the sketch
after the results below). The following are the results after the fix.

     fixed load_data              (len(x_train), len(x_test))
------------------------------------------------------------
imdb.load_data(maxlen=50)    -->    (477, 558)
imdb.load_data(maxlen=100)   -->    (2773, 2963)
imdb.load_data(maxlen=200)   -->    (14244, 14669)
imdb.load_data()             -->    (25000, 25000)
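
A minimal sketch of the filter-then-concatenate idea (standalone Python; the
helper below is a simplified stand-in for the private Keras `_remove_long_seq`
utility, and plain list concatenation stands in for the `np.concatenate` call
in `load_data()`):

    def _remove_long_seq(maxlen, seqs, labels):
      # Simplified stand-in: keep only (sequence, label) pairs that fit maxlen.
      kept = [(s, l) for s, l in zip(seqs, labels) if len(s) < maxlen]
      return [s for s, _ in kept], [l for _, l in kept]

    def filter_then_concatenate(x_train, labels_train, x_test, labels_test, maxlen):
      # Filter each split on its own, so neither split can end up empty just
      # because the other split's samples were kept first.
      x_train, labels_train = _remove_long_seq(maxlen, x_train, labels_train)
      x_test, labels_test = _remove_long_seq(maxlen, x_test, labels_test)
      if not x_train or not x_test:
        raise ValueError('After filtering for sequences shorter than maxlen=%d, '
                         'no sequence was kept. Increase maxlen.' % maxlen)
      # Only now combine the splits for the downstream vocabulary handling.
      return x_train + x_test, labels_train + labels_test

    # Example: with maxlen=4 both splits still contribute samples.
    xs, labels = filter_then_concatenate(
        [[1, 2, 3], [1, 2, 3, 4, 5], [1]], [0, 1, 0],
        [[1, 2], [1, 2, 3, 4, 5, 6]], [1, 0], maxlen=4)
    assert xs == [[1, 2, 3], [1], [1, 2]] and labels == [0, 0, 1]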
---
 tensorflow/python/keras/datasets/imdb.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py
index 37403228edf..e359d691a5d 100644
--- a/tensorflow/python/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/datasets/imdb.py
@@ -124,20 +124,24 @@ def load_data(path='imdb.npz',
   x_test = x_test[indices]
   labels_test = labels_test[indices]
 
-  xs = np.concatenate([x_train, x_test])
-  labels = np.concatenate([labels_train, labels_test])
-
   if start_char is not None:
-    xs = [[start_char] + [w + index_from for w in x] for x in xs]
+    x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
+    x_test = [[start_char] + [w + index_from for w in x] for x in x_test]
   elif index_from:
-    xs = [[w + index_from for w in x] for x in xs]
+    x_train = [[w + index_from for w in x] for x in x_train]
+    x_test = [[w + index_from for w in x] for x in x_test]
 
   if maxlen:
-    xs, labels = _remove_long_seq(maxlen, xs, labels)
-    if not xs:
+    x_train, labels_train = _remove_long_seq(maxlen, x_train, labels_train)
+    x_test, labels_test = _remove_long_seq(maxlen, x_test, labels_test)
+    if not x_train or not x_test:
       raise ValueError('After filtering for sequences shorter than maxlen=' +
                        str(maxlen) + ', no sequence was kept. '
                        'Increase maxlen.')
+
+  xs = np.concatenate([x_train, x_test])
+  labels = np.concatenate([labels_train, labels_test])
+
   if not num_words:
     num_words = max(max(x) for x in xs)
 

From 6ddd920c44540284ab28cc7cdf2faef18b85ff11 Mon Sep 17 00:00:00 2001
From: Xinan Jiang 
Date: Fri, 19 Jun 2020 20:51:30 +0800
Subject: [PATCH 0612/1390] [MLIR][XLA] Modify for FileCheck commands

---
 tensorflow/compiler/xla/service/mlir_gpu/tests/gather.hlo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/gather.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/gather.hlo
index 99faa319bf6..8a24daae0f4 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/tests/gather.hlo
+++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/gather.hlo
@@ -11,5 +11,5 @@ ENTRY %Gather (x: f32[100,10], y: s64[4,6]) -> f32[4,6,10] {
 
 // CHECK: func @gather(%[[ARG0:.*]]: [[TYPE0:.*]], %[[ARG1:.*]]: [[TYPE1:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) {
 // CHECK:   "xla_lhlo.gather"(%[[ARG0]], %[[ARG1]], %[[RESULT]])
-// CHECK:   {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 2 : i64, offset_dims = dense<2> : tensor<1xi64>, slice_sizes = dense<[1, 10]> : tensor<2xi64>, start_index_map = dense<0> : tensor<1xi64>} : ([[TYPE0]], [[TYPE1]], [[RTYPE]]) -> ()
+// CHECK-SAME:   {collapsed_slice_dims = dense<0> : tensor<1xi64>, index_vector_dim = 2 : i64, offset_dims = dense<2> : tensor<1xi64>, slice_sizes = dense<[1, 10]> : tensor<2xi64>, start_index_map = dense<0> : tensor<1xi64>} : ([[TYPE0]], [[TYPE1]], [[RTYPE]]) -> ()
 // CHECK: }

From 42579858f9cda701c7c69d4a1f89035f0a68b258 Mon Sep 17 00:00:00 2001
From: Xinan Jiang 
Date: Fri, 19 Jun 2020 20:56:08 +0800
Subject: [PATCH 0613/1390] [MLIR][XLA] Add GatherOp to HLO to LHLO converters

---
 tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 +
 tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index 1cfe0c12e20..524965cfacd 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -453,6 +453,7 @@ void populateHLOToLHLOConversionPattern(
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
+      HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
index 4b9397795a1..5e3d1cb9302 100644
--- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
+++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
@@ -52,6 +52,7 @@ MAP_HLO_TO_LHLO(CosOp);
 MAP_HLO_TO_LHLO(DivOp);
 MAP_HLO_TO_LHLO(DotOp);
 MAP_HLO_TO_LHLO(ExpOp);
+MAP_HLO_TO_LHLO(GatherOp);
 MAP_HLO_TO_LHLO(ImagOp);
 MAP_HLO_TO_LHLO(IotaOp);
 MAP_HLO_TO_LHLO(LogOp);

From 0c7e61d6608e29324357fb5f79a9b925281521a0 Mon Sep 17 00:00:00 2001
From: Hanhan Wang 
Date: Fri, 19 Jun 2020 07:09:13 -0700
Subject: [PATCH 0614/1390] Remove the canonicalize pattern for folding a pad
 op into the following conv op.

This essentially rolls back cl/305641881. The pattern can hurt performance
because the resulting operation can't be fully tiled in the Linalg
transformation, and since not everyone wants this pattern, remove it from the
canonicalization patterns.

PiperOrigin-RevId: 317302072
Change-Id: I19aa64e14eecccfd738ad3f775f3670974bc68f9
---
 tensorflow/compiler/mlir/xla/ir/hlo_ops.cc    | 56 ----------------
 tensorflow/compiler/mlir/xla/ir/hlo_ops.td    |  2 -
 .../compiler/mlir/xla/tests/canonicalize.mlir | 65 -------------------
 .../mlir/xla/transforms/canonicalize.td       | 51 ---------------
 4 files changed, 174 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
index d7950919883..e0fa1da93b8 100644
--- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
+++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
@@ -106,53 +106,6 @@ DenseIntElementsAttr BuildSliceLimits(DenseIntElementsAttr start_indices,
   return GetI64ElementsAttr(slice_limits, builder);
 }
 
-// Returns the padding value of the given position. If padding_attr is a
-// nullptr, returns 0.
-static int64_t GetPaddingValue(DenseIntElementsAttr padding_attr,
-                               ArrayRef index) {
-  if (!padding_attr) return 0;
-  return padding_attr.getValue(index);
-}
-
-static bool IsOnlyPaddingSpatialDims(Value lhs,
-                                     ConvDimensionNumbers dimension_numbers,
-                                     DenseIntElementsAttr edge_padding_low,
-                                     DenseIntElementsAttr edge_padding_high) {
-  const int64_t batch_dim = dimension_numbers.input_batch_dimension().getInt();
-  const int64_t feature_dim =
-      dimension_numbers.input_feature_dimension().getInt();
-  if (edge_padding_low.getValue(batch_dim) ||
-      edge_padding_high.getValue(batch_dim))
-    return false;
-  if (edge_padding_low.getValue(feature_dim) ||
-      edge_padding_high.getValue(feature_dim))
-    return false;
-  return true;
-}
-
-DenseIntElementsAttr BuildConvPaddingAttrs(
-    DenseIntElementsAttr edge_padding_low,
-    DenseIntElementsAttr edge_padding_high, DenseIntElementsAttr padding_attr,
-    ConvDimensionNumbers dimension_numbers, Builder* builder) {
-  SmallVector padding_low, padding_high;
-  for (const auto& dim : dimension_numbers.input_spatial_dimensions()) {
-    unsigned i = dim.getZExtValue();
-    padding_low.push_back(edge_padding_low.getValue(i));
-    padding_high.push_back(edge_padding_high.getValue(i));
-  }
-
-  int rank = padding_low.size();
-  SmallVector padding;
-  for (unsigned i = 0, e = rank; i < e; ++i) {
-    padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]);
-    padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]);
-  }
-  // padding_attr.getType() doesn't work because it is an optional attribute,
-  // which can be a nullptr.
-  auto type = RankedTensorType::get({rank, 2}, builder->getIntegerType(64));
-  return DenseIntElementsAttr::get(type, padding);
-}
-
 #include "tensorflow/compiler/mlir/xla/transforms/generated_canonicalize.inc"
 }  // namespace
 
@@ -2153,14 +2106,5 @@ LogicalResult deriveShapeFromFirstOperand(
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// ConvOp
-//===----------------------------------------------------------------------===//
-
-void ConvOp::getCanonicalizationPatterns(OwningRewritePatternList& results,
-                                         MLIRContext* context) {
-  results.insert(context);
-}
-
 }  // namespace xla_hlo
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
index b1745c73fbf..f92d1c5b85c 100644
--- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
+++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
@@ -929,8 +929,6 @@ def HLO_ConvOp : HLO_Op<"convolution", [NoSideEffect]>, BASE_HLO_ConvOp {
   );
 
   let results = (outs HLO_Tensor);
-
-  let hasCanonicalizer = 1;
 }
 
 def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_CopyOp {
diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
index ef0f8c4d200..1954c3344df 100644
--- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
@@ -415,71 +415,6 @@ func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> {
   return %0 : tensor<1x4xf32>
 }
 
-// CHECK-LABEL: func @fold_pad_into_conv_f32
-func @fold_pad_into_conv_f32(%arg0 : tensor<1x32x32x3xf32>,
-                         %arg1 : tensor<7x7x3x64xf32>)
-    -> tensor<1x16x16x64xf32> {
-  //  CHECK-NOT: xla_hlo.pad
-  //      CHECK: xla_hlo.convolution
-  // CHECK-SAME: padding = dense<3> : tensor<2x2xi64>
-  %0 = xla_hlo.constant dense<0.000000e+00> : tensor
-  %1 = "xla_hlo.pad"(%arg0, %0) {
-    edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    interior_padding = dense<0> : tensor<4xi64>
-  } : (tensor<1x32x32x3xf32>, tensor) -> tensor<1x38x38x3xf32>
-  %2 = "xla_hlo.convolution"(%1, %arg1) {
-    batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
-    feature_group_count = 1 : i64,
-    padding = dense<0> : tensor<2x2xi64>,
-    window_strides = dense<2> : tensor<2xi64>
-  } : (tensor<1x38x38x3xf32>, tensor<7x7x3x64xf32>) -> tensor<1x16x16x64xf32>
-  return %2 : tensor<1x16x16x64xf32>
-}
-
-// CHECK-LABEL: func @fold_pad_into_conv_i32
-func @fold_pad_into_conv_i32(%arg0 : tensor<1x32x32x3xi32>,
-                         %arg1 : tensor<7x7x3x64xi32>)
-    -> tensor<1x16x16x64xi32> {
-  //  CHECK-NOT: xla_hlo.pad
-  //      CHECK: xla_hlo.convolution
-  // CHECK-SAME: padding = dense<3> : tensor<2x2xi64>
-  %0 = xla_hlo.constant dense<0> : tensor
-  %1 = "xla_hlo.pad"(%arg0, %0) {
-    edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    interior_padding = dense<0> : tensor<4xi64>
-  } : (tensor<1x32x32x3xi32>, tensor) -> tensor<1x38x38x3xi32>
-  %2 = "xla_hlo.convolution"(%1, %arg1) {
-    batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
-    feature_group_count = 1 : i64,
-    window_strides = dense<2> : tensor<2xi64>
-  } : (tensor<1x38x38x3xi32>, tensor<7x7x3x64xi32>) -> tensor<1x16x16x64xi32>
-  return %2 : tensor<1x16x16x64xi32>
-}
-
 // CHECK-LABEL: func @dynamic_reshape_not_actually_dynamic
 func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor<2xindex>) -> tensor<4x1xf32> {
   // CHECK: xla_hlo.reshape
diff --git a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
index b788cb80380..c319551d92a 100644
--- a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
+++ b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
@@ -28,54 +28,3 @@ def UnaryEinsumToEinsum : Pat<
   (HLO_UnaryEinsumOp $operand, $equation),
   (HLO_EinsumOp (HLO_ConstOp (GetScalarOfType<1> $operand)),
                 $operand, (UnaryToBinaryEinsumEq $equation))>;
-
-//===----------------------------------------------------------------------===//
-// Conv op patterns.
-//===----------------------------------------------------------------------===//
-
-def IsZero : Attr() &&"
-  "$_self.cast().isSplat() &&"
-  "$_self.cast().getSplatValue()"
-  ".getValue().isZero()) ||"
-  "($_self.isa() &&"
-  "$_self.cast().isSplat() &&"
-  "$_self.cast().getSplatValue()"
-  ".getInt() == 0)">>;
-
-def IsOnlyPaddingSpatialDims
-  : Constraint>;
-
-def BuildConvPaddingAttrs : NativeCodeCall<
-  "BuildConvPaddingAttrs($0, $1, $2, $3, &$_builder)">;
-
-def FoldPadIntoConv : Pat<
-  (HLO_ConvOp
-    (HLO_PadOp $lhs,
-      (HLO_ConstOp IsZero:$padding_value),
-      $edge_padding_low,
-      $edge_padding_high,
-      IsZero:$interior_padding),
-    $rhs,
-    $window_strides,
-    $padding,
-    $lhs_dilation,
-    $rhs_dilation,
-    $dimension_numbers,
-    $feature_group_count,
-    $batch_group_count,
-    $precision_config),
-  (HLO_ConvOp
-    $lhs,
-    $rhs,
-    $window_strides,
-    (BuildConvPaddingAttrs $edge_padding_low, $edge_padding_high, $padding,
-      $dimension_numbers),
-    $lhs_dilation,
-    $rhs_dilation,
-    $dimension_numbers,
-    $feature_group_count,
-    $batch_group_count,
-    $precision_config),
-    [(IsOnlyPaddingSpatialDims $lhs, $dimension_numbers, $edge_padding_low,
-      $edge_padding_high)]>;

From 9e7d5ef6f25e436fffae03597838294d872404f0 Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" 
Date: Fri, 19 Jun 2020 07:29:02 -0700
Subject: [PATCH 0615/1390] Full int8 quantization BatchMatMul

PiperOrigin-RevId: 317304259
Change-Id: Icf96d9d129db30b965e36f5c8befd27762b173b2
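
The new int8 path accumulates the zero-point-adjusted products in int32 and
then requantizes with a fixed-point multiplier plus shift (see OpData and the
reference kernel below). The following self-contained C++ sketch is
illustrative only: ApplyMultiplier is a simplified stand-in for TFLite's
MultiplyByQuantizedMultiplier, and QuantizedDot is a hypothetical helper, not
code from this change.

    #include <algorithm>
    #include <cstdint>

    // Simplified requantization: apply a Q31 fixed-point multiplier and a
    // (typically negative) power-of-two exponent to an int32 accumulator.
    int32_t ApplyMultiplier(int32_t acc, int32_t multiplier, int shift) {
      const int total_shift = 31 - shift;
      const int64_t rounding = int64_t{1} << (total_shift - 1);
      const int64_t prod = static_cast<int64_t>(acc) * multiplier + rounding;
      return static_cast<int32_t>(prod >> total_shift);
    }

    // One output element of an int8 batch matmul: dot product with zero-point
    // offsets (the negated zero points of the two operands), requantize, add
    // the output zero point, then clamp to the int8 range.
    int8_t QuantizedDot(const int8_t* lhs_row, const int8_t* rhs_col, int depth,
                        int32_t lhs_offset, int32_t rhs_offset,
                        int32_t output_offset, int32_t output_multiplier,
                        int output_shift) {
      int32_t acc = 0;
      for (int k = 0; k < depth; ++k) {
        acc += (lhs_row[k] + lhs_offset) * (rhs_col[k] + rhs_offset);
      }
      acc = ApplyMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max<int32_t>(-128, std::min<int32_t>(127, acc));
      return static_cast<int8_t>(acc);
    }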
---
 tensorflow/compiler/mlir/lite/ir/tfl_ops.td   |   6 +-
 tensorflow/lite/kernels/batch_matmul.cc       |  92 ++++++++++-
 tensorflow/lite/kernels/batch_matmul_test.cc  | 156 ++++++++++++++++--
 .../kernels/internal/optimized/batch_matmul.h | 106 ++++++++++++
 .../kernels/internal/reference/batch_matmul.h |  93 +++++++++++
 tensorflow/lite/kernels/register.cc           |   4 +-
 .../lite/tools/optimize/operator_property.cc  |   6 +
 .../lite/tools/versioning/op_version.cc       |   2 +
 .../lite/tools/versioning/runtime_version.cc  |   1 +
 9 files changed, 439 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index 509c13ae161..33281cc58fb 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -953,14 +953,14 @@ in the batch dimensions and broadcasting.
   }];
 
   let arguments = (ins
-    TFL_TensorOf<[F32]>:$x,
-    TFL_TensorOf<[F32]>:$y,
+    TFL_TensorOf<[F32, QI8]>:$x,
+    TFL_TensorOf<[F32, QI8]>:$y,
     DefaultValuedAttr:$adj_x,
     DefaultValuedAttr:$adj_y
   );
 
    let results = (outs
-    TFL_TensorOf<[F32]>:$output
+    TFL_TensorOf<[F32, QI8]>:$output
   );
 
   let hasOptions = 1;
diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc
index 8bc23c9c94a..a414a226504 100644
--- a/tensorflow/lite/kernels/batch_matmul.cc
+++ b/tensorflow/lite/kernels/batch_matmul.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include 
 #include 
+#include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
@@ -52,6 +53,14 @@ enum KernelType {
 };
 
 struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example, for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
   // The index of the temporary tensors where we store transposed LHS/RHS.
   int scratch_tensor_index;
   bool rhs_transposed;
@@ -274,6 +283,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   OpContext op_context(context, node);
   TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+  OpData* op_data = reinterpret_cast(node->user_data);
 
   bool adj_x = op_context.params->adj_x;
   bool adj_y = op_context.params->adj_y;
@@ -282,7 +292,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* rhs_data = GetInput(context, node, kInputRHSTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_TYPES_EQ(context, lhs_data->type, kTfLiteFloat32);
+  // Quantized inference requires that all tensors have their quantization
+  // parameters set. This is usually done during quantization-aware training.
+  if (lhs_data->type == kTfLiteInt8) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, lhs_data, rhs_data, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &op_data->output_multiplier, &exponent);
+    op_data->output_shift = exponent;
+    // BatchMatMul has no fused activation functions. Therefore, set
+    // output activation min and max to min and max of int8_t type,
+    // respectively.
+    op_data->output_activation_min = std::numeric_limits::min();
+    op_data->output_activation_max = std::numeric_limits::max();
+  }
+
+  TF_LITE_ENSURE(context, lhs_data->type == kTfLiteFloat32 ||
+                              lhs_data->type == kTfLiteInt8);
   TF_LITE_ENSURE(context, rhs_data->type == kTfLiteFloat32 ||
                               rhs_data->type == kTfLiteInt8);
   // Support dimensions between 2 and 4, inclusive.
@@ -433,6 +460,41 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, OpData* data,
   return kTfLiteOk;
 }
 
+template 
+TfLiteStatus EvalInt8(TfLiteContext* context, const OpData* data,
+                      const RuntimeShape& lhs_shape, const TfLiteTensor* lhs,
+                      const RuntimeShape& rhs_shape, const TfLiteTensor* rhs,
+                      const RuntimeShape& output_shape, TfLiteTensor* output) {
+  // Reuse params struct from FullyConnected Op.
+  FullyConnectedParams op_params;
+  int32_t input_offset = -lhs->params.zero_point;
+  int32_t filter_offset = -rhs->params.zero_point;
+  int32_t output_offset = output->params.zero_point;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.lhs_cacheable = IsConstantTensor(lhs);
+  op_params.rhs_cacheable = IsConstantTensor(rhs);
+
+  if (kernel_type == kReference) {
+    reference_ops::BatchMatMul(op_params, rhs_shape, GetTensorData(rhs),
+                               lhs_shape, GetTensorData(lhs),
+                               GetTensorShape(output),
+                               GetTensorData(output));
+  } else {
+    optimized_ops::BatchMatMul(op_params, rhs_shape, GetTensorData(rhs),
+                               lhs_shape, GetTensorData(lhs),
+                               GetTensorShape(output),
+                               GetTensorData(output),
+                               CpuBackendContext::GetFromContext(context));
+  }
+  return kTfLiteOk;
+}
+
 template 
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            OpData* data, const RuntimeShape& lhs_shape,
@@ -448,25 +510,39 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     return EvalHybrid(
         context, node, data, lhs_shape, lhs, rhs_shape, rhs, input_quantized,
         scaling_factors, accum_scratch, row_sums, input_offsets, output);
+  } else if (lhs->type == kTfLiteInt8) {
+    return EvalInt8(context, data, lhs_shape, lhs, rhs_shape, rhs,
+                                 GetTensorShape(output), output);
   } else {
-    TF_LITE_KERNEL_LOG(context,
-                       "Currently only hybrid quantization is supported.\n");
+    TF_LITE_KERNEL_LOG(
+        context, "Currently only hybrid and int8 quantization is supported.\n");
     return kTfLiteError;
   }
   return kTfLiteOk;
 }
 
-TfLiteTensor* GetRhs(TfLiteContext* context, TfLiteNode* node,
-                     const TfLiteTensor* rhs) {
+TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node,
+                         const TfLiteTensor* rhs) {
   TfLiteTensor* transposed_rhs = GetTemporary(context, node, 1);
   if (rhs->type == kTfLiteInt8) {
-    // Get the quantization params from the weights tensors.
+    // Get the quantization params from the RHS tensor.
     transposed_rhs->params.scale = rhs->params.scale;
     transposed_rhs->params.zero_point = rhs->params.zero_point;
   }
   return transposed_rhs;
 }
 
+TfLiteTensor* GetTempLhs(TfLiteContext* context, TfLiteNode* node,
+                         const TfLiteTensor* lhs) {
+  TfLiteTensor* transposed_lhs = GetTemporary(context, node, 0);
+  if (lhs->type == kTfLiteInt8) {
+    // Get the quantization params from the LHS tensor.
+    transposed_lhs->params.scale = lhs->params.scale;
+    transposed_lhs->params.zero_point = lhs->params.zero_point;
+  }
+  return transposed_lhs;
+}
+
 // Perform a batch matrix multiply on
 // LHS <..., A, B>  X  RHS<..., B, C>
 // where the leading dimensions of LHS and RHS obey broadcasting rules
@@ -491,8 +567,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   bool adj_y = op_context.params->adj_y;
   bool adj_x = op_context.params->adj_x;
 
-  const TfLiteTensor* rhs_tensor = adj_y ? rhs : GetRhs(context, node, rhs);
-  const TfLiteTensor* lhs_tensor = adj_x ? GetTemporary(context, node, 0) : lhs;
+  const TfLiteTensor* rhs_tensor = adj_y ? rhs : GetTempRhs(context, node, rhs);
+  const TfLiteTensor* lhs_tensor = adj_x ? GetTempLhs(context, node, lhs) : lhs;
   if (!adj_y) {
     // TODO(b/154760341) Constant tensors should already be transposed, but
     // we transpose once if necessary for now.
diff --git a/tensorflow/lite/kernels/batch_matmul_test.cc b/tensorflow/lite/kernels/batch_matmul_test.cc
index 5e52479f49b..98df8ebe3db 100644
--- a/tensorflow/lite/kernels/batch_matmul_test.cc
+++ b/tensorflow/lite/kernels/batch_matmul_test.cc
@@ -24,8 +24,19 @@ limitations under the License.
 #include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_BATCH_MATMUL_REF();
+TfLiteRegistration* Register_BATCH_MATMUL_GENERIC_OPTIMIZED();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
 
 template 
@@ -53,7 +64,20 @@ class BatchMatMulOpModel : public SingleOpModel {
   int output_id_;
 };
 
-TEST(BatchMatMulOpModelTest, Float32Test_Simple) {
+const auto kKernelMap = new std::map({
+    {"Reference", ops::builtin::Register_BATCH_MATMUL_REF()},
+    {"GenericOptimized",
+     ops::builtin::Register_BATCH_MATMUL_GENERIC_OPTIMIZED()},
+});
+
+class BatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(BatchMatMulOpTest, Float32Test_Simple) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 2, 3}},
                                   {TensorType_FLOAT32, {1, 3, 4}});
   model.PopulateTensor(model.lhs(), {1, 2, 3, 4, 5, 6});
@@ -65,7 +89,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Simple) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_SimpleRHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_SimpleRHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 2, 3}},
                                   {TensorType_FLOAT32, {1, 4, 3}}, false, true);
   model.PopulateTensor(model.lhs(), {1, 2, 3, 4, 5, 6});
@@ -77,7 +101,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_SimpleRHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_SimpleLHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_SimpleLHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 3, 2}},
                                   {TensorType_FLOAT32, {1, 3, 4}}, true, false);
   model.PopulateTensor(model.lhs(), {1, 4, 2, 5, 3, 6});
@@ -89,7 +113,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_SimpleLHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BatchSizeTwo) {
+TEST_P(BatchMatMulOpTest, Float32Test_BatchSizeTwo) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 2, 3}},
                                   {TensorType_FLOAT32, {2, 3, 4}});
   model.PopulateTensor(model.lhs(),
@@ -105,7 +129,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_BatchSizeTwo) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 2, 3}},
                                   {TensorType_FLOAT32, {3, 4}});
   model.PopulateTensor(model.lhs(),
@@ -121,7 +145,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BroadcastLHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_BroadcastLHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 3, 2}},
                                   {TensorType_FLOAT32, {3, 4}}, true, false);
   model.PopulateTensor(model.lhs(),
@@ -137,7 +161,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_BroadcastLHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 3, 2}},
                                   {TensorType_FLOAT32, {3, 2, 4}});
   model.PopulateTensor(model.lhs(),
@@ -161,7 +185,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2LHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2LHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 2, 3}},
                                   {TensorType_FLOAT32, {3, 2, 4}}, true, false);
   model.PopulateTensor(model.lhs(),
@@ -185,7 +209,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2LHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2RHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2RHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 3, 2}},
                                   {TensorType_FLOAT32, {3, 4, 2}}, false, true);
   model.PopulateTensor(model.lhs(),
@@ -208,7 +232,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2RHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2BothAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2BothAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 2, 3}},
                                   {TensorType_FLOAT32, {3, 4, 2}}, true, true);
   model.PopulateTensor(model.lhs(),
@@ -231,7 +255,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2BothAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BroadcastFromRHS) {
+TEST_P(BatchMatMulOpTest, Float32Test_BroadcastFromRHS) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {4, 5}},
                                   {TensorType_FLOAT32, {3, 1, 5, 2}});
   model.PopulateTensor(
@@ -251,6 +275,10 @@ TEST(BatchMatMulOpModelTest, Float32Test_BroadcastFromRHS) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 1, 4, 2}));
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    BatchMatMulOpTest, BatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 // In the hybrid model the weights are quantized int8. But the input
 // and output are expected to be in float precision.
 class HybridAsymmetricBatchMatMulOpModel : public SingleOpModel {
@@ -304,7 +332,14 @@ class HybridAsymmetricBatchMatMulOpModel : public SingleOpModel {
   int input_size_;
 };
 
-TEST(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
+class HybridAsymmetricBatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 10}},
@@ -335,7 +370,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 2, 10}},
@@ -366,7 +401,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 3}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/9, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 2, 10}},
@@ -401,7 +436,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 9}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 10}},
@@ -431,5 +466,96 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 3}));
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    HybridAsymmetricBatchMatMulOpTest, HybridAsymmetricBatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+class QuantizedBatchMatMulOpModel : public SingleOpModel {
+ public:
+  QuantizedBatchMatMulOpModel(int units, int batches, const TensorData& lhs,
+                              const TensorData& output = {TensorType_INT8},
+                              bool adj_x = false, bool adj_y = false)
+      : units_(units), batches_(batches) {
+    int total_input_size = 1;
+    for (size_t i = 0; i < lhs.shape.size(); ++i) {
+      total_input_size *= lhs.shape[i];
+    }
+    input_size_ = total_input_size / batches_;
+
+    lhs_id_ = AddInput(lhs);
+    rhs_id_ = AddInput({lhs.type, {input_size_, units_}, lhs.min, lhs.max});
+
+    output_id_ = AddOutput(output);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_MATMUL,
+                 BuiltinOptions_BatchMatMulOptions,
+                 CreateBatchMatMulOptions(builder_, adj_x, adj_y).Union());
+    BuildInterpreter({GetShape(lhs_id_), GetShape(rhs_id_)});
+  }
+
+  template 
+  void SetWeights(const std::vector& data) {
+    QuantizeAndPopulate(rhs_id_, data);
+  }
+
+  template 
+  void SetInput(const std::vector& data) {
+    QuantizeAndPopulate(lhs_id_, data);
+  }
+
+  template 
+  std::vector GetOutput() {
+    return ExtractVector(output_id_);
+  }
+
+  template 
+  std::vector GetDequantizedOutput() {
+    return Dequantize(ExtractVector(output_id_), GetScale(output_id_),
+                         GetZeroPoint(output_id_));
+  }
+
+ protected:
+  int lhs_id_;
+  int rhs_id_;
+  int output_id_;
+  int units_;
+  int batches_;
+  int input_size_;
+};
+
+class QuantizedBatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt8) {
+  QuantizedBatchMatMulOpModel m(
+      /*units=*/3, /*batches*/ 2,
+      /*lhs=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128});
+
+  m.SetWeights({
+      1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5,  5,  5,
+      6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10,
+  });
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({23, 23, 23, 57, 57, 57})));
+  EXPECT_THAT(m.GetOutput(), ElementsAre(22, 22, 22, 56, 56, 56));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    QuantizedBatchMatMulOpTest, QuantizedBatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/optimized/batch_matmul.h b/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
index 24b5012304f..5e622154d60 100644
--- a/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
+++ b/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
@@ -272,6 +272,112 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   }
 }
 
+inline void BatchMatMul(const FullyConnectedParams& params,
+                        const RuntimeShape& lhs_shape, const int8_t* lhs_data,
+                        const RuntimeShape& rhs_shape, const int8_t* rhs_data,
+                        const RuntimeShape& output_shape, int8_t* output_data,
+                        CpuBackendContext* context) {
+  using ::tflite::cpu_backend_gemm::Gemm;
+  using ::tflite::cpu_backend_gemm::GemmParams;
+  using ::tflite::cpu_backend_gemm::MatrixParams;
+
+  const RuntimeShape extended_lhs_shape =
+      RuntimeShape::ExtendedShape(5, lhs_shape);
+  const RuntimeShape extended_rhs_shape =
+      RuntimeShape::ExtendedShape(5, rhs_shape);
+
+  // Determine which dimension is the broadcast dimension.
+  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
+    if (lhs_dim == rhs_dim) return lhs_dim;
+    if (lhs_dim == 1) return rhs_dim;
+    TFLITE_DCHECK_EQ(rhs_dim, 1);
+    return lhs_dim;
+  };
+
+  // Compute the "extent" for iterating on this dimension.
+  // If we are broadcasting, then don't advance (i.e. return 0).
+  auto extent = [](const RuntimeShape& shape, int x) {
+    if (shape.Dims(x) == 1) {
+      return 0;
+    }
+    int prod = 1;
+    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
+      prod *= shape.Dims(i);
+    }
+    return prod;
+  };
+
+  const int batch_dim0 =
+      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 =
+      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 =
+      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+
+  const int lhs_ext0 = extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+
+  // Set params for each matrix multiply.
+  const int lhs_rows = extended_lhs_shape.Dims(3);
+  const int rhs_cols = extended_rhs_shape.Dims(4);
+  const int accum_depth = extended_lhs_shape.Dims(4);
+
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  MatrixParams lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.rows = lhs_rows;
+  lhs_params.cols = accum_depth;
+  lhs_params.zero_point = -filter_offset;
+
+  MatrixParams rhs_params;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.rows = accum_depth;
+  rhs_params.cols = rhs_cols;
+  rhs_params.zero_point = -input_offset;
+
+  MatrixParams dst_params;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.rows = lhs_rows;
+  dst_params.cols = rhs_cols;
+  dst_params.zero_point = output_offset;
+
+  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+    const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
+    const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
+    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+      const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
+      const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
+      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+        const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
+        const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
+        int8_t* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
+                                         b1 * batch_dim2 + b2) *
+                                            lhs_rows * rhs_cols;
+
+        GemmParams gemm_params;
+        gemm_params.clamp_min = output_activation_min;
+        gemm_params.clamp_max = output_activation_max;
+        gemm_params.multiplier_fixedpoint = output_multiplier;
+        gemm_params.multiplier_exponent = output_shift;
+        cpu_backend_gemm::Gemm(lhs_params, lhs_ptr2, rhs_params, rhs_ptr2,
+                               dst_params, out_ptr, gemm_params, context);
+      }
+    }
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
index 1394bd9da64..05caefaca5d 100644
--- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h
+++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
@@ -217,6 +217,99 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   }
 }
 
+inline void BatchMatMul(const FullyConnectedParams& params,
+                        const RuntimeShape& lhs_shape, const int8_t* lhs_data,
+                        const RuntimeShape& rhs_shape, const int8_t* rhs_data,
+                        const RuntimeShape& output_shape, int8_t* output_data) {
+  const RuntimeShape extended_lhs_shape =
+      RuntimeShape::ExtendedShape(5, lhs_shape);
+  const RuntimeShape extended_rhs_shape =
+      RuntimeShape::ExtendedShape(5, rhs_shape);
+
+  // Determine which dimension is the broadcast dimension.
+  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
+    if (lhs_dim == rhs_dim) return lhs_dim;
+    if (lhs_dim == 1) return rhs_dim;
+    TFLITE_DCHECK_EQ(rhs_dim, 1);
+    return lhs_dim;
+  };
+
+  // Compute the "extent" for iterating on this dimension.
+  // If we are broadcasting, then don't advance (i.e. return 0).
+  auto extent = [](const RuntimeShape& shape, int x) {
+    if (shape.Dims(x) == 1) {
+      return 0;
+    }
+    int prod = 1;
+    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
+      prod *= shape.Dims(i);
+    }
+    return prod;
+  };
+
+  const int batch_dim0 =
+      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 =
+      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 =
+      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+
+  const int lhs_ext0 = extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+
+  // Set params for each matrix multiply.
+  const int lhs_rows = extended_lhs_shape.Dims(3);
+  const int rhs_cols = extended_rhs_shape.Dims(4);
+  const int accum_depth = extended_lhs_shape.Dims(4);
+
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+    const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
+    const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
+    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+      const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
+      const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
+      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+        const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
+        const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
+        int8_t* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
+                                         b1 * batch_dim2 + b2) *
+                                            lhs_rows * rhs_cols;
+
+        for (int j = 0; j < rhs_cols; ++j) {
+          for (int i = 0; i < lhs_rows; ++i) {
+            int32_t total = 0;
+            for (int k = 0; k < accum_depth; ++k) {
+              int32 lhs_val = lhs_ptr2[accum_depth * i + k];
+              int32 rhs_val = rhs_ptr2[accum_depth * j + k];
+              total += (lhs_val + filter_offset) * (rhs_val + input_offset);
+            }
+            total = MultiplyByQuantizedMultiplier(total, output_multiplier,
+                                                  output_shift);
+            total += output_offset;
+            total = std::max(total, output_activation_min);
+            total = std::min(total, output_activation_max);
+            const int idx = lhs_rows * j + i;
+            out_ptr[idx] = static_cast(total);
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 90688a2aa1f..c3a4aaad16d 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -289,7 +289,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SCATTER_ND, Register_SCATTER_ND());
   AddBuiltin(BuiltinOperator_DENSIFY, Register_DENSIFY());
   AddBuiltin(BuiltinOperator_SEGMENT_SUM, Register_SEGMENT_SUM());
-  AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL());
+  AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL(),
+             /* min_version = */ 1,
+             /* max_version = */ 2);
   AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY());
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc
index 8a0cbca29e2..f2cb98ef31a 100644
--- a/tensorflow/lite/tools/optimize/operator_property.cc
+++ b/tensorflow/lite/tools/optimize/operator_property.cc
@@ -88,6 +88,12 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index,
       property.restrict_same_input_output_scale = true;
       property.version = 2;
       break;
+    case BuiltinOperator_BATCH_MATMUL: {
+      property.inputs = {{0, {}}, {1, {}}};
+      property.outputs = {{0, {}}};
+      property.version = 2;
+      break;
+    }
     case BuiltinOperator_BATCH_TO_SPACE_ND:
     case BuiltinOperator_SPACE_TO_BATCH_ND:
     case BuiltinOperator_SPACE_TO_DEPTH:
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index 118e2d420f8..a97b9da47f1 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace {
@@ -518,6 +519,7 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
     case BuiltinOperator_LESS:
     case BuiltinOperator_LESS_EQUAL:
     case BuiltinOperator_SELECT:
+    case BuiltinOperator_BATCH_MATMUL:
       if (op_sig.input_types.at(0) == TensorType_INT8) {
         return 2;
       }
diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc
index 92a7001606f..36976354685 100644
--- a/tensorflow/lite/tools/versioning/runtime_version.cc
+++ b/tensorflow/lite/tools/versioning/runtime_version.cc
@@ -58,6 +58,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
               {{BuiltinOperator_AVERAGE_POOL_2D, 2}, "1.14.0"},
               {{BuiltinOperator_AVERAGE_POOL_2D, 3}, kPendingReleaseVersion},
               {{BuiltinOperator_BATCH_MATMUL, 1}, kPendingReleaseVersion},
+              {{BuiltinOperator_BATCH_MATMUL, 2}, kPendingReleaseVersion},
               {{BuiltinOperator_CONV_2D, 1}, "1.5.0"},
               {{BuiltinOperator_CONV_2D, 2}, "1.14.0"},
               {{BuiltinOperator_CONV_2D, 3}, "1.14.0"},

From 07c54454eec55c1279c243a3c148eeee81b41ed5 Mon Sep 17 00:00:00 2001
From: Tamara Norman 
Date: Fri, 19 Jun 2020 08:48:51 -0700
Subject: [PATCH 0616/1390] Add an option such that the cached host_value can
 be discarded

PiperOrigin-RevId: 317315157
Change-Id: I9d7145390a526003069321c7e04794e139a53c09
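
A minimal usage sketch, illustrative only: it assumes a valid PjRtBuffer*
named `buffer` that already holds device data. Passing
discard_cached_copy=true fetches the literal as before but drops the buffer's
cached host copy afterwards, so the host-side memory is not retained:

    // Blocks until the value is ready; after this call the buffer no longer
    // keeps a reference to the cached host-side literal.
    auto literal_or_status = buffer->ToLiteral(/*discard_cached_copy=*/true);

The default (false) keeps the existing behavior of caching the prefetched
value for later calls.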
---
 tensorflow/compiler/xla/pjrt/pjrt_client.cc | 6 +++++-
 tensorflow/compiler/xla/pjrt/pjrt_client.h  | 8 ++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc
index 46f592100c9..b4f0363e69a 100644
--- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc
+++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc
@@ -1077,13 +1077,17 @@ Status PjRtBuffer::CopyToHostAsync() {
   return Status::OK();
 }
 
-StatusOr> PjRtBuffer::ToLiteral() {
+StatusOr> PjRtBuffer::ToLiteral(
+    const bool discard_cached_copy) {
   tensorflow::profiler::TraceMe traceme("PjRtBuffer::ToLiteral");
   TF_RETURN_IF_ERROR(CopyToHostAsync());
   std::shared_ptr host_value;
   {
     absl::MutexLock lock(&mu_);
     host_value = host_value_;
+    if (discard_cached_copy) {
+      host_value_ = nullptr;
+    }
   }
   if (host_value == nullptr) {
     return InvalidArgument("ToLiteral called on invalid buffer");
diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h
index 754eb19bec6..8f74e6244d6 100644
--- a/tensorflow/compiler/xla/pjrt/pjrt_client.h
+++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h
@@ -478,8 +478,12 @@ class PjRtBuffer {
 
   // Returns the buffer's value as an XLA Literal. If the value has previously
   // been prefetched to the host, then returns the prefetched version, otherwise
-  // copies the buffer to the host. Blocks until the value is ready.
-  StatusOr> ToLiteral();
+  // copies the buffer to the host. Blocks until the value is ready. If
+  // `discard_cached_copy` is true, the buffer will not keep hold of a cached
+  // copy of the literal (i.e. the reference to the host value will be
+  // dropped).
+  StatusOr> ToLiteral(
+      bool discard_cached_copy = false);
 
   // Initiates a copy of the buffer to the host. Does not block waiting for
   // the transfer to complete. The value can be retrieved by a later call to

From 16cb89bd7b40fc816aa7440f62b443ca480bcf6d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 09:15:04 -0700
Subject: [PATCH 0617/1390] Qualify uses of std::string

PiperOrigin-RevId: 317319501
Change-Id: Ib75a31ad89fa1a6bda81450f2ab5ba07d7338ada
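
Illustrative sketch only; the motivation given here is an inference, not part
of this change: spelling the type as std::string keeps each file valid without
relying on an unqualified `string` alias being pulled in through other
headers. The helper name below is hypothetical.

    #include <string>

    // Compiles with no project-wide `using`/alias for `string` in scope.
    std::string MakeFlexCustomCode(const std::string& tensorflow_op) {
      return "Flex" + tensorflow_op;  // e.g. "FlexAddV2"
    }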
---
 tensorflow/lite/toco/tflite/export.cc         | 58 +++++++++----------
 tensorflow/lite/toco/tflite/export.h          | 12 ++--
 tensorflow/lite/toco/tflite/export_test.cc    | 56 ++++++++++--------
 tensorflow/lite/toco/tflite/import.cc         | 18 +++---
 tensorflow/lite/toco/tflite/import.h          |  6 +-
 tensorflow/lite/toco/tflite/import_test.cc    |  6 +-
 tensorflow/lite/toco/tflite/op_version.cc     |  8 +--
 .../lite/toco/tflite/op_version_test.cc       | 32 +++++-----
 tensorflow/lite/toco/tflite/operator.cc       | 37 ++++++------
 tensorflow/lite/toco/tflite/operator.h        | 12 ++--
 tensorflow/lite/toco/tflite/operator_test.cc  |  8 +--
 tensorflow/lite/toco/tflite/types.cc          |  6 +-
 12 files changed, 133 insertions(+), 126 deletions(-)

diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index d72a902001d..d109ab875b5 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -52,7 +52,7 @@ using ::tflite::Tensor;
 namespace {
 
 // Check if a TensorFlow Op is a control flow op by its name.
-bool IsControlFlowOp(const string& tensorflow_op) {
+bool IsControlFlowOp(const std::string& tensorflow_op) {
   // Technically this is equivalent to `::tensorflow::Node::IsControlFlow()`.
   // It requires to construct a `::tensorflow::Graph` to use that helper
   // function, so we simply hardcode the list of control flow ops here.
@@ -68,7 +68,7 @@ bool IsControlFlowOp(const string& tensorflow_op) {
 }
 
 // Check if a TensorFlow Op is unsupported by the Flex runtime.
-bool IsUnsupportedFlexOp(const string& tensorflow_op) {
+bool IsUnsupportedFlexOp(const std::string& tensorflow_op) {
   if (IsControlFlowOp(tensorflow_op)) {
     return true;
   }
@@ -82,14 +82,14 @@ bool IsUnsupportedFlexOp(const string& tensorflow_op) {
 }
 
 // Map from operator name to TF Lite enum value, for all builtins.
-const std::map& GetBuiltinOpsMap() {
-  static std::map* builtin_ops = nullptr;
+const std::map& GetBuiltinOpsMap() {
+  static std::map* builtin_ops = nullptr;
   if (builtin_ops == nullptr) {
-    builtin_ops = new std::map();
+    builtin_ops = new std::map();
 
     for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
       BuiltinOperator op = static_cast(i);
-      string name = EnumNameBuiltinOperator(op);
+      std::string name = EnumNameBuiltinOperator(op);
       if (op != BuiltinOperator_CUSTOM && !name.empty()) {
         (*builtin_ops)[name] = op;
       }
@@ -99,10 +99,10 @@ const std::map& GetBuiltinOpsMap() {
 }
 
 void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
-                        string* file_contents) {
+                        std::string* file_contents) {
   const uint8_t* buffer = builder.GetBufferPointer();
   int size = builder.GetSize();
-  *file_contents = string(reinterpret_cast(buffer), size);
+  *file_contents = std::string(reinterpret_cast(buffer), size);
 }
 
 }  // Anonymous namespace.
@@ -115,7 +115,7 @@ OperatorKey::OperatorKey(
     bool enable_select_tf_ops) {
   // Get the op name (by Toco definition).
   const ::toco::Operator& op = *op_signature.op;
-  string name = HelpfulOperatorTypeName(op);
+  std::string name = HelpfulOperatorTypeName(op);
 
   bool is_builtin = false;
   const auto& builtin_ops = GetBuiltinOpsMap();
@@ -146,7 +146,7 @@ OperatorKey::OperatorKey(
       is_flex_op_ = true;
       flex_tensorflow_op_ = tensorflow_op;
       custom_code_ =
-          string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+          std::string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
     } else {
       custom_code_ = tensorflow_op;
     }
@@ -158,7 +158,7 @@ OperatorKey::OperatorKey(
     is_flex_op_ = true;
     flex_tensorflow_op_ = name;
     custom_code_ =
-        string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+        std::string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
   } else {
     // If Flex is disabled or the original TensorFlow NodeDef isn't available,
     // we produce a custom op. This gives developers a chance to implement
@@ -175,7 +175,7 @@ OperatorKey::OperatorKey(
 
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   // First find a list of unique array names.
-  std::set names;
+  std::set names;
   for (const auto& array_pair : model.GetArrayMap()) {
     names.insert(array_pair.first);
   }
@@ -218,7 +218,7 @@ Offset>> ExportTensors(
   std::map> ordered_tensors;
 
   for (const auto& array_pair : model.GetArrayMap()) {
-    const string& tensor_name = array_pair.first;
+    const std::string& tensor_name = array_pair.first;
     const toco::Array& array = *array_pair.second;
 
     int buffer_index = buffers_to_write->size();
@@ -283,7 +283,7 @@ Offset> ExportOutputTensors(
     const Model& model, const details::TensorsMap& tensors_map,
     FlatBufferBuilder* builder) {
   std::vector outputs;
-  for (const string& output : model.flags.output_arrays()) {
+  for (const std::string& output : model.flags.output_arrays()) {
     outputs.push_back(tensors_map.at(output));
   }
   return builder->CreateVector(outputs);
@@ -295,10 +295,10 @@ Offset>> ExportOperatorCodes(
     const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
     const ExportParams& params) {
   // Map from operator name to TF Lite enum value, for all builtins.
-  std::map builtin_ops;
+  std::map builtin_ops;
   for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
     BuiltinOperator op = static_cast(i);
-    string name = EnumNameBuiltinOperator(op);
+    std::string name = EnumNameBuiltinOperator(op);
     if (op != BuiltinOperator_CUSTOM && !name.empty()) {
       builtin_ops[name] = op;
     }
@@ -349,13 +349,13 @@ Offset>> ExportOperators(
   std::vector> op_vector;
   for (const auto& op : model.operators) {
     std::vector inputs;
-    for (const string& input : op->inputs) {
+    for (const std::string& input : op->inputs) {
       // -1 is the ID for optional tensor in TFLite output
       int id = model.IsOptionalArray(input) ? -1 : tensors_map.at(input);
       inputs.push_back(id);
     }
     std::vector outputs;
-    for (const string& output : op->outputs) {
+    for (const std::string& output : op->outputs) {
       outputs.push_back(tensors_map.at(output));
     }
     const toco::OperatorSignature op_signature = {op.get(), &model};
@@ -428,15 +428,15 @@ Offset>> ExportBuffers(
   return builder->CreateVector(buffer_vector);
 }
 
-tensorflow::Status Export(const Model& model, string* output_file_contents,
+tensorflow::Status Export(const Model& model, std::string* output_file_contents,
                           const ExportParams& params) {
   const auto ops_by_type = BuildOperatorByTypeMap(params.enable_select_tf_ops);
   return Export(model, output_file_contents, params, ops_by_type);
 }
 
-void ParseControlFlowErrors(std::set* custom_ops,
-                            std::vector* error_msgs) {
-  std::set unsupported_control_flow_ops;
+void ParseControlFlowErrors(std::set* custom_ops,
+                            std::vector* error_msgs) {
+  std::set unsupported_control_flow_ops;
   // Check if unsupported ops contains control flow ops. It's impossible
   // to implement these ops as custom ops at the moment.
   for (const auto& op : *custom_ops) {
@@ -471,10 +471,10 @@ void ExportModelVersionBuffer(
 }
 
 tensorflow::Status Export(
-    const Model& model, string* output_file_contents,
+    const Model& model, std::string* output_file_contents,
     const ExportParams& params,
     const std::map>& ops_by_type) {
-  for (const string& input_array : model.GetInvalidInputArrays()) {
+  for (const std::string& input_array : model.GetInvalidInputArrays()) {
     if (model.HasArray(input_array)) {
       return tensorflow::errors::InvalidArgument(
           absl::StrCat("Placeholder ", input_array,
@@ -509,11 +509,11 @@ tensorflow::Status Export(
   }
 
   // The set of used builtin ops.
-  std::set builtin_ops;
+  std::set builtin_ops;
   // The set of custom ops (not including Flex ops).
-  std::set custom_ops;
+  std::set custom_ops;
   // The set of Flex ops which are not supported.
-  std::set unsupported_flex_ops;
+  std::set unsupported_flex_ops;
 
   for (const auto& it : operators_map) {
     const details::OperatorKey& key = it.first;
@@ -540,7 +540,7 @@ tensorflow::Status Export(
                "40-tflite-op-request.md\n and pasting the following:\n\n";
       };
 
-      std::vector error_msgs;
+      std::vector error_msgs;
       ParseControlFlowErrors(&custom_ops, &error_msgs);
 
       // Remove ExpandDims and ReorderAxes from unimplemented list unless they
@@ -549,7 +549,7 @@ tensorflow::Status Export(
       // transformation is unable to run because the output shape is not
       // defined. This causes unnecessary confusion during model conversion
       // time.
-      std::set custom_ops_final;
+      std::set custom_ops_final;
       for (const auto& op_type : custom_ops) {
         if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
           custom_ops_final.insert(op_type);
diff --git a/tensorflow/lite/toco/tflite/export.h b/tensorflow/lite/toco/tflite/export.h
index 3af77ffcf43..64f7c7b128f 100644
--- a/tensorflow/lite/toco/tflite/export.h
+++ b/tensorflow/lite/toco/tflite/export.h
@@ -35,19 +35,19 @@ struct ExportParams {
 
 // Transform the given tf.mini model into a TF Lite flatbuffer and deposit the
 // result in the given string.
-tensorflow::Status Export(const Model& model, string* output_file_contents,
+tensorflow::Status Export(const Model& model, std::string* output_file_contents,
                           const ExportParams& params);
 
 // Export API with custom TFLite operator mapping.
 tensorflow::Status Export(
-    const Model& model, string* output_file_contents,
+    const Model& model, std::string* output_file_contents,
     const ExportParams& params,
     const std::map>& ops_by_type);
 
 // This is for backward-compatibility.
 // TODO(ycling): Remove the deprecated entry functions.
 inline void Export(const Model& model, bool allow_custom_ops,
-                   bool quantize_weights, string* output_file_contents) {
+                   bool quantize_weights, std::string* output_file_contents) {
   ExportParams params;
   params.allow_custom_ops = allow_custom_ops;
   params.quantize_weights =
@@ -60,7 +60,7 @@ inline void Export(const Model& model, bool allow_custom_ops,
 // TODO(ycling): Remove the deprecated entry functions.
 inline void Export(
     const Model& model, bool allow_custom_ops, bool quantize_weights,
-    string* output_file_contents,
+    std::string* output_file_contents,
     const std::map>& ops_by_type) {
   ExportParams params;
   params.allow_custom_ops = allow_custom_ops;
@@ -72,7 +72,7 @@ inline void Export(
 
 // This is for backward-compatibility.
 // TODO(ycling): Remove the deprecated entry functions.
-inline void Export(const Model& model, string* output_file_contents) {
+inline void Export(const Model& model, std::string* output_file_contents) {
   ExportParams params;
   params.allow_custom_ops = true;
   auto status = Export(model, output_file_contents, params);
@@ -82,7 +82,7 @@ inline void Export(const Model& model, string* output_file_contents) {
 namespace details {
 
 // A map from tensor name to its final position in the TF Lite buffer.
-using TensorsMap = std::unordered_map<string, int>;
+using TensorsMap = std::unordered_map<std::string, int>;
 
 // A key to identify an operator.
 // Only when `type` is `kUnsupported`, `custom_code` is filled to
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index 19b77543c66..ed347a28d51 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -34,13 +34,13 @@ using ::testing::HasSubstr;
 class ExportTest : public ::testing::Test {
  protected:
   void ResetOperators() { input_model_.operators.clear(); }
-  void AddTensorsByName(std::initializer_list<string> names) {
-    for (const string& name : names) {
+  void AddTensorsByName(std::initializer_list<std::string> names) {
+    for (const std::string& name : names) {
       input_model_.GetOrCreateArray(name);
     }
   }
-  void AddOperatorsByName(std::initializer_list<string> names) {
-    for (const string& name : names) {
+  void AddOperatorsByName(std::initializer_list<std::string> names) {
+    for (const std::string& name : names) {
       if (name == "Conv") {
         auto* op = new ConvOperator;
         op->padding.type = PaddingType::kSame;
@@ -153,14 +153,15 @@ class ExportTest : public ::testing::Test {
   }
 
   tensorflow::Status ExportAndReturnStatus(const ExportParams& params) {
-    string result;
+    std::string result;
     return Export(input_model_, &result, params);
   }
 
-  std::vector<string> ExportAndSummarizeOperators(const ExportParams& params) {
-    std::vector<string> names;
+  std::vector<std::string> ExportAndSummarizeOperators(
+      const ExportParams& params) {
+    std::vector<std::string> names;
 
-    string result;
+    std::string result;
     auto status = Export(input_model_, &result, params);
     if (!status.ok()) {
       LOG(INFO) << status.error_message();
@@ -171,10 +172,12 @@ class ExportTest : public ::testing::Test {
 
     for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
       if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
-        names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
-                                                 opcode->builtin_code()));
+        names.push_back(
+            std::string("builtin:") +
+            ::tflite::EnumNameBuiltinOperator(opcode->builtin_code()));
       } else {
-        names.push_back(string("custom:") + opcode->custom_code()->c_str());
+        names.push_back(std::string("custom:") +
+                        opcode->custom_code()->c_str());
       }
     }
 
@@ -185,7 +188,7 @@ class ExportTest : public ::testing::Test {
       const ExportParams& params) {
     std::vector indices;
 
-    string result;
+    std::string result;
     if (!Export(input_model_, &result, params).ok()) return indices;
     auto* model = ::tflite::GetModel(result.data());
 
@@ -257,7 +260,7 @@ TEST_F(ExportTest, ExportMinRuntime) {
   params.enable_select_tf_ops = false;
   params.quantize_weights = QuantizedBufferType::NONE;
 
-  string output;
+  std::string output;
   auto status = Export(input_model_, &output, params);
   auto* model = ::tflite::GetModel(output.data());
   EXPECT_EQ(model->metadata()->size(), 1);
@@ -265,7 +268,8 @@ TEST_F(ExportTest, ExportMinRuntime) {
   auto buf = model->metadata()->Get(0)->buffer();
   auto* buffer = (*model->buffers())[buf];
   auto* array = buffer->data();
-  string version(reinterpret_cast<const char*>(array->data()), array->size());
+  std::string version(reinterpret_cast<const char*>(array->data()),
+                      array->size());
   EXPECT_EQ(version, "1.6.0");
 }
 
@@ -275,7 +279,7 @@ TEST_F(ExportTest, ExportEmptyMinRuntime) {
   ExportParams params;
   params.allow_custom_ops = true;
 
-  string output;
+  std::string output;
   auto status = Export(input_model_, &output, params);
   auto* model = ::tflite::GetModel(output.data());
   EXPECT_EQ(model->metadata()->size(), 1);
@@ -283,7 +287,8 @@ TEST_F(ExportTest, ExportEmptyMinRuntime) {
   auto buf = model->metadata()->Get(0)->buffer();
   auto* buffer = (*model->buffers())[buf];
   auto* array = buffer->data();
-  string version(reinterpret_cast<const char*>(array->data()), array->size());
+  std::string version(reinterpret_cast<const char*>(array->data()),
+                      array->size());
   EXPECT_EQ(version, "");
 }
 
@@ -296,7 +301,7 @@ TEST_F(ExportTest, UnsupportedControlFlowErrors) {
   // The model contains control flow ops which are not convertible, so we should
   // check the returned error message.
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(status.error_message(),
@@ -318,7 +323,7 @@ TEST_F(ExportTest, UnsupportedOpsAndNeedEnableFlex) {
   params.allow_custom_ops = false;
   params.enable_select_tf_ops = false;
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -348,7 +353,7 @@ TEST_F(ExportTest, UnsupportedOpsNeedCustomImplementation) {
   params.allow_custom_ops = false;
   params.enable_select_tf_ops = true;
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -378,7 +383,7 @@ TEST_F(ExportTest, UnsupportedControlFlowAndCustomOpsErrors) {
   // The model contains control flow ops which are not convertible, so we should
   // check the returned error message.
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -407,11 +412,11 @@ TEST_F(ExportTest, UnsupportedControlFlowAndCustomOpsErrors) {
 TEST_F(ExportTest, QuantizeWeights) {
   // Sanity check for quantize_weights parameter.
   BuildQuantizableTestModel();
-  string unquantized_result;
+  std::string unquantized_result;
   Export(input_model_, true, /*quantize_weights*/ false, &unquantized_result);
 
   BuildQuantizableTestModel();
-  string quantized_result;
+  std::string quantized_result;
   Export(input_model_, true, /*quantize_weights*/ true, &quantized_result);
 
   // The quantized models should be smaller.
@@ -443,12 +448,13 @@ class OpSetsTest : public ExportTest {
     }
   }
 
-  std::vector<string> ImportExport(std::initializer_list<string> op_names) {
+  std::vector<std::string> ImportExport(
+      std::initializer_list<std::string> op_names) {
     ResetOperators();
     if (!import_all_ops_as_unsupported_) {
       AddOperatorsByName(op_names);
     } else {
-      for (const string& name : op_names) {
+      for (const std::string& name : op_names) {
         auto* op = new TensorFlowUnsupportedOperator;
         op->tensorflow_op = name;
         input_model_.operators.emplace_back(op);
@@ -644,7 +650,7 @@ TEST_F(VersionedOpExportTest, Export) {
   AddConvOp(false);
   AddConvOp(true);
 
-  string result;
+  std::string result;
   const auto ops_by_type = BuildFakeOperatorByTypeMap();
   Export(input_model_, true, false, &result, ops_by_type);
 
diff --git a/tensorflow/lite/toco/tflite/import.cc b/tensorflow/lite/toco/tflite/import.cc
index 0f3dd48652e..136aa4ffaa8 100644
--- a/tensorflow/lite/toco/tflite/import.cc
+++ b/tensorflow/lite/toco/tflite/import.cc
@@ -99,7 +99,7 @@ void ImportTensors(const ::tflite::Model& input_model, Model* model) {
 
 void ImportOperators(
     const ::tflite::Model& input_model,
-    const std::map<string, std::unique_ptr<BaseOperator>>& ops_by_name,
+    const std::map<std::string, std::unique_ptr<BaseOperator>>& ops_by_name,
     const details::TensorsTable& tensors_table,
     const details::OperatorsTable& operators_table, Model* model) {
   // TODO(aselle): add support for multiple subgraphs.
@@ -112,12 +112,12 @@ void ImportOperators(
       LOG(FATAL) << "Index " << index << " must be between zero and "
                  << operators_table.size();
     }
-    string opname = operators_table.at(index);
+    std::string opname = operators_table.at(index);
 
     // Find and use the appropriate operator deserialization factory.
     std::unique_ptr<Operator> new_op = nullptr;
     if (ops_by_name.count(opname) == 0) {
-      string effective_opname = "TENSORFLOW_UNSUPPORTED";
+      std::string effective_opname = "TENSORFLOW_UNSUPPORTED";
       if (ops_by_name.count(effective_opname) == 0) {
         LOG(FATAL) << "Internal logic error: TENSORFLOW_UNSUPPORTED not found.";
       }
@@ -147,10 +147,10 @@ void ImportOperators(
       auto input_index = inputs->Get(i);
       // input_index == -1 indicates optional tensor.
       if (input_index != -1) {
-        const string& input_name = tensors_table.at(input_index);
+        const std::string& input_name = tensors_table.at(input_index);
         op->inputs.push_back(input_name);
       } else {
-        const string& tensor_name =
+        const std::string& tensor_name =
             toco::AvailableArrayName(*model, "OptionalTensor");
         model->CreateOptionalArray(tensor_name);
         op->inputs.push_back(tensor_name);
@@ -159,7 +159,7 @@ void ImportOperators(
     auto outputs = input_op->outputs();
     for (int i = 0; i < outputs->Length(); i++) {
       auto output_index = outputs->Get(i);
-      const string& output_name = tensors_table.at(output_index);
+      const std::string& output_name = tensors_table.at(output_index);
       op->outputs.push_back(output_name);
     }
   }
@@ -173,7 +173,7 @@ void ImportIOTensors(const ModelFlags& model_flags,
     auto inputs = (*input_model.subgraphs())[0]->inputs();
     if (inputs) {
       for (int input : *inputs) {
-        const string& input_name = tensors_table.at(input);
+        const std::string& input_name = tensors_table.at(input);
         model->flags.add_input_arrays()->set_name(input_name);
       }
     }
@@ -184,7 +184,7 @@ void ImportIOTensors(const ModelFlags& model_flags,
     auto outputs = (*input_model.subgraphs())[0]->outputs();
     if (outputs) {
       for (int output : *outputs) {
-        const string& output_name = tensors_table.at(output);
+        const std::string& output_name = tensors_table.at(output);
         model->flags.add_output_arrays(output_name);
       }
     }
@@ -199,7 +199,7 @@ bool Verify(const void* buf, size_t len) {
 }  // namespace
 
 std::unique_ptr<Model> Import(const ModelFlags& model_flags,
-                              const string& input_file_contents) {
+                              const std::string& input_file_contents) {
   ::tflite::AlwaysTrueResolver r;
   if (!::tflite::Verify(input_file_contents.data(), input_file_contents.size(),
                         r, ::tflite::DefaultErrorReporter())) {
diff --git a/tensorflow/lite/toco/tflite/import.h b/tensorflow/lite/toco/tflite/import.h
index f5de3b53b5b..bac55aae8b9 100644
--- a/tensorflow/lite/toco/tflite/import.h
+++ b/tensorflow/lite/toco/tflite/import.h
@@ -24,17 +24,17 @@ namespace tflite {
 
 // Parse the given string as TF Lite flatbuffer and return a new tf.mini model.
 std::unique_ptr<Model> Import(const ModelFlags &model_flags,
-                              const string &input_file_contents);
+                              const std::string &input_file_contents);
 
 namespace details {
 
 // The names of all tensors found in a TF Lite model.
-using TensorsTable = std::vector<string>;
+using TensorsTable = std::vector<std::string>;
 
 // The names of all operators found in TF Lite model. If the operator is
 // builtin, the string representation of the corresponding enum value is used
 // as name.
-using OperatorsTable = std::vector<string>;
+using OperatorsTable = std::vector<std::string>;
 
 void LoadTensorsTable(const ::tflite::Model &input_model,
                       TensorsTable *tensors_table);
diff --git a/tensorflow/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
index b00c4124d83..6163ebab45b 100644
--- a/tensorflow/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -134,9 +134,9 @@ class ImportTest : public ::testing::Test {
 
     input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
   }
-  string InputModelAsString() {
-    return string(reinterpret_cast<const char*>(builder_.GetBufferPointer()),
-                  builder_.GetSize());
+  std::string InputModelAsString() {
+    return std::string(reinterpret_cast<const char*>(builder_.GetBufferPointer()),
+                       builder_.GetSize());
   }
   flatbuffers::FlatBufferBuilder builder_;
   const ::tflite::Model* input_model_ = nullptr;
diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc
index cf127a9f459..efa53c69cae 100644
--- a/tensorflow/lite/toco/tflite/op_version.cc
+++ b/tensorflow/lite/toco/tflite/op_version.cc
@@ -29,7 +29,7 @@ namespace tflite {
 
 // Deprecated and please register new ops/versions in
 // tflite/tools/versioning/op_version.cc".
-string GetMinimumRuntimeVersionForModel(const Model& model) {
+std::string GetMinimumRuntimeVersionForModel(const Model& model) {
   // Use this as the placeholder string if a particular op is not yet included
   // in any Tensorflow's RC/Final release source package. Once that op is
   // included in the release, please update this with the real version string.
@@ -37,8 +37,8 @@ string GetMinimumRuntimeVersionForModel(const Model& model) {
   // A map from the version key of an op to its minimum runtime version.
   // For example, {{kAveragePool, 1}, "1.5.0"},  means the 1st version of
   // AveragePool requires a minimum TF Lite runtime version '1.5.0`.
-  static const std::map<std::pair<OperatorType, int>, string>* op_version_map =
-      new std::map<std::pair<OperatorType, int>, string>({
+  static const std::map<std::pair<OperatorType, int>, std::string>*
+      op_version_map = new std::map<std::pair<OperatorType, int>, std::string>({
           {{OperatorType::kAveragePool, 1}, "1.5.0"},
           {{OperatorType::kAveragePool, 2}, "1.14.0"},
           {{OperatorType::kAveragePool, 3}, kPendingReleaseOpVersion},
@@ -253,7 +253,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) {
       tflite::BuildOperatorByTypeMap(false /*enable_select_tf_ops=*/);
   OperatorSignature op_signature;
   op_signature.model = &model;
-  string model_min_version;
+  std::string model_min_version;
   for (const auto& op : model.operators) {
     if (op_types_map.find(op->type) == op_types_map.end()) continue;
     op_signature.op = op.get();
diff --git a/tensorflow/lite/toco/tflite/op_version_test.cc b/tensorflow/lite/toco/tflite/op_version_test.cc
index 14b086471b7..8466fc35ad7 100644
--- a/tensorflow/lite/toco/tflite/op_version_test.cc
+++ b/tensorflow/lite/toco/tflite/op_version_test.cc
@@ -27,9 +27,9 @@ TEST(OpVersionTest, MinimumVersionForSameOpVersions) {
   Model model;
   // Float convolutional kernel is introduced since '1.5.0'.
   std::unique_ptr<ConvOperator> conv(new ConvOperator());
-  const string conv_input = "conv_input";
-  const string conv_filter = "conv_filter";
-  const string conv_output = "conv_output";
+  const std::string conv_input = "conv_input";
+  const std::string conv_filter = "conv_filter";
+  const std::string conv_output = "conv_output";
   conv->inputs.push_back(conv_input);
   conv->inputs.push_back(conv_filter);
   conv->outputs.push_back(conv_output);
@@ -44,8 +44,8 @@ TEST(OpVersionTest, MinimumVersionForSameOpVersions) {
 
   // Float softmax kernel is introduced since '1.5.0'.
   std::unique_ptr<SoftmaxOperator> softmax(new SoftmaxOperator());
-  const string softmax_input = "softmax_input";
-  const string softmax_output = "softmax_output";
+  const std::string softmax_input = "softmax_input";
+  const std::string softmax_output = "softmax_output";
   softmax->inputs.push_back(softmax_input);
   softmax->outputs.push_back(softmax_output);
   array_map[softmax_input] = std::unique_ptr<Array>(new Array);
@@ -60,9 +60,9 @@ TEST(OpVersionTest, MinimumVersionForMultipleOpVersions) {
   Model model;
   // Dilated DepthWiseConvolution is introduced since '1.12.0'.
   std::unique_ptr<DepthwiseConvOperator> conv(new DepthwiseConvOperator());
-  const string conv_input = "conv_input";
-  const string conv_filter = "conv_filter";
-  const string conv_output = "conv_output";
+  const std::string conv_input = "conv_input";
+  const std::string conv_filter = "conv_filter";
+  const std::string conv_output = "conv_output";
   conv->inputs.push_back(conv_input);
   conv->inputs.push_back(conv_filter);
   conv->outputs.push_back(conv_output);
@@ -77,10 +77,10 @@ TEST(OpVersionTest, MinimumVersionForMultipleOpVersions) {
   // FullyConnected op with kShuffled4x16Int8 weight format is introduced from
   // '1.10.0'.
   std::unique_ptr<FullyConnectedOperator> fc(new FullyConnectedOperator());
-  const string fc_input = "fc_input";
-  const string fc_weights = "fc_weights";
-  const string fc_bias = "fc_bias";
-  const string fc_output = "fc_output";
+  const std::string fc_input = "fc_input";
+  const std::string fc_weights = "fc_weights";
+  const std::string fc_bias = "fc_bias";
+  const std::string fc_output = "fc_output";
   fc->inputs.push_back(fc_input);
   fc->inputs.push_back(fc_weights);
   fc->inputs.push_back(fc_bias);
@@ -121,10 +121,10 @@ TEST(OpVersionTest, MinimumVersionForMixedOpVersions) {
   // FullyConnected op with kShuffled4x16Int8 weight format is introduced from
   // '1.10.0'.
   std::unique_ptr<FullyConnectedOperator> fc(new FullyConnectedOperator());
-  const string fc_input = "fc_input";
-  const string fc_weights = "fc_weights";
-  const string fc_bias = "fc_bias";
-  const string fc_output = "fc_output";
+  const std::string fc_input = "fc_input";
+  const std::string fc_weights = "fc_weights";
+  const std::string fc_bias = "fc_bias";
+  const std::string fc_output = "fc_output";
   fc->inputs.push_back(fc_input);
   fc->inputs.push_back(fc_weights);
   fc->inputs.push_back(fc_bias);
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index fee10a19787..be539cf6054 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -238,7 +238,7 @@ class SpaceToBatchND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input_name = op_signature.op->inputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     ::tflite::OpSignature op_sig =
         GetVersioningOpSig(builtin_op(), op_signature);
@@ -268,8 +268,8 @@ class Sub : public BuiltinOperator
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input1_name = op_signature.op->inputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     ::tflite::OpSignature op_sig =
@@ -305,8 +305,8 @@ class Div : public BuiltinOperator
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input1_name = op_signature.op->inputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     ::tflite::OpSignature op_sig =
@@ -339,7 +339,7 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input_name = op_signature.op->inputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     ::tflite::OpSignature op_sig =
         GetVersioningOpSig(builtin_op(), op_signature);
@@ -662,9 +662,9 @@ class Mul : public BuiltinOperator
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input1_name = op_signature.op->inputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
-    const string& output_name = op_signature.op->outputs[0];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
+    const std::string& output_name = op_signature.op->outputs[0];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     const Array& output_array = op_signature.model->GetArray(output_name);
@@ -1440,7 +1440,7 @@ class Unpack : public BuiltinOperator
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input_name = op_signature.op->inputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     // If the op take int8/uint8 input, it is version 2.
     if (input_array.data_type == ArrayDataType::kInt8 ||
@@ -1577,7 +1577,7 @@ class Where : public BuiltinOperator
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
-    const string& tensorflow_node_def) {
+    const std::string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
 
   ::tensorflow::NodeDef node_def;
@@ -1597,7 +1597,7 @@ std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
 
 class TensorFlowUnsupported : public BaseOperator {
  public:
-  TensorFlowUnsupported(const string& name, OperatorType type,
+  TensorFlowUnsupported(const std::string& name, OperatorType type,
                         bool enable_select_tf_ops)
       : BaseOperator(name, type), enable_select_tf_ops_(enable_select_tf_ops) {}
 
@@ -1676,7 +1676,7 @@ class TensorFlowUnsupported : public BaseOperator {
         case tensorflow::AttrValue::kList:
           if (attr.list().s_size() > 0) {
             auto start = fbb->StartVector(key);
-            for (const string& v : attr.list().s()) {
+            for (const std::string& v : attr.list().s()) {
               fbb->Add(v);
             }
             fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
@@ -1736,10 +1736,11 @@ class TensorFlowUnsupported : public BaseOperator {
           break;
         case flexbuffers::FBT_BOOL:
           (*attr)[key].set_b(value.AsBool());
-          if (string(key) == "_output_quantized") {
+          if (std::string(key) == "_output_quantized") {
             op->quantized = value.AsBool();
           }
-          if (string(key) == "_support_output_type_float_in_quantized_op") {
+          if (std::string(key) ==
+              "_support_output_type_float_in_quantized_op") {
             op->support_output_type_float_in_quantized_op = value.AsBool();
           }
           break;
@@ -2095,9 +2096,9 @@ std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
   return result;
 }
 
-std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
+std::map<std::string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
     bool enable_select_tf_ops) {
-  std::map<string, std::unique_ptr<BaseOperator>> result;
+  std::map<std::string, std::unique_ptr<BaseOperator>> result;
 
   std::vector<std::unique_ptr<BaseOperator>> ops =
       BuildOperatorList(enable_select_tf_ops);
@@ -2109,7 +2110,7 @@ std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
 }
 
 bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
-                          const string& tensorflow_op_name) {
+                          const std::string& tensorflow_op_name) {
   // If Flex ops aren't allow at all, simply return false.
   if (!enable_select_tf_ops) {
     return false;
diff --git a/tensorflow/lite/toco/tflite/operator.h b/tensorflow/lite/toco/tflite/operator.h
index 19d92145e0c..fb79b97f46e 100644
--- a/tensorflow/lite/toco/tflite/operator.h
+++ b/tensorflow/lite/toco/tflite/operator.h
@@ -30,7 +30,7 @@ class BaseOperator;
 // Return a map contained all know TF Lite Operators, keyed by their names.
 // TODO(ycling): The pattern to propagate parameters (e.g. enable_select_tf_ops)
 // is ugly here. Consider refactoring.
-std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
+std::map<std::string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
     bool enable_select_tf_ops = false);
 
 // Return a map contained all know TF Lite Operators, keyed by the type of
@@ -41,7 +41,7 @@ std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
 // Write the custom option FlexBuffer with a serialized TensorFlow NodeDef
 // for a Flex op.
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
-    const string& tensorflow_node_def);
+    const std::string& tensorflow_node_def);
 
 // These are the flatbuffer types for custom and builtin options.
 using CustomOptions = flatbuffers::Vector<uint8_t>;
@@ -71,11 +71,11 @@ struct Options {
 class BaseOperator {
  public:
   // Build an operator with the given TF Lite name and tf.mini type.
-  BaseOperator(const string& name, OperatorType type)
+  BaseOperator(const std::string& name, OperatorType type)
       : name_(name), type_(type) {}
   virtual ~BaseOperator() = default;
 
-  string name() const { return name_; }
+  std::string name() const { return name_; }
   OperatorType type() const { return type_; }
 
   // Given a tf.mini operator, create the corresponding flatbuffer options and
@@ -111,7 +111,7 @@ class BaseOperator {
   }
 
  private:
-  string name_;
+  std::string name_;
   OperatorType type_;
 };
 
@@ -123,7 +123,7 @@ class BaseOperator {
 // Helper function to determine if a unsupported TensorFlow op should be
 // exported as an Flex op or a regular custom op.
 bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
-                          const string& tensorflow_op_name);
+                          const std::string& tensorflow_op_name);
 
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index a4fe01e4afd..cb466fef079 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -30,8 +30,8 @@ namespace {
 class OperatorTest : public ::testing::Test {
  protected:
   // Return the operator for the given name and type.
-  const BaseOperator& GetOperator(const string& name, OperatorType type) {
-    using OpsByName = std::map<string, std::unique_ptr<BaseOperator>>;
+  const BaseOperator& GetOperator(const std::string& name, OperatorType type) {
+    using OpsByName = std::map<std::string, std::unique_ptr<BaseOperator>>;
     using OpsByType = std::map<OperatorType, std::unique_ptr<BaseOperator>>;
 
     static auto* by_name = new OpsByName(BuildOperatorByNameMap());
@@ -86,7 +86,7 @@ class OperatorTest : public ::testing::Test {
   // Verify serialization and deserialization of simple operators (those
   // that don't have any configuration parameters).
   template <typename T>
-  void CheckSimpleOperator(const string& name, OperatorType type) {
+  void CheckSimpleOperator(const std::string& name, OperatorType type) {
     Options options;
     auto output_toco_op =
         SerializeAndDeserialize(GetOperator(name, type), T(), &options);
@@ -99,7 +99,7 @@ class OperatorTest : public ::testing::Test {
   }
 
   template <typename T>
-  void CheckReducerOperator(const string& name, OperatorType type) {
+  void CheckReducerOperator(const std::string& name, OperatorType type) {
     T op;
 
     op.keep_dims = false;
diff --git a/tensorflow/lite/toco/tflite/types.cc b/tensorflow/lite/toco/tflite/types.cc
index 96cad557baf..9d4ab8434d1 100644
--- a/tensorflow/lite/toco/tflite/types.cc
+++ b/tensorflow/lite/toco/tflite/types.cc
@@ -25,7 +25,7 @@ DataBuffer::FlatBufferOffset CopyStringToBuffer(
     const Array& array, flatbuffers::FlatBufferBuilder* builder) {
   const auto& src_data = array.GetBuffer<ArrayDataType::kString>().data;
   ::tflite::DynamicBuffer dyn_buffer;
-  for (const string& str : src_data) {
+  for (const std::string& str : src_data) {
     dyn_buffer.AddString(str.c_str(), str.length());
   }
   char* tensor_buffer;
@@ -58,12 +58,12 @@ DataBuffer::FlatBufferOffset CopyBuffer(
 
 void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) {
   auto* src_data = reinterpret_cast<const char*>(buffer.data()->data());
-  std::vector<string>* dst_data =
+  std::vector<std::string>* dst_data =
       &array->GetMutableBuffer<ArrayDataType::kString>().data;
   int32_t num_strings = ::tflite::GetStringCount(src_data);
   for (int i = 0; i < num_strings; i++) {
     ::tflite::StringRef str_ref = ::tflite::GetString(src_data, i);
-    string this_str(str_ref.str, str_ref.len);
+    std::string this_str(str_ref.str, str_ref.len);
     dst_data->push_back(this_str);
   }
 }

From 642ad434d8315561ef9cc02cc9157436fe9c0f72 Mon Sep 17 00:00:00 2001
From: Jingyue Wu 
Date: Fri, 19 Jun 2020 09:50:56 -0700
Subject: [PATCH 0618/1390] Fix exports_files.

cl/317237033 replaced cwise_op_neg.cc with cwise_op_neg_1.cc and
cwise_op_neg_2.cc.

PiperOrigin-RevId: 317325461
Change-Id: Ib44ff36474b7e55d9e84ff737ea82b9dac46b9f9
---
 tensorflow/core/kernels/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0b7a092033b..e2ff5aed283 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -8802,7 +8802,8 @@ exports_files([
     "cwise_op_mod.cc",
     "cwise_op_mul_1.cc",
     "cwise_op_mul_2.cc",
-    "cwise_op_neg.cc",
+    "cwise_op_neg_1.cc",
+    "cwise_op_neg_2.cc",
     "cwise_op_not_equal_to_1.cc",
     "cwise_op_not_equal_to_2.cc",
     "cwise_op_round.cc",

From c3cc3c40b08a37535a281e1d7a5fd7d3d802aac6 Mon Sep 17 00:00:00 2001
From: Mihai Maruseac 
Date: Fri, 19 Jun 2020 09:55:49 -0700
Subject: [PATCH 0619/1390] Move fuzzers for TF ops to own subdir. Trim some
 dependencies.

This duplicates some of the BUILD dependency tree to go around the need to link huge bottleneck dependencies (such as `//tensorflow/core:framework`). Until TF can use `cc_shared_library` in a stable way (and all support in Bazel exists), we will need to use the duplicated tree for fuzzing.

PiperOrigin-RevId: 317326319
Change-Id: I1493e3ae7340298971fe15bd3702b63657f9bf9f
---
 tensorflow/core/framework/BUILD               |   1 +
 tensorflow/security/fuzzing/BUILD             |  14 --
 tensorflow/security/fuzzing/op_fuzzing/BUILD  |  39 +++++
 .../fuzzing/op_fuzzing/fuzz_session.h         | 156 ++++++++++++++++++
 .../fuzzing/{ => op_fuzzing}/identity_fuzz.cc |   2 +-
 5 files changed, 197 insertions(+), 15 deletions(-)
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/BUILD
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
 rename tensorflow/security/fuzzing/{ => op_fuzzing}/identity_fuzz.cc (95%)

diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index 52f15dcb5c2..d47c74a629d 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -719,6 +719,7 @@ tf_cuda_library(
     visibility = [
         "//tensorflow/core:__pkg__",
         "//tensorflow/core/util:__pkg__",
+        "//tensorflow/security/fuzzing:__subpackages__",
     ],
     deps = [
         ":allocation_description_proto_cc",
diff --git a/tensorflow/security/fuzzing/BUILD b/tensorflow/security/fuzzing/BUILD
index 9b5aeec2d36..6b6c8275275 100644
--- a/tensorflow/security/fuzzing/BUILD
+++ b/tensorflow/security/fuzzing/BUILD
@@ -18,17 +18,3 @@ tf_fuzz_target(
         "//tensorflow/core/platform:status",
     ],
 )
-
-# A trivial fuzzer with no pre-specified corpus.
-# TODO(mihaimaruseac): Move fuzz_session and the op fuzzers to a subdirectory
-tf_fuzz_target(
-    name = "identity_fuzz",
-    srcs = ["identity_fuzz.cc"],
-    deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/core/kernels/fuzzing:fuzz_session",
-        # Needed only to transitiviely link dependencies
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu",
-    ],
-)
diff --git a/tensorflow/security/fuzzing/op_fuzzing/BUILD b/tensorflow/security/fuzzing/op_fuzzing/BUILD
new file mode 100644
index 00000000000..aacd2f16cc4
--- /dev/null
+++ b/tensorflow/security/fuzzing/op_fuzzing/BUILD
@@ -0,0 +1,39 @@
+# Fuzzing TensorFlow ops.
+# Most ops have a similar set of dependencies and a similar fuzzing
+# infrastructure. Hence, we gather everything in one single place.
+# Note that these fuzzers cover a large part of TF, they are not granular.
+
+load(
+    "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
+    "tf_fuzz_target",
+)
+
+package(
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Since all ops need to have a graph created before being fuzzed, we define
+# this header-only library to handle the needed plumbing.
+cc_library(
+    name = "fuzz_session",
+    hdrs = ["fuzz_session.h"],
+    deps = [
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core/common_runtime:direct_session_internal",
+        "//tensorflow/core/framework:tensor",
+        "//tensorflow/core/platform:status",
+    ],
+)
+
+# A trivial fuzzer with no pre-specified corpus.
+tf_fuzz_target(
+    name = "identity_fuzz",
+    srcs = ["identity_fuzz.cc"],
+    deps = [
+        ":fuzz_session",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core/kernels:array",
+    ],
+)
diff --git a/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h b/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
new file mode 100644
index 00000000000..575212b3b86
--- /dev/null
+++ b/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
@@ -0,0 +1,156 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
+#define TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
+
+#include 
+#include 
+#include 
+#include 
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/session_options.h"
+
+// Standard invoking function macro to dispatch to a fuzzer class.
+#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass)                              \
+  extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { \
+    static FuzzerClass* fuzzer = new FuzzerClass();                         \
+    return fuzzer->Fuzz(data, size);                                        \
+  }
+
+// Standard builder for hooking one placeholder to one op.
+#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                          \
+  void BuildGraph(const Scope& scope) override {                        \
+    auto op_node =                                                      \
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), dtype); \
+    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node); \
+  }
+
+namespace tensorflow {
+namespace fuzzing {
+
+// Create a TensorFlow session using a specific GraphDef created
+// by BuildGraph(), and make it available for fuzzing.
+// Users must override BuildGraph and FuzzImpl to specify
+// (1) which operations are being fuzzed; and
+// (2) How to translate the uint8_t* buffer from the fuzzer
+//     to a Tensor or Tensors that are semantically appropriate
+//     for the op under test.
+// For the simple cases of testing a single op that takes a single
+// input Tensor, use the SINGLE_INPUT_OP_BUILDER(dtype, opName) macro in place
+// of defining BuildGraphDef.
+//
+// Typical use:
+// class FooFuzzer : public FuzzSession {
+//   SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
+//   void FuzzImpl(const uint8_t* data, size_t size) {
+//      ... convert data and size to a Tensor, pass it to:
+//      RunInputs({{"input", input_tensor}});
+//
+class FuzzSession {
+ public:
+  FuzzSession() : initialized_(false) {}
+  virtual ~FuzzSession() {}
+
+  // Constructs a Graph using the supplied Scope.
+  // By convention, the graph should have inputs named "input1", ...
+  // "inputN", and one output node, named "output".
+  // Users of FuzzSession should override this method to create their graph.
+  virtual void BuildGraph(const Scope& scope) = 0;
+
+  // Implements the logic that converts an opaque byte buffer
+  // from the fuzzer to Tensor inputs to the graph.  Users must override.
+  virtual void FuzzImpl(const uint8_t* data, size_t size) = 0;
+
+  // Initializes the FuzzSession.  Not safe for multithreading.
+  // Separate init function because the call to virtual BuildGraphDef
+  // can't be put into the constructor.
+  Status InitIfNeeded() {
+    if (initialized_) {
+      return Status::OK();
+    }
+    initialized_ = true;
+
+    Scope root = Scope::DisabledShapeInferenceScope().ExitOnError();
+    SessionOptions options;
+    session_ = std::unique_ptr<Session>(NewSession(options));
+
+    BuildGraph(root);
+
+    GraphDef graph_def;
+    TF_CHECK_OK(root.ToGraphDef(&graph_def));
+
+    Status status = session_->Create(graph_def);
+    if (!status.ok()) {
+      // This is FATAL, because this code is designed to fuzz an op
+      // within a session.  Failure to create the session means we
+      // can't send any data to the op.
+      LOG(FATAL) << "Could not create session: " << status.error_message();
+    }
+    return status;
+  }
+
+  // Runs the TF session by pulling on the "output" node, attaching
+  // the supplied input_tensor to the input node(s), and discarding
+  // any returned output.
+  // Note: We are ignoring Status from Run here since fuzzers don't need to
+  // check it (as that will slow them down and printing/logging is useless).
+  void RunInputs(const std::vector<std::pair<string, Tensor> >& inputs) {
+    RunInputsWithStatus(inputs).IgnoreError();
+  }
+
+  // Same as RunInputs but don't ignore status
+  Status RunInputsWithStatus(
+      const std::vector<std::pair<string, Tensor> >& inputs) {
+    return session_->Run(inputs, {}, {"output"}, nullptr);
+  }
+
+  // Dispatches to FuzzImpl;  small amount of sugar to keep the code
+  // of the per-op fuzzers tiny.
+  int Fuzz(const uint8_t* data, size_t size) {
+    Status status = InitIfNeeded();
+    TF_CHECK_OK(status) << "Fuzzer graph initialization failed: "
+                        << status.error_message();
+    // No return value from fuzzing:  Success is defined as "did not
+    // crash".  The actual application results are irrelevant.
+    FuzzImpl(data, size);
+    return 0;
+  }
+
+ private:
+  bool initialized_;
+  std::unique_ptr<Session> session_;
+};
+
+// A specialized fuzz implementation for ops that take
+// a single string.  Caller must still define the op
+// to plumb by overriding BuildGraph or using
+// a plumbing macro.
+class FuzzStringInputOp : public FuzzSession {
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    input_tensor.scalar()() =
+        string(reinterpret_cast<const char*>(data), size);
+    RunInputs({{"input", input_tensor}});
+  }
+};
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
diff --git a/tensorflow/security/fuzzing/identity_fuzz.cc b/tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
similarity index 95%
rename from tensorflow/security/fuzzing/identity_fuzz.cc
rename to tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
index 4c1049d381b..a63c35b45e2 100644
--- a/tensorflow/security/fuzzing/identity_fuzz.cc
+++ b/tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {

From 8e305b9cec35ccf9821c2fd2a82194e328d98704 Mon Sep 17 00:00:00 2001
From: Denisa Roberts 
Date: Wed, 6 May 2020 11:29:37 -0400
Subject: [PATCH 0620/1390] Add Qr Grad for wide matrices

---
 tensorflow/python/kernel_tests/qr_op_test.py |  5 +--
 tensorflow/python/ops/linalg_grad.py         | 47 +++++++++++++++-----
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index d5337c183a6..0c291dbd940 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -278,14 +278,13 @@ if __name__ == "__main__":
                                     use_static_shape))
 
   # TODO(pfau): Get working with complex types.
-  # TODO(pfau): Get working with full_matrices when rows != cols
-  # TODO(pfau): Get working when rows < cols
+  # TODO(pfau): Get working with full_matrices when rows > cols
   # TODO(pfau): Get working with shapeholders (dynamic shapes)
   for full_matrices in False, True:
     for dtype in np.float32, np.float64:
       for rows in 1, 2, 5, 10:
         for cols in 1, 2, 5, 10:
-          if rows == cols or (not full_matrices and rows > cols):
+          if rows <= cols or (not full_matrices and rows > cols):
             for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
               shape = batch_dims + (rows, cols)
               name = "%s_%s_full_%s" % (dtype.__name__,
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 437e28e7e6b..5ec372430ba 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -493,15 +493,10 @@ def _QrGrad(op, dq, dr):
   if (r.shape.ndims is None or r.shape.as_list()[-2] is None or
       r.shape.as_list()[-1] is None):
     raise NotImplementedError("QrGrad not implemented with dynamic shapes.")
-  if r.shape.dims[-2].value != r.shape.dims[-1].value:
+  if (r.shape.dims[-2].value > r.shape.dims[-1].value and
+      q.shape.dims[-2].value == q.shape.dims[-1].value):
     raise NotImplementedError("QrGrad not implemented when ncols > nrows "
-                              "or full_matrices is true and ncols != nrows.")
-
-  qdq = math_ops.matmul(q, dq, adjoint_a=True)
-  qdq_ = qdq - _linalg.adjoint(qdq)
-  rdr = math_ops.matmul(r, dr, adjoint_b=True)
-  rdr_ = rdr - _linalg.adjoint(rdr)
-  tril = array_ops.matrix_band_part(qdq_ + rdr_, -1, 0)
+                              "and full_matrices is true.")
 
   def _TriangularSolve(x, r):
     """Equiv to matmul(x, adjoint(matrix_inverse(r))) if r is upper-tri."""
@@ -509,9 +504,39 @@ def _QrGrad(op, dq, dr):
         linalg_ops.matrix_triangular_solve(
             r, _linalg.adjoint(x), lower=False, adjoint=False))
 
-  grad_a = math_ops.matmul(q, dr + _TriangularSolve(tril, r))
-  grad_b = _TriangularSolve(dq - math_ops.matmul(q, qdq), r)
-  return grad_a + grad_b
+  def _QrGradSquareAndDeepMatrices(q, r, dq, dr):
+    """Gradient for matrix orders num_rows >= num_cols
+    and full_matrices is false.
+    """
+    qdq = math_ops.matmul(q, dq, adjoint_a=True)
+    qdq_ = qdq - _linalg.adjoint(qdq)
+    rdr = math_ops.matmul(r, dr, adjoint_b=True)
+    rdr_ = rdr - _linalg.adjoint(rdr)
+    tril = array_ops.matrix_band_part(qdq_ + rdr_, -1, 0)
+
+    grad_a = math_ops.matmul(q, dr + _TriangularSolve(tril, r))
+    grad_b = _TriangularSolve(dq - math_ops.matmul(q, qdq), r)
+    return grad_a + grad_b
+
+  num_rows, num_cols = q.shape.dims[-2].value, r.shape.dims[-1]
+
+  if num_rows >= num_cols:
+    return _QrGradSquareAndDeepMatrices(q, r, dq, dr)
+
+  # Partition a = [x, y], r = [u, v] and reduce to the square case
+  a = op.inputs[0]
+  y = a[..., :, num_rows:]
+  u = r[..., :, :num_rows]
+  dv = dr[..., :, num_rows:]
+  du = dr[..., :, :num_rows]
+  dy = math_ops.matmul(q, dv)
+  dx = _QrGradSquareAndDeepMatrices(q,
+                                    u,
+                                    dq + math_ops.matmul(y,
+                                                         dv,
+                                                         adjoint_b=True),
+                                    du)
+  return array_ops.concat([dx, dy], axis=-1)
 
 
 @ops.RegisterGradient("MatrixSolve")

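Why the wide-matrix reduction in the patch above is valid can be read off the block
structure of the reduced QR factorization. For a wide `a` (more columns than rows),
partition `a = [x, y]` and `r = [u, v]` as in the `_QrGrad` comment; then `x = q @ u`
is itself a square QR factorization, and `v = q^T y`, so the gradient through `v`
flows back as `dy = q @ dv`, while the extra `math_ops.matmul(y, dv, adjoint_b=True)`
term added to `dq` accounts for the dependence of `v = q^T y` on `q`. Below is a
minimal NumPy sketch of these two identities (illustrative only, not part of the
patch; the array names mirror the comment in `_QrGrad`):

    import numpy as np

    m, n = 3, 5                               # wide matrix: fewer rows than columns
    rng = np.random.default_rng(0)
    a = rng.standard_normal((m, n))

    q, r = np.linalg.qr(a, mode='reduced')    # q: (m, m) orthogonal, r: (m, n)
    x, y = a[:, :m], a[:, m:]                 # a = [x, y]
    u, v = r[:, :m], r[:, m:]                 # r = [u, v]

    # The left block is a square QR factorization: x = q @ u.
    assert np.allclose(x, q @ u)
    # The right block satisfies v = q^T y, which is why dy = q @ dv in the patch.
    assert np.allclose(v, q.T @ y)
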
From 390b259dfc8407484533c1cb61ca3515ed5166e8 Mon Sep 17 00:00:00 2001
From: Gabriel Rasskin 
Date: Fri, 19 Jun 2020 10:16:47 -0700
Subject: [PATCH 0621/1390] Fix unused variable issue with fuzzing methods

---
 tensorflow/security/fuzzing/status_group_fuzz.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc
index bc80cd72bc9..989e1c9d1cb 100644
--- a/tensorflow/security/fuzzing/status_group_fuzz.cc
+++ b/tensorflow/security/fuzzing/status_group_fuzz.cc
@@ -54,9 +54,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
     }
   }
 
-  sg.as_summary_status();
-  sg.as_concatenated_status();
-  sg.AttachLogMessages();
+  // Ignore warnings that these values are unused
+  sg.as_summary_status().IgnoreError();;
+  sg.as_concatenated_status().IgnoreError();;
+  sg.AttachLogMessages().IgnoreError();;
 
   return 0;
 }

From 40b9713f6459abd043248c55b1b06ebf60712961 Mon Sep 17 00:00:00 2001
From: Gabriel Rasskin 
Date: Fri, 19 Jun 2020 10:17:22 -0700
Subject: [PATCH 0622/1390] Doubling syntax

---
 tensorflow/security/fuzzing/status_group_fuzz.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc
index 989e1c9d1cb..002785734bb 100644
--- a/tensorflow/security/fuzzing/status_group_fuzz.cc
+++ b/tensorflow/security/fuzzing/status_group_fuzz.cc
@@ -55,9 +55,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   }
 
   // Ignore warnings that these values are unused
-  sg.as_summary_status().IgnoreError();;
-  sg.as_concatenated_status().IgnoreError();;
-  sg.AttachLogMessages().IgnoreError();;
+  sg.as_summary_status().IgnoreError();
+  sg.as_concatenated_status().IgnoreError();
+  sg.AttachLogMessages().IgnoreError();
 
   return 0;
 }

From 4d751f9da4914d6e1fc7aafe2f5f27e0f96830d6 Mon Sep 17 00:00:00 2001
From: Katherine Wu 
Date: Fri, 19 Jun 2020 10:12:17 -0700
Subject: [PATCH 0623/1390] Roll forward of cl/316247127: Make sure compiled
 metrics are accessible after loading from H5 or SavedModel.

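The guarantee rolled forward here is that compiled metrics (and their names) survive a
save/load round trip. A minimal usage sketch of that behavior (illustrative only, not
part of the change; assumes TF 2.x with eager execution, the h5py package for the H5
path, and a writable /tmp directory):

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
    model.compile(optimizer='rmsprop', loss='mse',
                  metrics=[tf.keras.metrics.MeanAbsoluteError(name='mae')])
    model.fit(np.ones((2, 3)), np.ones((2, 1)), verbose=0)

    model.save('/tmp/model.h5')               # or a SavedModel directory
    loaded = tf.keras.models.load_model('/tmp/model.h5')

    # With this change the compiled metrics are rebuilt on load, so the metric
    # names match the original model's.
    assert [m.name for m in model.metrics] == [m.name for m in loaded.metrics]
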
PiperOrigin-RevId: 317329395
Change-Id: I33578515f36aa0ba227e75bda52966d493a4bebb
---
 .../python/keras/engine/compile_utils.py      |  14 +--
 tensorflow/python/keras/engine/functional.py  |  10 +-
 tensorflow/python/keras/engine/training.py    |   7 +-
 tensorflow/python/keras/metrics.py            |   5 +-
 tensorflow/python/keras/saving/hdf5_format.py |   1 +
 .../python/keras/saving/hdf5_format_test.py   | 114 +++++++-----------
 .../python/keras/saving/saved_model/load.py   |   1 +
 .../python/keras/saving/saving_utils.py       |  14 +++
 8 files changed, 79 insertions(+), 87 deletions(-)

diff --git a/tensorflow/python/keras/engine/compile_utils.py b/tensorflow/python/keras/engine/compile_utils.py
index 3858639f024..ba7ce624090 100644
--- a/tensorflow/python/keras/engine/compile_utils.py
+++ b/tensorflow/python/keras/engine/compile_utils.py
@@ -37,7 +37,7 @@ class Container(object):
   def __init__(self, output_names=None):
     self._output_names = output_names
 
-  def _build(self, y_pred):
+  def build(self, y_pred):
     if self._output_names is None:
       # In Subclass API, output names like 'output_1' are used for
       # `Metric` names.
@@ -131,9 +131,9 @@ class LossesContainer(Container):
     ]
     return [self._loss_metric] + per_output_metrics
 
-  def _build(self, y_pred):
+  def build(self, y_pred):
     """One-time setup of loss objects."""
-    super(LossesContainer, self)._build(y_pred)
+    super(LossesContainer, self).build(y_pred)
 
     self._losses = self._maybe_broadcast_to_outputs(y_pred, self._losses)
     self._losses = self._conform_to_outputs(y_pred, self._losses)
@@ -184,7 +184,7 @@ class LossesContainer(Container):
     sample_weight = self._conform_to_outputs(y_pred, sample_weight)
 
     if not self._built:
-      self._build(y_pred)
+      self.build(y_pred)
 
     y_pred = nest.flatten(y_pred)
     y_true = nest.flatten(y_true)
@@ -295,9 +295,9 @@ class MetricsContainer(Container):
       return []
     return self._metrics_in_order
 
-  def _build(self, y_pred, y_true):
+  def build(self, y_pred, y_true):
     """One-time setup of metric objects."""
-    super(MetricsContainer, self)._build(y_pred)
+    super(MetricsContainer, self).build(y_pred)
 
     self._metrics = self._maybe_broadcast_to_outputs(y_pred, self._metrics)
     self._metrics = self._conform_to_outputs(y_pred, self._metrics)
@@ -385,7 +385,7 @@ class MetricsContainer(Container):
     sample_weight = self._conform_to_outputs(y_pred, sample_weight)
 
     if not self._built:
-      self._build(y_pred, y_true)
+      self.build(y_pred, y_true)
 
     y_pred = nest.flatten(y_pred)
     y_true = nest.flatten(y_true) if y_true is not None else []
diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py
index 0ef4840b651..0612d70044d 100644
--- a/tensorflow/python/keras/engine/functional.py
+++ b/tensorflow/python/keras/engine/functional.py
@@ -1007,10 +1007,12 @@ def _map_subgraph_network(inputs, outputs):
 
 def _should_skip_first_node(layer):
   """Returns True if the first layer node should not be saved or loaded."""
-  # Networks start with a pre-existing node linking their input to output.
-  # For a sequential model, it is first created with _is_graph_network = False,
-  # we have to keep the _is_graph_network check here.
-  return isinstance(layer, Functional) and layer._is_graph_network
+  # Networks that are constructed with an Input layer/shape start with a
+  # pre-existing node linking their input to output. This node is excluded from
+  # the network config.
+  return (isinstance(layer, Functional) and
+          # Filter out Sequential models without an input shape.
+          isinstance(layer._layers[0], input_layer_module.InputLayer))
 
 
 def _deserialize_keras_tensors(kwargs, layer_map):
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index ccd184a8bc4..a0ebec4f95e 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -436,7 +436,6 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector):
                            'Instead, in order to instantiate and build your '
                            'model, `call` your model on real tensor data (of '
                            'the correct dtype).')
-
     super(Model, self).build(input_shape)
 
   def call(self, inputs, training=None, mask=None):
@@ -2417,6 +2416,12 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector):
 
     self._saved_model_inputs_spec = specs
 
+    # Store the input shapes
+    if (self.__class__.__name__ == 'Sequential' and
+        self._build_input_shape is None):
+      self._build_input_shape = nest.map_structure(
+          lambda x: None if x is None else x.shape, specs)
+
   def _assert_weights_created(self):
     """Asserts that all the weights for the model have been created.
 
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index a67755b9333..7f40423595b 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -630,10 +630,9 @@ class MeanMetricWrapper(Mean):
   def from_config(cls, config):
     # Note that while MeanMetricWrapper itself isn't public, objects of this
     # class may be created and added to the model by calling model.compile.
+    fn = config.pop('fn', None)
     if cls is MeanMetricWrapper:
-      fn = get(config.pop('fn'))
-      return cls(fn, **config)
-
+      return cls(get(fn), **config)
     return super(MeanMetricWrapper, cls).from_config(config)
 
 
diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py
index 800d609fe99..01a5e12e4c6 100644
--- a/tensorflow/python/keras/saving/hdf5_format.py
+++ b/tensorflow/python/keras/saving/hdf5_format.py
@@ -192,6 +192,7 @@ def load_model_from_hdf5(filepath, custom_objects=None, compile=True):  # pylint
       # Compile model.
       model.compile(**saving_utils.compile_args_from_training_config(
           training_config, custom_objects))
+      saving_utils.try_build_compiled_arguments(model)
 
       # Set optimizer weights.
       if 'optimizer_weights' in f:
diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py
index 757385a25ea..b079bf8cac8 100644
--- a/tensorflow/python/keras/saving/hdf5_format_test.py
+++ b/tensorflow/python/keras/saving/hdf5_format_test.py
@@ -26,10 +26,12 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import combinations
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import optimizers
@@ -368,48 +370,54 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
 
 @keras_parameterized.run_with_all_saved_model_formats
-class TestWholeModelSaving(test.TestCase, parameterized.TestCase):
+class TestWholeModelSaving(keras_parameterized.TestCase):
 
   def _save_model_dir(self, dirname='saved_model'):
     temp_dir = self.get_temp_dir()
     self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
     return os.path.join(temp_dir, dirname)
 
-  def _assert_same_weights(self, model, loaded_model,
-                           original_optimizer_has_iterations_variable=True):
-    """Checks that the loaded weighs are the same as the original weights.
+  def _assert_same_weights_and_metrics(self, model, loaded_model):
+    """Checks that the loaded weights and metrics are the same as the original.
 
     Args:
       model: original model
       loaded_model: loaded model
-      original_optimizer_has_iterations_variable: If the original optimizer
-        uses an iterations variable. The loaded model will have a v2
-        optimizer, which always contains an iterations variable. So when
-        comparing the weights, the first variable in the loaded optimizer
-        weights may need to be ignored.
     """
     self.assertAllClose(model.weights, loaded_model.weights)
+
     if loaded_model.optimizer:
       if testing_utils.get_save_format() == 'tf':
         # TODO(b/153110928): Keras TF format doesn't restore optimizer weights
         # currently.
         return
-      if original_optimizer_has_iterations_variable:
-        self.assertAllClose(model.optimizer.weights,
-                            loaded_model.optimizer.weights)
-      else:
-        self.assertAllClose(model.optimizer.weights,
-                            loaded_model.optimizer.weights[1:])
+      self.assertAllClose(model.optimizer.weights,
+                          loaded_model.optimizer.weights)
 
-  def test_sequential_model_saving(self):
+    # In V1/Graph mode, the model isn't built, so the metrics are not loaded
+    # immediately (requires model to be called on some data before building
+    # metrics).
+    check_metrics = tf2.enabled() and context.executing_eagerly()
+
+    if check_metrics:
+      self.assertAllEqual([m.name for m in model.metrics],
+                          [m.name for m in loaded_model.metrics])
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_save_and_load(self):
     saved_model_dir = self._save_model_dir()
     save_format = testing_utils.get_save_format()
 
+    if save_format == 'h5' and testing_utils.get_model_type() == 'subclass':
+      return  # HDF5 format currently does not allow saving subclassed models.
+
     with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model = testing_utils.get_model_from_layers(
+          [keras.layers.Dense(2),
+           keras.layers.RepeatVector(3),
+           keras.layers.TimeDistributed(keras.layers.Dense(3))],
+          input_shape=(3,))
       model.compile(
           loss=keras.losses.MSE,
           optimizer=keras.optimizer_v2.rmsprop.RMSprop(lr=0.0001),
@@ -432,43 +440,35 @@ class TestWholeModelSaving(test.TestCase, parameterized.TestCase):
       out = model.predict(x)
       keras.models.save_model(model, saved_model_dir, save_format=save_format)
 
-      new_model = keras.models.load_model(saved_model_dir)
-      self._assert_same_weights(model, new_model)
+      loaded_model = keras.models.load_model(saved_model_dir)
+      self._assert_same_weights_and_metrics(model, loaded_model)
 
-      out2 = new_model.predict(x)
+      out2 = loaded_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
-      # test that new updates are the same with both models
-      model.train_on_batch(x, y)
-      new_model.train_on_batch(x, y)
-
       eval_out = model.evaluate(x, y)
-      eval_out2 = new_model.evaluate(x, y)
+      eval_out2 = loaded_model.evaluate(x, y)
       self.assertArrayNear(eval_out, eval_out2, 0.001)
 
-      out = model.predict(x)
-      out2 = new_model.predict(x)
-      # The model has been trained on two batches. So the tolerance is larger.
-      self.assertAllClose(out, out2, atol=0.01)
-
+  @test_util.run_in_graph_and_eager_modes
   def test_sequential_model_saving_without_input_shape(self):
     saved_model_dir = self._save_model_dir()
     save_format = testing_utils.get_save_format()
-    with ops.Graph().as_default(), self.cached_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(2))
       model.add(keras.layers.RepeatVector(3))
       model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
       model.compile(
           loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          optimizer='rmsprop',
           metrics=[
               keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.CategoricalAccuracy(name='cat_acc')
           ],
           weighted_metrics=[
               keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.CategoricalAccuracy(name='cat_acc2')
           ],
           sample_weight_mode='temporal')
       x = np.random.random((1, 3))
@@ -479,12 +479,13 @@ class TestWholeModelSaving(test.TestCase, parameterized.TestCase):
       model.save(saved_model_dir, save_format=save_format)
 
       new_model = keras.models.load_model(saved_model_dir)
-      self._assert_same_weights(
-          model, new_model, original_optimizer_has_iterations_variable=False)
+
+      self._assert_same_weights_and_metrics(model, new_model)
 
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequential_model_saving_without_compile(self):
     saved_model_dir = self._save_model_dir()
     save_format = testing_utils.get_save_format()
@@ -501,7 +502,7 @@ class TestWholeModelSaving(test.TestCase, parameterized.TestCase):
       keras.models.save_model(model, saved_model_dir, save_format=save_format)
 
       new_model = keras.models.load_model(saved_model_dir)
-      self._assert_same_weights(model, new_model)
+      self._assert_same_weights_and_metrics(model, new_model)
 
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
@@ -535,42 +536,11 @@ class TestWholeModelSaving(test.TestCase, parameterized.TestCase):
           saved_model_dir,
           custom_objects={'CustomOp': CustomOp,
                           'custom_loss': custom_loss})
-      self._assert_same_weights(model, new_model)
+      self._assert_same_weights_and_metrics(model, new_model)
 
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
-  def test_functional_model_saving(self):
-    saved_model_dir = self._save_model_dir()
-    save_format = testing_utils.get_save_format()
-    with ops.Graph().as_default(), self.cached_session():
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      output = keras.layers.Dense(3)(x)
-
-      model = keras.models.Model(inputs, output)
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(lr=0.0001),
-          metrics=[
-              keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
-          ],
-          weighted_metrics=[
-              keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
-          ])
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-
-      out = model.predict(x)
-      keras.models.save_model(model, saved_model_dir, save_format=save_format)
-      model = keras.models.load_model(saved_model_dir)
-
-      out2 = model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
-
   def test_saving_without_compilation(self):
     saved_model_dir = self._save_model_dir()
     save_format = testing_utils.get_save_format()
diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py
index a378c1b98e7..0b55e30c27b 100644
--- a/tensorflow/python/keras/saving/saved_model/load.py
+++ b/tensorflow/python/keras/saving/saved_model/load.py
@@ -129,6 +129,7 @@ def load(path, compile=True, options=None):  # pylint: disable=redefined-builtin
     if training_config is not None:
       model.compile(**saving_utils.compile_args_from_training_config(
           training_config))
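+      # Eagerly build the compiled loss/metrics when possible so they are
+      # restored on the loaded model (see try_build_compiled_arguments).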
+      saving_utils.try_build_compiled_arguments(model)
     else:
       logging.warning('No training configuration found in save file, so the '
                       'model was *not* compiled. Compile it manually.')
diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py
index 3c9c33531bf..9fdf81cae2a 100644
--- a/tensorflow/python/keras/saving/saving_utils.py
+++ b/tensorflow/python/keras/saving/saving_utils.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras import losses
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import version_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -307,3 +308,16 @@ def _enforce_names_consistency(specs):
   if name_inconsistency:
     specs = nest.map_structure(_clear_name, specs)
   return specs
+
+
+def try_build_compiled_arguments(model):
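+  """Builds the compiled loss and metrics of a loaded model, when possible."""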
+  if (not version_utils.is_v1_layer_or_model(model) and
+      model.outputs is not None):
+    try:
+      model.compiled_loss.build(model.outputs)
+      model.compiled_metrics.build(model.outputs, model.outputs)
+    except:  # pylint: disable=bare-except
+      logging.warning(
+          'Compiled the loaded model, but the compiled metrics have yet to '
+          'be built. `model.compiled_metrics` will be empty until you train '
+          'or evaluate the model.')

From f1f5ed68595a56357d92985466d7e0687b23303e Mon Sep 17 00:00:00 2001
From: Mihai Maruseac 
Date: Fri, 19 Jun 2020 17:22:51 +0000
Subject: [PATCH 0624/1390] Update
 tensorflow/security/fuzzing/status_group_fuzz.cc

---
 tensorflow/security/fuzzing/status_group_fuzz.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/security/fuzzing/status_group_fuzz.cc b/tensorflow/security/fuzzing/status_group_fuzz.cc
index 002785734bb..a560766410a 100644
--- a/tensorflow/security/fuzzing/status_group_fuzz.cc
+++ b/tensorflow/security/fuzzing/status_group_fuzz.cc
@@ -57,7 +57,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   // Ignore warnings that these values are unused
   sg.as_summary_status().IgnoreError();
   sg.as_concatenated_status().IgnoreError();
-  sg.AttachLogMessages().IgnoreError();
+  sg.AttachLogMessages();
 
   return 0;
 }

From d00691f7aaa954bfc9194f62e729f8b41de899af Mon Sep 17 00:00:00 2001
From: Scott Zhu 
Date: Fri, 19 Jun 2020 10:15:24 -0700
Subject: [PATCH 0625/1390] Fork the keras related mirrored_strategy_test to
 keras/distribute.

PiperOrigin-RevId: 317329998
Change-Id: I7dc55499cb0409129729696c286862a6b6d574aa
---
 tensorflow/python/distribute/BUILD            |  1 -
 .../distribute/mirrored_strategy_test.py      | 47 ----------
 tensorflow/python/keras/distribute/BUILD      | 23 +++++
 .../distribute/mirrored_strategy_test.py      | 89 +++++++++++++++++++
 4 files changed, 112 insertions(+), 48 deletions(-)
 create mode 100644 tensorflow/python/keras/distribute/mirrored_strategy_test.py

diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 0062705126f..38c5550be16 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -1486,7 +1486,6 @@ cuda_py_test(
         "//tensorflow/python/autograph/core:test_lib",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/keras/layers",
     ],
 )
 
diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py
index 950b6f2446b..e6414b2704a 100644
--- a/tensorflow/python/distribute/mirrored_strategy_test.py
+++ b/tensorflow/python/distribute/mirrored_strategy_test.py
@@ -22,7 +22,6 @@ import json
 import sys
 
 from absl.testing import parameterized
-import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import tf2
@@ -50,16 +49,12 @@ from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras.engine import training as keras_training
-from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import server_lib
 
 
@@ -988,22 +983,6 @@ class MockModel(object):
     return x
 
 
-class MiniModel(keras_training.Model):
-  """Minimal model for mnist.
-
-  Useful for testing and debugging on slow TPU simulators.
-  """
-
-  def __init__(self):
-    super(MiniModel, self).__init__(name="")
-    self.fc = keras_core.Dense(1, name="fc", kernel_initializer="ones",
-                               bias_initializer="ones")
-
-  def call(self, inputs, training=True):
-    inputs = array_ops.ones([1, 10])
-    return self.fc(inputs)
-
-
 @combinations.generate(
     combinations.combine(
         distribution=[
@@ -1116,32 +1095,6 @@ class MirroredStrategyDefunTest(test.TestCase):
     expected_result = values.PerReplica((5.0 * 1.25, 3.0 * 1.25))
     self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
 
-  def testTrain(self, distribution):
-    with distribution.scope():
-      mock_model = MiniModel()
-      mock_model.call = function.defun(mock_model.call)
-
-      def loss_fn(ctx):
-        del ctx
-        return mock_model(array_ops.ones([1, 10]))
-
-      gradients_fn = backprop.implicit_grad(loss_fn)
-      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
-      grads_and_vars = distribution.extended.call_for_each_replica(
-          gradients_fn, args=(None,))
-
-      optimizer = gradient_descent.GradientDescentOptimizer(0.25)
-      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access
-
-      if not context.executing_eagerly():
-        self.evaluate(variables.global_variables_initializer())
-        self.evaluate(update_ops)
-
-      updated_var_values = self.evaluate(mock_model.variables)
-      # All variables start at 1.0 and get two updates of 0.25.
-      self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
-      self.assertAllEqual([0.5], updated_var_values[1])
-
 
 @combinations.generate(
     combinations.combine(
diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD
index ddf274f299f..247e655621c 100644
--- a/tensorflow/python/keras/distribute/BUILD
+++ b/tensorflow/python/keras/distribute/BUILD
@@ -324,6 +324,29 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "mirrored_strategy_test",
+    srcs = ["mirrored_strategy_test.py"],
+    python_version = "PY3",
+    tags = [
+        "multi_and_single_gpu",
+        "no_windows_gpu",  # TODO(b/130551176)
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:training_lib",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:strategy_combinations",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras/engine",
+        "//tensorflow/python/keras/layers:core",
+    ],
+)
+
 cuda_py_test(
     name = "multi_worker_test",
     srcs = ["multi_worker_test.py"],
diff --git a/tensorflow/python/keras/distribute/mirrored_strategy_test.py b/tensorflow/python/keras/distribute/mirrored_strategy_test.py
new file mode 100644
index 00000000000..2844af8cc3a
--- /dev/null
+++ b/tensorflow/python/keras/distribute/mirrored_strategy_test.py
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.keras.engine import training as keras_training
+from tensorflow.python.keras.layers import core as keras_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import optimizer as optimizer_lib
+
+
+class MiniModel(keras_training.Model):
+  """Minimal model for mnist.
+
+  Useful for testing and debugging on slow TPU simulators.
+  """
+
+  def __init__(self):
+    super(MiniModel, self).__init__(name="")
+    self.fc = keras_core.Dense(1, name="fc", kernel_initializer="ones",
+                               bias_initializer="ones")
+
+  def call(self, inputs, training=True):
+    inputs = array_ops.ones([1, 10])
+    return self.fc(inputs)
+
+
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+        ],
+        mode=["graph", "eager"]))
+class MirroredStrategyDefunTest(test.TestCase):
+
+  def testTrain(self, distribution):
+    with distribution.scope():
+      mock_model = MiniModel()
+      mock_model.call = function.defun(mock_model.call)
+
+      def loss_fn(ctx):
+        del ctx
+        return mock_model(array_ops.ones([1, 10]))
+
+      gradients_fn = backprop.implicit_grad(loss_fn)
+      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+      grads_and_vars = distribution.extended.call_for_each_replica(
+          gradients_fn, args=(None,))
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.25)
+      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access
+
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(update_ops)
+
+      updated_var_values = self.evaluate(mock_model.variables)
+      # All variables start at 1.0 and get two updates of 0.25.
+      self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
+      self.assertAllEqual([0.5], updated_var_values[1])
+
+
+if __name__ == "__main__":
+  test.main()

From 57f9d638c00083e864aaa7e4f8114c0dd3ba479c Mon Sep 17 00:00:00 2001
From: Bixia Zheng 
Date: Fri, 19 Jun 2020 10:17:18 -0700
Subject: [PATCH 0626/1390] [TF:TRT] Add a prefix to the warning messages from
 TF-TRT.

Add LOG_WARNING_WITH_PREFIX to common/utils.h. Replace the use of LOG(WARNING)
with this new macro.
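
As a sketch of the intended usage (the message text here is illustrative,
not taken from any particular call site), a warning that was previously
written as

    LOG(WARNING) << "some TF-TRT warning";

is now written as

    LOG_WARNING_WITH_PREFIX << "some TF-TRT warning";

which, given the definition in common/utils.h, expands to
LOG(WARNING) << "TF-TRT Warning: " << "some TF-TRT warning"; so every
warning routed through the macro carries the "TF-TRT Warning: " prefix.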

PiperOrigin-RevId: 317330336
Change-Id: Ife0aa0347dd72f6eb0f8805af4d46a7d4cb099ea
---
 tensorflow/compiler/tf2tensorrt/BUILD         | 14 ++++++++
 .../compiler/tf2tensorrt/common/utils.h       | 35 +++++++++++++++++++
 .../tf2tensorrt/convert/convert_graph.cc      | 30 +++++++++-------
 .../tf2tensorrt/convert/convert_nodes.cc      | 27 +++++++-------
 .../tf2tensorrt/kernels/trt_engine_op.cc      | 25 +++++++------
 .../compiler/tf2tensorrt/segment/segment.cc   | 10 +++---
 .../compiler/tf2tensorrt/utils/py_utils.cc    |  8 +++--
 .../compiler/tf2tensorrt/utils/trt_logger.cc  |  3 +-
 8 files changed, 108 insertions(+), 44 deletions(-)
 create mode 100644 tensorflow/compiler/tf2tensorrt/common/utils.h

diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index 4a8599e29f6..368cb5af2ed 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -79,6 +79,15 @@ tf_cuda_cc_test(
     ]),
 )
 
+cc_library(
+    name = "common_utils",
+    hdrs = ["common/utils.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core/platform:logging",
+    ] + if_tensorrt([":tensorrt_lib"]),
+)
+
 cc_library(
     name = "trt_op_kernels",
     srcs = [
@@ -95,6 +104,7 @@ cc_library(
         ":trt_plugins",
         ":trt_resources",
         ":utils",
+        ":common_utils",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
@@ -240,6 +250,7 @@ tf_cuda_library(
     hdrs = ["utils/trt_logger.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":common_utils",
         ":logger_registry",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([":tensorrt_lib"]),
@@ -375,6 +386,7 @@ tf_cuda_library(
         "convert/trt_optimization_pass.h",
     ],
     deps = [
+        ":common_utils",
         ":logger_registry",
         ":segment",
         ":trt_allocator",
@@ -488,6 +500,7 @@ cc_library(
     ],
     copts = tf_copts(),
     deps = [
+        ":common_utils",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -575,6 +588,7 @@ cc_library(
     hdrs = ["utils/py_utils.h"],
     copts = tf_copts(),
     deps = if_tensorrt([
+        ":common_utils",
         ":tensorrt_lib",
         "//tensorflow/stream_executor/platform:dso_loader",
     ]),
diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h
new file mode 100644
index 00000000000..9ab0145e1ec
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/common/utils.h
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+#define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: "
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif
+#endif
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index 414d27477bc..1c51d51f1c9 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include 
 
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
@@ -276,8 +277,9 @@ Status GetEngineInfo(const Graph* g,
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
   } else if (segment_devices.size() > 1) {
-    LOG(WARNING) << "Detected multiple (" << segment_devices.size()
-                 << ") devices for the segment. Picking first one to continue.";
+    LOG_WARNING_WITH_PREFIX
+        << "Detected multiple (" << segment_devices.size()
+        << ") devices for the segment. Picking first one to continue.";
     info->device = *segment_devices.begin();
   } else {
     TfGpuId tf_gpu_id;
@@ -663,7 +665,7 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params,
       StrAppend(&msg, engine.device, "': ");
       for (auto d : devices) StrAppend(&msg, d->name(), ", ");
       StrAppend(&msg, ". Will get the allocator from first one.");
-      LOG(WARNING) << msg;
+      LOG_WARNING_WITH_PREFIX << msg;
     }
     AllocatorAttributes alloc_attr;
     cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
@@ -671,8 +673,8 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params,
     VLOG(1) << "Using allocator " << dev_allocator->Name()
             << " and cuda_device_id " << cuda_device_id;
   } else {
-    LOG(WARNING) << "Cluster is set but device '" << engine.device
-                 << "' is not found in the cluster";
+    LOG_WARNING_WITH_PREFIX << "Cluster is set but device '" << engine.device
+                            << "' is not found in the cluster";
   }
   return std::make_pair(cuda_device_id, dev_allocator);
 }
@@ -770,8 +772,8 @@ Status ConvertAfterShapes(const ConversionParams& params) {
     Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment,
                                   node_map, reverse_topo_order, &curr_engine);
     if (!status.ok()) {
-      LOG(WARNING) << "Failed to get engine info for segment " << t << ": "
-                   << status;
+      LOG_WARNING_WITH_PREFIX << "Failed to get engine info for segment " << t
+                              << ": " << status;
       continue;
     }
     curr_engine.precision_mode = params.precision_mode;
@@ -784,8 +786,9 @@ Status ConvertAfterShapes(const ConversionParams& params) {
                                             &graph, curr_engine.engine_name);
 
     if (!status.ok()) {
-      LOG(WARNING) << "Failed to register segment graphdef to the library " << t
-                   << ": " << status;
+      LOG_WARNING_WITH_PREFIX
+          << "Failed to register segment graphdef to the library " << t << ": "
+          << status;
       continue;
     }
 
@@ -836,7 +839,8 @@ Status ConvertAfterShapes(const ConversionParams& params) {
       alloc.reset(new TRTDeviceAllocator(device_alloc.second));
     } else {
       // Setting allocator as nullptr should get revert to the cudamalloc
-      LOG(WARNING) << "Can't identify the cuda device. Running on device 0 ";
+      LOG_WARNING_WITH_PREFIX
+          << "Can't identify the cuda device. Running on device 0 ";
     }
     cudaSetDevice(cuda_device_id);
     auto status =
@@ -850,9 +854,9 @@ Status ConvertAfterShapes(const ConversionParams& params) {
       LOG(INFO) << "Replaced " << msg << ".";
     } else {
       // Graph is not modified.
-      LOG(WARNING) << "Cannot replace " << msg
-                   << " reason: " << status.error_message()
-                   << " (keeping original segment).";
+      LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg
+                              << " reason: " << status.error_message()
+                              << " (keeping original segment).";
     }
     if (VLOG_IS_ON(1)) {
       msg = "Segment consists of nodes: ";
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 28b27959afc..96cec556942 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
@@ -1214,15 +1215,16 @@ static void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) {
   nvinfer1::IPluginCreator* const* trt_plugin_creator_list =
       getPluginRegistry()->getPluginCreatorList(&num_trt_plugins);
   if (!trt_plugin_creator_list) {
-    LOG(WARNING) << "Can not find any TensorRT plugins in registry.";
+    LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry.";
   } else {
     VLOG(1) << "Found the following " << num_trt_plugins
             << " TensorRT plugins in registry:";
     for (int i = 0; i < num_trt_plugins; ++i) {
       if (!trt_plugin_creator_list[i]) {
-        LOG(WARNING) << "TensorRT plugin at index " << i
-                     << " is not accessible (null pointer returned by "
-                        "getPluginCreatorList for this plugin)";
+        LOG_WARNING_WITH_PREFIX
+            << "TensorRT plugin at index " << i
+            << " is not accessible (null pointer returned by "
+               "getPluginCreatorList for this plugin)";
       } else {
         VLOG(1) << "  " << trt_plugin_creator_list[i]->getPluginName();
       }
@@ -1827,9 +1829,9 @@ void Converter::MaybeApplyQuantizationRanges() {
       // are tensors which are created internally by TF-TRT. The ranges for
       // these unnamed ITensors are always inferred from user provided ranges,
       // thus there will also be a warning for the range(s) the user missed.
-      LOG(WARNING) << "Quantization range was not found for "
-                   << tensor->getName() << ". "
-                   << "Setting invalid quantization range.";
+      LOG_WARNING_WITH_PREFIX << "Quantization range was not found for "
+                              << tensor->getName() << ". "
+                              << "Setting invalid quantization range.";
       // Set the range to something unusable so the engine will fail if it
       // tries to actually use the tensor's range.
       tensor->setDynamicRange(0, 0);
@@ -4898,10 +4900,11 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) {
     // Trying to use batchnorm in training mode is a very common problem.
     // Because the error message will only be printed in VLOG(1) by the
     // segmenter, we issue a special warning so that users will actually see it.
-    LOG(WARNING) << node_def.op() << " only supports is_training=false. If you "
-                 << "are using Keras, please call "
-                 << "keras.backend.set_learning_phase(0) before constructing "
-                 << "your model. At " << node_def.name();
+    LOG_WARNING_WITH_PREFIX
+        << node_def.op() << " only supports is_training=false. If you "
+        << "are using Keras, please call "
+        << "keras.backend.set_learning_phase(0) before constructing "
+        << "your model. At " << node_def.name();
     return errors::Unimplemented(node_def.op(),
                                  " only supports is_training=false, at ",
                                  node_def.name());
@@ -6039,7 +6042,7 @@ Status ConvertGraphDefToEngine(
         const string error_message =
             StrCat("Validation failed for ", node_name, " and input slot ",
                    slot_number, ": ", status.error_message());
-        LOG(WARNING) << error_message;
+        LOG_WARNING_WITH_PREFIX << error_message;
         return Status(status.code(), error_message);
       }
       VLOG(2) << "Adding engine input tensor " << node_name << " with shape "
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index d9b8e198f4f..ac4a331041d 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
@@ -613,8 +614,8 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
   Status stat = ExecuteTrtEngine(ctx, engine_context, trt_context_idx);
   if (!stat.ok()) {
-    LOG(WARNING) << "Failed to execute engine: " << stat
-                 << " Retrying with native segment for " << name();
+    LOG_WARNING_WITH_PREFIX << "Failed to execute engine: " << stat
+                            << " Retrying with native segment for " << name();
     // Release any outputs that are allocated, ExecuteNativeSegment will
     // re-allocate them and fail if they are currently allocated.
     for (int i = 0; i < ctx->num_outputs(); i++) {
@@ -727,9 +728,9 @@ StatusOr> TRTEngineOp::BuildEngine(
       calibrator, &engine, use_calibration, use_implicit_batch_, nullptr,
       &cache_resource->profiles_);
   if (!status.ok()) {
-    LOG(WARNING) << "Engine creation for " << name() << " failed. "
-                 << "The native segment will be used instead. "
-                 << "Reason: " << status;
+    LOG_WARNING_WITH_PREFIX << "Engine creation for " << name() << " failed. "
+                            << "The native segment will be used instead. "
+                            << "Reason: " << status;
     // Store an empty engine in the cache for these input shapes so we don't try
     // to build the same failing engine again.
     cache_resource->cache_.emplace(input_concrete_shapes,
@@ -791,8 +792,9 @@ StatusOr> TRTEngineOp::GetEngine(
               FunctionDefToGraphDef(func_handle_, lib, &segment_graph_def_);
         }
         if (!status.ok()) {
-          LOG(WARNING) << "Getting segment graph for " << name() << " failed. "
-                       << "Reason: " << status;
+          LOG_WARNING_WITH_PREFIX << "Getting segment graph for " << name()
+                                  << " failed. "
+                                  << "Reason: " << status;
         }
       }
       auto result = BuildEngine(input_concrete_shapes, batch_size,
@@ -851,10 +853,11 @@ StatusOr> TRTEngineOp::GetEngine(
   // If cache does not have a compatible engine then create a new engine.
   if (engine_contexts == nullptr) {
     if (!allow_build_at_runtime_) {
-      LOG(WARNING) << "Found no engine in cache matching input shapes. "
-                   << "Not building a new engine because "
-                   << "allow_build_at_runtime=False. "
-                   << "The native segment will be used instead.";
+      LOG_WARNING_WITH_PREFIX
+          << "Found no engine in cache matching input shapes. "
+          << "Not building a new engine because "
+          << "allow_build_at_runtime=False. "
+          << "The native segment will be used instead.";
       // Store an empty engine in the cache for these input shapes so we don't
       // try to build the same failing engine again.
       cache.emplace(input_concrete_shapes, absl::make_unique());
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 749335f1b09..32e30006f58 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/segment/union_find.h"
 #include "tensorflow/core/common_runtime/graph_constructor.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -748,9 +749,10 @@ Status SegmentGraph(const Graph* tf_graph,
         exclude_node(status.error_message());
       } else if (tftrt_op_blacklist.count(node->tf_node()->type_string())) {
         // WARNING verbosity since the user explicitly requests this behavior.
-        LOG(WARNING) << "Blacklisted as TF-TRT candidate, "
-                     << "(Op type: " << node->tf_node()->type_string() << "), "
-                     << "(Op name: " << node->name() << ")";
+        LOG_WARNING_WITH_PREFIX
+            << "Blacklisted as TF-TRT candidate, "
+            << "(Op type: " << node->tf_node()->type_string() << "), "
+            << "(Op name: " << node->name() << ")";
         exclude_node("Blacklisted with the env var TF_TRT_OP_BLACKLIST");
       } else {
         VLOG(2) << "Accepted as a TF-TRT candidate, "
@@ -1038,7 +1040,7 @@ Status SegmentGraph(const Graph* tf_graph,
       for (const auto& dev : dev_itr->second) {
         StrAppend(&s, dev, ", ");
       }
-      LOG(WARNING) << s;
+      LOG_WARNING_WITH_PREFIX << s;
     }
 
     segments->emplace_back(segment_nodes);
diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
index 885f58cd70c..a8e24aa8983 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h"
 
 #if GOOGLE_CUDA && GOOGLE_TENSORRT
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "third_party/tensorrt/NvInfer.h"
 #endif
@@ -27,9 +28,10 @@ bool IsGoogleTensorRTEnabled() {
 #if GOOGLE_CUDA && GOOGLE_TENSORRT
   auto handle_or = se::internal::DsoLoader::TryDlopenTensorRTLibraries();
   if (!handle_or.ok()) {
-    LOG(WARNING) << "Cannot dlopen some TensorRT libraries. If you would like "
-                    "to use Nvidia GPU with TensorRT, please make sure the "
-                    "missing libraries mentioned above are installed properly.";
+    LOG_WARNING_WITH_PREFIX
+        << "Cannot dlopen some TensorRT libraries. If you would like "
+           "to use Nvidia GPU with TensorRT, please make sure the "
+           "missing libraries mentioned above are installed properly.";
     return false;
   } else {
     return true;
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index 6bb6f1f9dd8..193687ebc8c 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -35,7 +36,7 @@ void Logger::log(Severity severity, const char* msg) {
       break;
     }
     case Severity::kWARNING: {
-      LOG(WARNING) << name_ << " " << msg;
+      LOG_WARNING_WITH_PREFIX << name_ << " " << msg;
       break;
     }
     case Severity::kERROR: {

From ef1cabc7a8fb05f4c66b21aa2409a74e22635e45 Mon Sep 17 00:00:00 2001
From: Thomas Joerg 
Date: Fri, 19 Jun 2020 10:18:32 -0700
Subject: [PATCH 0627/1390] [XLA:GPU] Split reduce ops with large but
 non-consecutive reduction dimensions.

PiperOrigin-RevId: 317330616
Change-Id: Icdcf320b233479c2f74c5b40ee4c8d9a73a6088a
---
 tensorflow/compiler/xla/service/gpu/BUILD     |  28 ++++
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   2 +
 .../xla/service/gpu/reduction_splitter.cc     | 117 +++++++++++++++
 .../xla/service/gpu/reduction_splitter.h      |  49 ++++++
 .../service/gpu/reduction_splitter_test.cc    | 140 ++++++++++++++++++
 .../reduction_degenerate_dim_remover_test.cc  |   1 +
 .../tests/reduction_layout_normalizer_test.cc |   1 +
 7 files changed, 338 insertions(+)
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter.h
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 0eb82128159..472d2117a2c 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1174,6 +1174,7 @@ cc_library(
         ":reduction_degenerate_dim_remover",
         ":reduction_dimension_grouper",
         ":reduction_layout_normalizer",
+        ":reduction_splitter",
         ":stream_assignment",
         ":stream_executor_util",
         ":target_constants",
@@ -1819,6 +1820,33 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "reduction_splitter",
+    srcs = ["reduction_splitter.cc"],
+    hdrs = ["reduction_splitter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+    ],
+)
+
+tf_cc_test(
+    name = "reduction_splitter_test",
+    srcs = ["reduction_splitter_test.cc"],
+    deps = [
+        ":reduction_splitter",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "reduction_layout_normalizer",
     srcs = ["reduction_layout_normalizer.cc"],
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index cddbee92874..156cb112285 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -65,6 +65,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h"
 #include "tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h"
 #include "tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h"
+#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 #include "tensorflow/compiler/xla/service/gpu/target_constants.h"
@@ -371,6 +372,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   pipeline.AddPass();
   pipeline.AddPass();
   pipeline.AddPass();
+  pipeline.AddPass<HloPassFix<ReductionSplitter>>();
 
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
new file mode 100644
index 00000000000..b68213ec35f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
@@ -0,0 +1,117 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+namespace gpu {
+
+class ReductionSplitterVisitor : public DfsHloRewriteVisitor {
+ public:
+  Status HandleReduce(HloInstruction *reduce) override {
+    VLOG(4) << "Input: " << reduce->ToString();
+
+    // Reductions with contiguous dimensions are lowered to efficient code. No
+    // need to split such ops.
+    if (IsReductionFromOrToContiguousDimensions(*reduce)) {
+      return Status::OK();
+    }
+    if (reduce->dimensions().size() < 2) {
+      return Status::OK();
+    }
+    if (!reduce->shape().IsArray()) {
+      // TODO(cheshire): Handle variadic reduction.
+      return Status::OK();
+    }
+
+    HloInstruction *operand = reduce->mutable_operand(0);
+    const Shape &shape = operand->shape();
+    CHECK(shape == LayoutUtil::GetWithDefaultLayout(shape))
+        << "Default layout should be enforced on reduction operand";
+    // Verify that contiguous dimensions have been grouped by the
+    // ReductionDimensionGrouper pass.
+    for (int64 i = 0; i < reduce->dimensions().size(); ++i) {
+      for (int64 j = i + 1; j < reduce->dimensions().size(); ++j) {
+        CHECK(abs(reduce->dimensions(i) - reduce->dimensions(j)) > 1)
+            << "Reduction dimensions must not be consecutive";
+      }
+    }
+
+    // The reduce op has non-contiguous dimensions. Look for the reduction
+    // dimension with the largest size in the input shape. Reducing along this
+    // dimension first shrinks the intermediate result the most.
+    int64 max_shape_dim = 0;
+    int64 max_reduce_dim = 0;
+    const auto &input_shape = reduce->operand(0)->shape();
+    for (int64 i = 0; i < reduce->dimensions().size(); ++i) {
+      if (input_shape.dimensions(reduce->dimensions(i)) > max_shape_dim) {
+        max_reduce_dim = reduce->dimensions(i);
+        max_shape_dim = input_shape.dimensions(max_reduce_dim);
+      }
+    }
+    // TODO(tjoerg): Run microbenchmarks to tune this threshold.
+    if (max_shape_dim < 128) {
+      return Status::OK();
+    }
+
+    // Split the reduction into a pre-reduction and a final reduction.
+    VLOG(3) << "Splitting reduction " << reduce->name() << " at dimension "
+            << max_reduce_dim;
+    std::vector<int64> pre_reduce_dims;
+    pre_reduce_dims.push_back(max_reduce_dim);
+    std::vector<int64> pre_reduce_shape_dims(input_shape.dimensions().begin(),
+                                             input_shape.dimensions().end());
+    pre_reduce_shape_dims.erase(pre_reduce_shape_dims.begin() + max_reduce_dim);
+    Shape pre_reduce_shape = ShapeUtil::MakeShape(
+        reduce->shape().element_type(), pre_reduce_shape_dims);
+    std::unique_ptr<HloInstruction> pre_reduce = HloInstruction::CreateReduce(
+        pre_reduce_shape, reduce->mutable_operand(0),
+        reduce->mutable_operand(1), pre_reduce_dims, reduce->to_apply());
+    pre_reduce->set_metadata(reduce->metadata());
+
+    std::vector<int64> final_reduce_dims(reduce->dimensions().begin(),
+                                         reduce->dimensions().end());
+    final_reduce_dims.erase(
+        std::remove(final_reduce_dims.begin(), final_reduce_dims.end(),
+                    max_reduce_dim),
+        final_reduce_dims.end());
+    for (int64 i = 0; i < final_reduce_dims.size(); ++i) {
+      if (final_reduce_dims[i] > max_reduce_dim) {
+        final_reduce_dims[i]--;
+      }
+    }
+    std::unique_ptr<HloInstruction> final_reduce = HloInstruction::CreateReduce(
+        reduce->shape(),
+        reduce->parent()->AddInstruction(std::move(pre_reduce)),
+        reduce->mutable_operand(1), final_reduce_dims, reduce->to_apply());
+    return ReplaceWithNewInstruction(reduce, std::move(final_reduce));
+  }
+};
+
+StatusOr<bool> ReductionSplitter::Run(HloModule *module) {
+  TF_ASSIGN_OR_RETURN(bool changed,
+                      ReductionSplitterVisitor().RunOnModule(module));
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.h b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h
new file mode 100644
index 00000000000..f161b579eb8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Splits a reduce op into two consecutive reduce ops if
+// * the reduce dimensions are not contiguous and
+// * at least one reduce dimension is large (i.e. corresponds to a large input
+//   shape dimension).
+//
+// Reductions with non-contiguous dimensions are emitted as simple element-wise
+// loops. This is inefficient when reducing large input shape dimensions.
+// Splitting such reductions allows using more efficient reduction emitters.
+//
+// This pass splits reduce ops into two consecutive reduce ops. Run it to a
+// fixpoint to split reduce ops along multiple large dimensions.
+//
+// Precondition: ReductionDimensionGrouper has been run and adjacent reduce
+// dimensions have been grouped. Reduction layouts have been normalized.
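+//
+// For example (mirroring the accompanying unit test), reducing a
+// f32[6,16,512,64] operand over dimensions {0,2} is rewritten into a
+// pre-reduction over dimension 2 (the largest reduced dimension, size 512)
+// that produces f32[6,16,64], followed by a final reduction over dimension 0
+// that produces f32[16,64].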
+
+class ReductionSplitter : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "reduction-splitter"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc
new file mode 100644
index 00000000000..1be55b84204
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ReductionSplitterTest : public HloTestBase {};
+
+TEST_F(ReductionSplitterTest, SplitReductionAtDimensionTwo) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f16[6,16,512,64]{3,2,1,0} parameter(0)
+    transpose.1781 = f16[6,512,16,64]{3,1,2,0} transpose(param_0), dimensions={0,2,1,3}
+    convert.6986 = f32[6,512,16,64]{3,1,2,0} convert(transpose.1781)
+    bitcast.2136 = f32[6,16,512,64]{3,2,1,0} bitcast(convert.6986)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[16,64]{1,0} reduce(bitcast.2136, constant_11111), dimensions={0,2}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* root_reduction =
+      module->entry_computation()->root_instruction();
+  ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant()));
+
+  auto* pre_reduction = root_reduction->operand(0);
+  EXPECT_THAT(pre_reduction->dimensions(), std::vector<int64>({2}));
+  EXPECT_THAT(pre_reduction->shape(), ShapeUtil::MakeShape(F32, {6, 16, 64}));
+  EXPECT_THAT(root_reduction->dimensions(), std::vector<int64>({0}));
+  EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64}));
+}
+
+TEST_F(ReductionSplitterTest, SplitReductionAtDimensionZero) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[1024,16,512,64,128]{4,3,2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[16,64]{1,0} reduce(param_0, constant_11111), dimensions={2,0,4}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* root_reduction =
+      module->entry_computation()->root_instruction();
+  ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant()));
+
+  auto* pre_reduction = root_reduction->operand(0);
+  EXPECT_THAT(pre_reduction->dimensions(), std::vector<int64>({0}));
+  EXPECT_THAT(pre_reduction->shape(),
+              ShapeUtil::MakeShape(F32, {16, 512, 64, 128}));
+  EXPECT_THAT(root_reduction->dimensions(), std::vector<int64>({1, 3}));
+  EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64}));
+}
+
+TEST_F(ReductionSplitterTest, DontSplitReductionWithSmallDimensions) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[8,1024,8]{2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[1024]{0} reduce(param_0, constant_11111), dimensions={2,0}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(ReductionSplitterTest, DontSplitReductionsWithContiguousDimensions) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[128,128,64,128]{3,2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    // The dimensions to keep (1 and 2) are contiguous.
+    ROOT reduce.982 = f32[128,64]{1,0} reduce(param_0, constant_11111), dimensions={3,0}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc
index 2c5e704d7c2..92f558ee98d 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc
@@ -37,6 +37,7 @@ class ReductionDegenerateDimRemoverTest : public GpuCodegenTest {
     DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
     debug_options.add_xla_disable_hlo_passes("reduction-layout-normalizer");
     debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper");
+    debug_options.add_xla_disable_hlo_passes("reduction-splitter");
     debug_options.add_xla_disable_hlo_passes("gpu-tree-reduction-rewriter");
     return debug_options;
   }
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc
index d06385480e5..b65c2842320 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc
@@ -33,6 +33,7 @@ class ReductionLayoutNormalizerTest : public GpuCodegenTest {
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
     debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper");
+    debug_options.add_xla_disable_hlo_passes("reduction-splitter");
     debug_options.add_xla_disable_hlo_passes("layout-assignment");
     debug_options.add_xla_disable_hlo_passes("gpu-tree-reduction-rewriter");
     return debug_options;

From e0780ef031fc27f4f2a71d745004d859a711c90a Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne 
Date: Fri, 19 Jun 2020 10:21:14 -0700
Subject: [PATCH 0628/1390] Fix fp16 FusedBatchNorm CPU crash if batch
 dimension is 1.

The GPU kernel outputs NaNs for the variance in this case, which is also incorrect, but better than crashing.
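
The crash appears to come from the pass-through branches of
ReduceOuterDimensions and ReduceMiddleDimensions in redux_functor.h: when the
reduction degenerates into a plain copy, the input was assigned to the output
without converting between element types, which breaks for a float16 input
whose batch-norm statistics are computed in float32. The change below adds an
explicit cast to the output type before the reshape.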

PiperOrigin-RevId: 317331280
Change-Id: Iea4e5a3337625796c50244e51d7ccb4b89f4c3e4
---
 tensorflow/core/kernels/redux_functor.h       |  6 ++--
 .../python/ops/nn_fused_batchnorm_test.py     | 28 +++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/redux_functor.h b/tensorflow/core/kernels/redux_functor.h
index 30038c62dbd..e07fa5364f1 100644
--- a/tensorflow/core/kernels/redux_functor.h
+++ b/tensorflow/core/kernels/redux_functor.h
@@ -57,7 +57,8 @@ struct ReduceOuterDimensions {
     if (1 == outer_dim) {
       // Nothing to do but passing input to output.
       output->template flat() =
-          input.template flat().reshape(output_dims);
+          input.template flat().template cast().reshape(
+              output_dims);
       return;
     }
 
@@ -226,7 +227,8 @@ struct ReduceMiddleDimensions {
     if ((1 == inner_dim * outer_dim)) {
       // Nothing to do.
       output->template flat() =
-          input.template flat().reshape(output_dims);
+          input.template flat().template cast().reshape(
+              output_dims);
       return;
     } else if (1 == inner_dim) {
       // Equivalent to ReduceOuterDimensions.
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 5497325f6c0..1742a919216 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -375,9 +375,10 @@ class BatchNormalizationTest(test.TestCase):
     self.assertLess(err_grad_x_2, err_tolerance)
     self.assertLess(err_grad_scale, err_tolerance)
 
-  def _runtests(self, x_shape, is_training, gradient_test=False):
+  def _runtests(self, x_shape, is_training, gradient_test=False,
+                cpu_only=False):
     use_gpu_vals = [False]
-    if test.is_gpu_available(cuda_only=True):
+    if test.is_gpu_available(cuda_only=True) and not cpu_only:
       use_gpu_vals += [True]
     factors = [1.0, 0.6]
     for dtype in [np.float16, np.float32]:
@@ -438,6 +439,11 @@ class BatchNormalizationTest(test.TestCase):
     x_shape = [0, 131, 127, 6]
     self._runtests(x_shape, False)
 
+  def testInferenceShape6(self):
+    x_shape = [1, 1, 1, 1]
+    # GPU kernel doesn't properly handle case where non-channel dimensions are 1
+    self._runtests(x_shape, False, cpu_only=True)
+
   def testTrainingShape1(self):
     x_shape = [1, 1, 6, 1]
     self._runtests(x_shape, True)
@@ -459,6 +465,11 @@ class BatchNormalizationTest(test.TestCase):
     x_shape = [0, 131, 127, 6]
     self._runtests(x_shape, True)
 
+  def testTrainingShape6(self):
+    x_shape = [1, 1, 1, 1]
+    # GPU kernel doesn't properly handle case where non-channel dimensions are 1
+    self._runtests(x_shape, True, cpu_only=True)
+
   @test_util.run_deprecated_v1
   def testBatchNormGradInferenceShape1(self):
     x_shape = [1, 1, 6, 1]
@@ -485,6 +496,13 @@ class BatchNormalizationTest(test.TestCase):
     x_shape = [0, 7, 11, 4]
     self._runtests(x_shape, is_training=False, gradient_test=True)
 
+  @test_util.run_deprecated_v1
+  def testBatchNormGradInferenceShape6(self):
+    x_shape = [1, 1, 1, 1]
+    # GPU kernel doesn't properly handle case where non-channel dimensions are 1
+    self._runtests(x_shape, is_training=False, gradient_test=True,
+                   cpu_only=True)
+
   @test_util.run_deprecated_v1
   def testBatchNormGradTrainingShape1(self):
     x_shape = [1, 1, 6, 1]
@@ -511,6 +529,12 @@ class BatchNormalizationTest(test.TestCase):
     x_shape = [0, 7, 11, 4]
     self._runtests(x_shape, is_training=True, gradient_test=True)
 
+  @test_util.run_deprecated_v1
+  def testBatchNormGradTrainingShape6(self):
+    x_shape = [1, 1, 1, 1]
+    # GPU kernel doesn't properly handle case where non-channel dimensions are 1
+    self._runtests(x_shape, is_training=True, gradient_test=True, cpu_only=True)
+
   def _testBatchNormGradGrad(self, config):
     shape = config['shape']
     err_tolerance = config['err_tolerance']

From 94f241379a351da7cb7a180967915beec48d14ef Mon Sep 17 00:00:00 2001
From: Vo Van Nghia 
Date: Fri, 19 Jun 2020 20:42:39 +0700
Subject: [PATCH 0629/1390] Add gcs_filesystem_test

---
 .../experimental/filesystem/plugins/gcs/BUILD | 24 +++++++++-
 .../filesystem/plugins/gcs/gcs_filesystem.cc  | 10 ++++-
 .../plugins/gcs/gcs_filesystem_test.cc        | 45 +++++++++++++++++++
 3 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc

diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
index 43f3a507f0b..3a65824cd7c 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
@@ -1,5 +1,5 @@
 # Experimental gcs filesystem plugin.
-load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test")
 
 package(
     licenses = ["notice"],  # Apache 2.0
@@ -42,3 +42,25 @@ cc_library(
         "//tensorflow/c:env",
     ],
 )
+
+tf_cc_test(
+    name = "gcs_filesystem_test",
+    srcs = [
+        "gcs_filesystem.cc",
+        "gcs_filesystem_test.cc",
+    ],
+    local_defines = ["TF_GCS_FILESYSTEM_TEST"],
+    tags = [
+        "manual",
+    ],
+    deps = [
+        "gcs_helper",
+        "//tensorflow/c:tf_status",
+        "//tensorflow/c/experimental/filesystem:filesystem_interface",
+        "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client",
+        "@com_google_absl//absl/strings",
+    ] + [
+        "//tensorflow/c:tf_status_helper",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
index 24d85f359ef..f459710ddc6 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
@@ -24,6 +24,14 @@ limitations under the License.
 #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h"
 #include "tensorflow/c/tf_status.h"
 
+#ifdef TF_GCS_FILESYSTEM_TEST
+// For testing purposes, we expose some functions.
+#define TF_STATIC
+#else
+// Otherwise, we don't expose any symbol.
+#define TF_STATIC static
+#endif
+
 // Implementation of a filesystem for GCS environments.
 // This filesystem will support `gs://` URI schemes.
 namespace gcs = google::cloud::storage;
@@ -122,7 +130,7 @@ namespace tf_read_only_memory_region {
 namespace tf_gcs_filesystem {
 
 // TODO(vnvo2409): Add lazy-loading and customizing parameters.
-static void Init(TF_Filesystem* filesystem, TF_Status* status) {
+TF_STATIC void Init(TF_Filesystem* filesystem, TF_Status* status) {
   google::cloud::StatusOr client =
       gcs::Client::CreateDefaultClient();
   if (!client) {
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc
new file mode 100644
index 00000000000..eb0fbfc33f7
--- /dev/null
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc
@@ -0,0 +1,45 @@
+#include "gtest/gtest.h"
+#include "tensorflow/c/experimental/filesystem/filesystem_interface.h"
+#include "tensorflow/c/tf_status_helper.h"
+
+#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x))
+
+// Forward declaration
+namespace tf_gcs_filesystem {
+void Init(TF_Filesystem* filesystem, TF_Status* status);
+}
+
+namespace tensorflow {
+namespace {
+
+class GCSFilesystemTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    status = TF_NewStatus();
+    filesystem = new TF_Filesystem;
+    tf_gcs_filesystem::Init(filesystem, status);
+    ASSERT_TF_OK(status) << "Can not initialize filesystem. "
+                         << TF_Message(status);
+  }
+  void TearDown() override {
+    TF_DeleteStatus(status);
+    // TODO(vnvo2409): Add filesystem cleanup
+    delete filesystem;
+  }
+
+ protected:
+  TF_Filesystem* filesystem;
+  TF_Status* status;
+};
+
+// We have to add this test here because there must be at least one test.
+// This test will be removed in the future.
+TEST_F(GCSFilesystemTest, TestInit) { ASSERT_TF_OK(status); }
+
+}  // namespace
+}  // namespace tensorflow
+
+GTEST_API_ int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

From fbf407383c93774d10bd7c45cd66788a070b0e07 Mon Sep 17 00:00:00 2001
From: Nick Kreeger 
Date: Fri, 19 Jun 2020 10:40:30 -0700
Subject: [PATCH 0630/1390] Reduce the size of TfLiteTensor for the TF Micro
 runtime.

This change uses the existing micro-specific build flag (TF_LITE_STATIC_MEMORY) to reduce the size of TfLiteTensor. In this build setting, only the minimum set of fields required to prepare and initialize a model in TFLM is kept. This build define is opt-in only for internal builds and continues to be enabled by default in Makefile builds.

All TFLM internal targets can be built with this flag by adding '--copt=-DTF_LITE_STATIC_MEMORY'.

This change reduces sizeof(TfLiteTensor) on 64-bit systems from 112 bytes down to 64 bytes.

TfLiteTensor struct reduced by 1.75x (~43% reduction)
Tail allocation reduced by: 2,592 bytes (~12.5% reduction)
Total allocation reduced by: 2,592 bytes (~12% reduction)

Optimized results from memory_arena_threshold_test:
Keyword Model:
--------------
[RecordingMicroAllocator] Arena allocation total 18448 bytes
[RecordingMicroAllocator] Arena allocation head 672 bytes
[RecordingMicroAllocator] Arena allocation tail 17776 bytes
[RecordingMicroAllocator] 'TfLiteTensor struct' used 3456 bytes with alignment overhead (requested 3456 bytes for 54 tensors)
[RecordingMicroAllocator] 'TfLiteTensor quantization data' used 1728 bytes with alignment overhead (requested 1728 bytes for 108 allocations)
[RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 10240 bytes with alignment overhead (requested 10240 bytes for 7 allocations)
[RecordingMicroAllocator] 'NodeAndRegistration struct' used 1200 bytes with alignment overhead (requested 1200 bytes for 15 NodeAndRegistration structs)
[RecordingMicroAllocator] 'Operator runtime data' used 148 bytes with alignment overhead (requested 148 bytes for 13 OpData structs)

Test Conv Model:
----------------
[RecordingMicroAllocator] Arena allocation total 10960 bytes
[RecordingMicroAllocator] Arena allocation head 7744 bytes
[RecordingMicroAllocator] Arena allocation tail 3216 bytes
[RecordingMicroAllocator] 'TfLiteTensor struct' used 960 bytes with alignment overhead (requested 960 bytes for 15 tensors)
[RecordingMicroAllocator] 'TfLiteTensor quantization data' used 768 bytes with alignment overhead (requested 752 bytes for 24 allocations)
[RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 0 bytes with alignment overhead (requested 0 bytes for 0 allocations)
[RecordingMicroAllocator] 'NodeAndRegistration struct' used 560 bytes with alignment overhead (requested 560 bytes for 7 NodeAndRegistration structs)
[RecordingMicroAllocator] 'Operator runtime data' used 136 bytes with alignment overhead (requested 136 bytes for 5 OpData structs)
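
For reference, these totals are consistent with the per-tensor saving: the keyword model allocates 54 TfLiteTensor structs, and 54 x (112 - 64) bytes = 2,592 bytes, which matches the tail and total reductions above; likewise the conv model's 15 tensors give 15 x 48 = 720 bytes (11,680 - 10,960).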

PiperOrigin-RevId: 317335359
Change-Id: Ic3d4d2c3e62249f072ece8f621f9ef94eaa28589
---
 tensorflow/lite/c/common.h                    | 46 +++++++++++++++++++
 .../lite/micro/memory_arena_threshold_test.cc | 19 ++++++--
 .../lite/micro/micro_interpreter_test.cc      |  2 +-
 tensorflow/lite/micro/tools/make/Makefile     |  2 +
 .../benchmark/experimental/c/c_api_types.h    | 46 +++++++++++++++++++
 5 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h
index 15823784d12..9093e5d50ad 100644
--- a/tensorflow/lite/c/common.h
+++ b/tensorflow/lite/c/common.h
@@ -375,6 +375,7 @@ typedef struct TfLiteSparsity {
 
 // An tensor in the interpreter system which is a wrapper around a buffer of
 // data including a dimensionality (or NULL if not currently defined).
+#ifndef TF_LITE_STATIC_MEMORY
 typedef struct TfLiteTensor {
   // The data type specification for data stored in `data`. This affects
   // what member of `data` union should be used.
@@ -439,6 +440,51 @@ typedef struct TfLiteTensor {
   // `dims_signature` contains [1, -1, -1, 3]).
   const TfLiteIntArray* dims_signature;
 } TfLiteTensor;
+#else
+// Specific reduced TfLiteTensor struct for TF Micro runtime. This struct
+// contains only the minimum fields required to initialize and prepare a micro
+// inference graph. The fields in this struct have been ordered from
+// largest-to-smallest for optimal struct sizeof.
+//
+// NOTE: This flag is opt-in only at compile time.
+typedef struct TfLiteTensor {
+  // TODO(b/155784997): Consider consolidating these quantization fields:
+  // Quantization information. Replaces params field above.
+  TfLiteQuantization quantization;
+
+  // Quantization information.
+  TfLiteQuantizationParams params;
+
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have. NOTE: the product of elements of `dims`
+  // and the element datatype size should be equal to `bytes` below.
+  TfLiteIntArray* dims;
+
+  // The number of bytes required to store the data of this Tensor. I.e.
+  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
+  // type is kTfLiteFloat32 and dims = {3, 2} then
+  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
+  size_t bytes;
+
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+
+  // How memory is mapped
+  //  kTfLiteMmapRo: Memory mapped read only.
+  //  i.e. weights
+  //  kTfLiteArenaRw: Arena allocated read write memory
+  //  (i.e. temporaries, outputs).
+  TfLiteAllocationType allocation_type;
+
+  // True if the tensor is a variable.
+  bool is_variable;
+} TfLiteTensor;
+#endif  // TF_LITE_STATIC_MEMORY
 
 #ifndef TF_LITE_STATIC_MEMORY
 // Free data memory of tensor `t`.
diff --git a/tensorflow/lite/micro/memory_arena_threshold_test.cc b/tensorflow/lite/micro/memory_arena_threshold_test.cc
index 4f49b57112a..b45de85a21b 100644
--- a/tensorflow/lite/micro/memory_arena_threshold_test.cc
+++ b/tensorflow/lite/micro/memory_arena_threshold_test.cc
@@ -41,9 +41,17 @@ constexpr int kKeywordModelNodeAndRegistrationCount = 15;
 
 // NOTE: These values are measured on x86-64:
 // TODO(b/158651472): Consider auditing these values on non-64 bit systems.
+//
+// Run this test with '--copt=-DTF_LITE_STATIC_MEMORY' to get the
+// optimized memory runtime values:
+#ifdef TF_LITE_STATIC_MEMORY
+constexpr int kKeywordModelTotalSize = 18448;
+constexpr int kKeywordModelTailSize = 17776;
+#else
 constexpr int kKeywordModelTotalSize = 21040;
-constexpr int kKeywordModelHeadSize = 672;
 constexpr int kKeywordModelTailSize = 20368;
+#endif
+constexpr int kKeywordModelHeadSize = 672;
 constexpr int kKeywordModelTfLiteTensorVariableBufferDataSize = 10240;
 constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 1728;
 constexpr int kKeywordModelOpRuntimeDataSize = 148;
@@ -56,9 +64,14 @@ constexpr int kTestConvModelNodeAndRegistrationCount = 7;
 
 // NOTE: These values are measured on x86-64:
 // TODO(b/158651472): Consider auditing these values on non-64 bit systems.
+#ifdef TF_LITE_STATIC_MEMORY
+constexpr int kTestConvModelTotalSize = 10960;
+constexpr int kTestConvModelTailSize = 3216;
+#else
 constexpr int kTestConvModelTotalSize = 11680;
-constexpr int kTestConvModelHeadSize = 7744;
 constexpr int kTestConvModelTailSize = 3936;
+#endif
+constexpr int kTestConvModelHeadSize = 7744;
 constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 768;
 constexpr int kTestConvModelOpRuntimeDataSize = 136;
 
@@ -81,7 +94,7 @@ void EnsureAllocatedSizeThreshold(const char* allocation_type, size_t actual,
     TF_LITE_MICRO_EXPECT_NEAR(actual, expected, kAllocationThreshold);
     if (actual != expected) {
       TF_LITE_REPORT_ERROR(micro_test::reporter,
-                           "%s threshold failed: %ld != %ld", allocation_type,
+                           "%s threshold failed: %d != %d", allocation_type,
                            actual, expected);
     }
   } else {
diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc
index f54c212b573..c577d8cb513 100644
--- a/tensorflow/lite/micro/micro_interpreter_test.cc
+++ b/tensorflow/lite/micro/micro_interpreter_test.cc
@@ -284,7 +284,7 @@ TF_LITE_MICRO_TEST(TestIncompleteInitializationAllocationsWithSmallArena) {
 
   tflite::testing::MockOpResolver mock_resolver;
   // 1kb is too small for the ComplexMockModel:
-  constexpr size_t allocator_buffer_size = 1048;
+  constexpr size_t allocator_buffer_size = 500;
   uint8_t allocator_buffer[allocator_buffer_size];
 
   tflite::RecordingMicroAllocator* allocator =
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 8b6cba06a0b..a75c59b05c9 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -75,6 +75,8 @@ TEST_SCRIPT := tensorflow/lite/micro/testing/test_linux_binary.sh
 MICROLITE_LIBS := -lm
 
 # TODO(b/150240249): Add in -fno-rtti once that works for the Xtensa toolchain.
+# TODO(b/159155203): Consider renaming TF_LITE_STATIC_MEMORY to align more
+# with the fact that this flag is for an optimized micro runtime.
 CXXFLAGS := -std=c++11 -DTF_LITE_STATIC_MEMORY
 CCFLAGS  := -std=c11   -DTF_LITE_STATIC_MEMORY
 ARFLAGS := -r
diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
index 15823784d12..9093e5d50ad 100644
--- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
+++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
@@ -375,6 +375,7 @@ typedef struct TfLiteSparsity {
 
 // An tensor in the interpreter system which is a wrapper around a buffer of
 // data including a dimensionality (or NULL if not currently defined).
+#ifndef TF_LITE_STATIC_MEMORY
 typedef struct TfLiteTensor {
   // The data type specification for data stored in `data`. This affects
   // what member of `data` union should be used.
@@ -439,6 +440,51 @@ typedef struct TfLiteTensor {
   // `dims_signature` contains [1, -1, -1, 3]).
   const TfLiteIntArray* dims_signature;
 } TfLiteTensor;
+#else
+// Specific reduced TfLiteTensor struct for TF Micro runtime. This struct
+// contains only the minimum fields required to initialize and prepare a micro
+// inference graph. The fields in this struct have been ordered from
+// largest-to-smallest for optimal struct sizeof.
+//
+// NOTE: This flag is opt-in only at compile time.
+typedef struct TfLiteTensor {
+  // TODO(b/155784997): Consider consolidating these quantization fields:
+  // Quantization information. Replaces params field above.
+  TfLiteQuantization quantization;
+
+  // Quantization information.
+  TfLiteQuantizationParams params;
+
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have. NOTE: the product of elements of `dims`
+  // and the element datatype size should be equal to `bytes` below.
+  TfLiteIntArray* dims;
+
+  // The number of bytes required to store the data of this Tensor. I.e.
+  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
+  // type is kTfLiteFloat32 and dims = {3, 2} then
+  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
+  size_t bytes;
+
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+
+  // How memory is mapped
+  //  kTfLiteMmapRo: Memory mapped read only.
+  //  i.e. weights
+  //  kTfLiteArenaRw: Arena allocated read write memory
+  //  (i.e. temporaries, outputs).
+  TfLiteAllocationType allocation_type;
+
+  // True if the tensor is a variable.
+  bool is_variable;
+} TfLiteTensor;
+#endif  // TF_LITE_STATIC_MEMORY
 
 #ifndef TF_LITE_STATIC_MEMORY
 // Free data memory of tensor `t`.

From 1332b9365e403baf448358465e6ffd3e5368e614 Mon Sep 17 00:00:00 2001
From: Rahul Joshi 
Date: Fri, 19 Jun 2020 10:42:32 -0700
Subject: [PATCH 0631/1390] [NFC] Adopt FuncOp::isPublic() and friends

PiperOrigin-RevId: 317335755
Change-Id: I6f1b25798150bb3a255d4572831d3be4747d28c7
---
 tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc      | 4 ++--
 .../mlir/tensorflow/transforms/stack_ops_decomposition.cc     | 2 +-
 .../tensorflow/transforms/tensor_array_ops_decomposition.cc   | 2 +-
 .../tensorflow/transforms/tensor_list_ops_decomposition.cc    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index d59532fef65..ef248379d2e 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -231,12 +231,12 @@ static LogicalResult VerifySavedModelModule(
   for (auto func : module.getOps()) {
     const bool is_exported = IsExported(func);
 
-    if (is_exported && func.getVisibility() != FuncOp::Visibility::Public) {
+    if (is_exported && !func.isPublic()) {
       return func.emitError()
              << "exported function @" << func.getName() << " should be public";
     }
 
-    if (!is_exported && func.getVisibility() == FuncOp::Visibility::Public) {
+    if (!is_exported && func.isPublic()) {
       return func.emitError() << "non-exported function @" << func.getName()
                               << " should be private";
     }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
index c349c2b4c3e..734a7d04a86 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
@@ -343,7 +343,7 @@ LogicalResult HandlePartitionedCallOp(
   }
   llvm::SmallDenseMap callee_map;
   FuncOp lowered_callee = callee;
-  if (callee.getVisibility() != SymbolTable::Visibility::Private) {
+  if (!callee.isPrivate()) {
     // Clone non-private callee in case of signature change.
     lowered_callee = callee.clone();
     lowered_callee.setVisibility(SymbolTable::Visibility::Private);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc
index cfeb2b1f031..a9e1243714e 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc
@@ -759,7 +759,7 @@ LogicalResult HandlePartitionedCallOp(
     return it->getSecond().accumulate_on_write;
   };
   FuncOp lowered_callee = callee;
-  if (callee.getVisibility() != SymbolTable::Visibility::Private) {
+  if (!callee.isPrivate()) {
     // Clone non-private callee in case of signature change.
     lowered_callee = callee.clone();
     lowered_callee.setVisibility(SymbolTable::Visibility::Private);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc
index 9733bfe2290..b118ab6c6c9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc
@@ -322,7 +322,7 @@ LogicalResult HandlePartitionedCallOp(
   // Rewrite the callee.
   llvm::SmallDenseMap callee_map;
   FuncOp lowered_callee = callee;
-  if (callee.getVisibility() != SymbolTable::Visibility::Private) {
+  if (!callee.isPrivate()) {
     // Clone non-private callee in case of signature change.
     lowered_callee = callee.clone();
     lowered_callee.setVisibility(SymbolTable::Visibility::Private);

From 89de554b69d9b74896445a41250943d97e3b9e77 Mon Sep 17 00:00:00 2001
From: Scott Zhu 
Date: Fri, 19 Jun 2020 10:43:39 -0700
Subject: [PATCH 0632/1390] Fork keras-related mirrored_variable_test to
 keras/distribute.

PiperOrigin-RevId: 317335959
Change-Id: Ie34b8ddecdd137926959868157ab7ade7c72e1b9
---
 tensorflow/python/distribute/BUILD            |   1 -
 .../distribute/mirrored_variable_test.py      |  34 ------
 tensorflow/python/keras/distribute/BUILD      |  22 ++++
 .../distribute/mirrored_variable_test.py      | 106 ++++++++++++++++++
 4 files changed, 128 insertions(+), 35 deletions(-)
 create mode 100644 tensorflow/python/keras/distribute/mirrored_variable_test.py

diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 38c5550be16..f0f3766afe1 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -1511,7 +1511,6 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/keras/layers",
     ],
 )
 
diff --git a/tensorflow/python/distribute/mirrored_variable_test.py b/tensorflow/python/distribute/mirrored_variable_test.py
index 6623422b45f..df32a6babea 100644
--- a/tensorflow/python/distribute/mirrored_variable_test.py
+++ b/tensorflow/python/distribute/mirrored_variable_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import collective_all_reduce_strategy
 from tensorflow.python.distribute import combinations
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
@@ -32,7 +31,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
@@ -208,38 +206,6 @@ class MirroredVariableCreationTest(test.TestCase):
       # The resulting mirrored variable will use the name from the first device.
       self.assertEqual("foo_0:0", result.name)
 
-  def testWithLayers(self, distribution):
-
-    def model_fn(features):
-
-      layer1 = core.Dense(1)
-      layer1(features)
-      layer2 = core.Dense(1)
-      layer2(features)
-      # We rely on names and orders to make sure replica references the same
-      # MirroredVariable. Uniquifying names may involve global states,
-      # merge_call switches threads so we need to test things work after
-      # merge_call.
-      ds_context.get_replica_context().merge_call(lambda _: _)
-      layer3 = core.Dense(1)
-      layer3(features)
-      return [(layer1.kernel, layer1.bias), (layer2.kernel, layer2.bias),
-              (layer3.kernel, layer3.bias)]
-
-    iterator = distribution.make_input_fn_iterator(
-        lambda _: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
-    self.evaluate(iterator.initializer)
-    features = iterator.get_next()
-
-    with distribution.scope():
-      result = distribution.extended.call_for_each_replica(
-          model_fn, args=(features,))
-      for kernel, bias in result:
-        self.assertIsInstance(kernel, values.MirroredVariable)
-        self.assertAllDifferent(distribution.experimental_local_results(kernel))
-        self.assertIsInstance(bias, values.MirroredVariable)
-        self.assertAllDifferent(distribution.experimental_local_results(kernel))
-
   def testWithVariableAndVariableScope(self, distribution):
 
     def model_fn():
diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD
index 247e655621c..c6a8f2c5f91 100644
--- a/tensorflow/python/keras/distribute/BUILD
+++ b/tensorflow/python/keras/distribute/BUILD
@@ -347,6 +347,28 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "mirrored_variable_test",
+    srcs = ["mirrored_variable_test.py"],
+    python_version = "PY3",
+    tags = [
+        "guitar",
+        "multi_and_single_gpu",
+    ],
+    deps = [
+        "//tensorflow/python:config",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:combinations",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:strategy_combinations",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras/layers:core",
+    ],
+)
+
 cuda_py_test(
     name = "multi_worker_test",
     srcs = ["multi_worker_test.py"],
diff --git a/tensorflow/python/keras/distribute/mirrored_variable_test.py b/tensorflow/python/keras/distribute/mirrored_variable_test.py
new file mode 100644
index 00000000000..0edfa4806f2
--- /dev/null
+++ b/tensorflow/python/keras/distribute/mirrored_variable_test.py
@@ -0,0 +1,106 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test MirroredVariable in MirroredStrategy and MultiWorkerMirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import collective_all_reduce_strategy
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import strategy_combinations
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import config
+from tensorflow.python.keras.layers import core
+
+
+def _mimic_two_cpus():
+  cpus = config.list_physical_devices("CPU")
+
+  config.set_logical_device_configuration(cpus[0], [
+      context.LogicalDeviceConfiguration(),
+      context.LogicalDeviceConfiguration(),
+  ])
+
+
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+            combinations.NamedDistribution(
+                "Collective2CPUs",
+                # pylint: disable=g-long-lambda
+                lambda: collective_all_reduce_strategy.
+                CollectiveAllReduceStrategy._from_local_devices((
+                    "/device:CPU:0", "/device:CPU:1")),
+                required_gpus=0)
+        ],
+        mode=["graph", "eager"]))
+class MirroredVariableCreationTest(test.TestCase):
+  """Base class that tests mirrored variable creator.
+
+  Currently it assumes all strategy objects have two replicas.
+  """
+
+  @classmethod
+  def setUpClass(cls):
+    _mimic_two_cpus()
+
+  def assertAllDifferent(self, objs):
+    for i in range(len(objs)):
+      for j in range(len(objs)):
+        if i == j:
+          continue
+        self.assertIsNot(objs[i], objs[j])
+
+  def testWithLayers(self, distribution):
+
+    def model_fn(features):
+
+      layer1 = core.Dense(1)
+      layer1(features)
+      layer2 = core.Dense(1)
+      layer2(features)
+      # We rely on names and orders to make sure replica references the same
+      # MirroredVariable. Uniquifying names may involve global states,
+      # merge_call switches threads so we need to test things work after
+      # merge_call.
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      layer3 = core.Dense(1)
+      layer3(features)
+      return [(layer1.kernel, layer1.bias), (layer2.kernel, layer2.bias),
+              (layer3.kernel, layer3.bias)]
+
+    iterator = distribution.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
+    self.evaluate(iterator.initializer)
+    features = iterator.get_next()
+
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=(features,))
+      for kernel, bias in result:
+        self.assertIsInstance(kernel, values.MirroredVariable)
+        self.assertAllDifferent(distribution.experimental_local_results(kernel))
+        self.assertIsInstance(bias, values.MirroredVariable)
+        self.assertAllDifferent(distribution.experimental_local_results(kernel))
+
+
+if __name__ == "__main__":
+  test.main()

From 8088eddf203220799cd51ede0142329fa4665d3c Mon Sep 17 00:00:00 2001
From: Akshay Modi 
Date: Fri, 19 Jun 2020 10:58:32 -0700
Subject: [PATCH 0633/1390] Let tensorflow op take precedence when doing
 "ndarray  tensor"

Also add a few more interop tests.

PiperOrigin-RevId: 317339113
Change-Id: Ic28fab7abefea681e1e8d840b8e4cf4f98b63f1e
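
The mechanism relied on here is Python's binary-operator protocol: the ndarray
operator overloads now return NotImplemented when the other operand advertises
a higher __array_priority__, so Python falls back to that operand's reflected
method (e.g. Tensor.__radd__). Below is a minimal standalone sketch of the same
pattern, using hypothetical classes rather than the real ndarray/Tensor types:

class NPLike:
  __array_priority__ = 90  # mirrors the new np_arrays.ndarray value

  def __add__(self, other):
    # Defer when the other operand claims a higher priority.
    if getattr(other, '__array_priority__', 0) > self.__array_priority__:
      return NotImplemented
    return 'handled by NPLike'

  __radd__ = __add__


class TensorLike:
  __array_priority__ = 100  # assumed: anything higher than 90 defers to it

  def __add__(self, other):
    return 'handled by TensorLike'

  def __radd__(self, other):
    return 'handled by TensorLike (reflected)'


print(NPLike() + TensorLike())   # -> handled by TensorLike (reflected)
print(TensorLike() + NPLike())   # -> handled by TensorLike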
---
 tensorflow/python/ops/numpy_ops/np_arrays.py  |  3 +-
 .../python/ops/numpy_ops/np_interop_test.py   | 50 ++++++++++++++++++
 .../python/ops/numpy_ops/np_math_ops.py       | 52 +++++++++++--------
 3 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py
index 88bf4e7499a..65e8273375f 100644
--- a/tensorflow/python/ops/numpy_ops/np_arrays.py
+++ b/tensorflow/python/ops/numpy_ops/np_arrays.py
@@ -262,7 +262,8 @@ class ndarray(composite_tensor.CompositeTensor):  # pylint: disable=invalid-name
     """
     return np.asarray(self.data, dtype)
 
-  __array_priority__ = 110
+  # NOTE: we currently prefer interop with TF to allow TF to take precedence.
+  __array_priority__ = 90
 
   def __index__(self):
     """Returns a python scalar.
diff --git a/tensorflow/python/ops/numpy_ops/np_interop_test.py b/tensorflow/python/ops/numpy_ops/np_interop_test.py
index 052949dff9d..f52d3dae78b 100644
--- a/tensorflow/python/ops/numpy_ops/np_interop_test.py
+++ b/tensorflow/python/ops/numpy_ops/np_interop_test.py
@@ -19,12 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 
+import numpy as onp
+
+
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.numpy_ops import np_array_ops
 from tensorflow.python.ops.numpy_ops import np_arrays
+from tensorflow.python.ops.numpy_ops import np_math_ops
 from tensorflow.python.platform import test
 
 
@@ -88,6 +94,50 @@ class InteropTest(test.TestCase):
     self.assertEqual(10000, fn()[0])
     self.assertEqual(10000, def_function.function(fn)()[0])
 
+  def testTensorTFNPArrayInterop(self):
+    arr = np_array_ops.asarray(0.)
+    t = constant_op.constant(10.)
+
+    arr_plus_t = arr + t
+    t_plus_arr = t + arr
+
+    self.assertIsInstance(arr_plus_t, ops.Tensor)
+    self.assertIsInstance(t_plus_arr, ops.Tensor)
+    self.assertEqual(10., arr_plus_t.numpy())
+    self.assertEqual(10., t_plus_arr.numpy())
+
+  def testTensorTFNPOp(self):
+    t = constant_op.constant(10.)
+
+    sq = np_math_ops.square(t)
+    self.assertIsInstance(sq, np_arrays.ndarray)
+    self.assertEqual(100., sq)
+
+  def testTFNPArrayTFOpInterop(self):
+    arr = np_array_ops.asarray(10.)
+
+    # TODO(nareshmodi): Test more ops.
+    sq = math_ops.square(arr)
+    self.assertIsInstance(sq, ops.Tensor)
+    self.assertEqual(100., sq.numpy())
+
+  def testTFNPArrayNPOpInterop(self):
+    arr = np_array_ops.asarray([10.])
+
+    # TODO(nareshmodi): Test more ops.
+    sq = onp.square(arr)
+    self.assertIsInstance(sq, onp.ndarray)
+    self.assertEqual(100., sq[0])
+
+    # TODO(nareshmodi): Fails since the autopacking code doesn't use
+    # nest.flatten.
+#   def testAutopacking(self):
+#     arr1 = np_array_ops.asarray(1.)
+#     arr2 = np_array_ops.asarray(2.)
+#     arr3 = np_array_ops.asarray(3.)
+#     t = ops.convert_to_tensor_v2([arr1, arr2, arr3])
+
+#     self.assertEqual(t.numpy(), [1., 2., 3.])
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py
index abfd9087ffd..361bfb50dec 100644
--- a/tensorflow/python/ops/numpy_ops/np_math_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py
@@ -917,29 +917,37 @@ def diff(a, n=1, axis=-1):  # pylint: disable=missing-function-docstring
   return _scalar(f, a)
 
 
-def _flip_args(f):
+def _wrap(f, reverse=False):
+  """Wraps binary ops so they can be added as operator overloads on ndarray."""
 
   def _f(a, b):
-    return f(b, a)
+    if reverse:
+      a, b = b, a
+
+    if getattr(b, '__array_priority__',
+               0) > np_arrays.ndarray.__array_priority__:
+      return NotImplemented
+
+    return f(a, b)
 
   return _f
 
 
 setattr(np_arrays.ndarray, '__abs__', absolute)
-setattr(np_arrays.ndarray, '__floordiv__', floor_divide)
-setattr(np_arrays.ndarray, '__rfloordiv__', _flip_args(floor_divide))
-setattr(np_arrays.ndarray, '__mod__', mod)
-setattr(np_arrays.ndarray, '__rmod__', _flip_args(mod))
-setattr(np_arrays.ndarray, '__add__', add)
-setattr(np_arrays.ndarray, '__radd__', _flip_args(add))
-setattr(np_arrays.ndarray, '__sub__', subtract)
-setattr(np_arrays.ndarray, '__rsub__', _flip_args(subtract))
-setattr(np_arrays.ndarray, '__mul__', multiply)
-setattr(np_arrays.ndarray, '__rmul__', _flip_args(multiply))
-setattr(np_arrays.ndarray, '__pow__', power)
-setattr(np_arrays.ndarray, '__rpow__', _flip_args(power))
-setattr(np_arrays.ndarray, '__truediv__', true_divide)
-setattr(np_arrays.ndarray, '__rtruediv__', _flip_args(true_divide))
+setattr(np_arrays.ndarray, '__floordiv__', _wrap(floor_divide))
+setattr(np_arrays.ndarray, '__rfloordiv__', _wrap(floor_divide, True))
+setattr(np_arrays.ndarray, '__mod__', _wrap(mod))
+setattr(np_arrays.ndarray, '__rmod__', _wrap(mod, True))
+setattr(np_arrays.ndarray, '__add__', _wrap(add))
+setattr(np_arrays.ndarray, '__radd__', _wrap(add, True))
+setattr(np_arrays.ndarray, '__sub__', _wrap(subtract))
+setattr(np_arrays.ndarray, '__rsub__', _wrap(subtract, True))
+setattr(np_arrays.ndarray, '__mul__', _wrap(multiply))
+setattr(np_arrays.ndarray, '__rmul__', _wrap(multiply, True))
+setattr(np_arrays.ndarray, '__pow__', _wrap(power))
+setattr(np_arrays.ndarray, '__rpow__', _wrap(power, True))
+setattr(np_arrays.ndarray, '__truediv__', _wrap(true_divide))
+setattr(np_arrays.ndarray, '__rtruediv__', _wrap(true_divide, True))
 
 
 def _comparison(tf_fun, x1, x2, cast_bool_to_int=False):
@@ -1031,12 +1039,12 @@ def logical_not(x):
 
 
 setattr(np_arrays.ndarray, '__invert__', logical_not)
-setattr(np_arrays.ndarray, '__lt__', less)
-setattr(np_arrays.ndarray, '__le__', less_equal)
-setattr(np_arrays.ndarray, '__gt__', greater)
-setattr(np_arrays.ndarray, '__ge__', greater_equal)
-setattr(np_arrays.ndarray, '__eq__', equal)
-setattr(np_arrays.ndarray, '__ne__', not_equal)
+setattr(np_arrays.ndarray, '__lt__', _wrap(less))
+setattr(np_arrays.ndarray, '__le__', _wrap(less_equal))
+setattr(np_arrays.ndarray, '__gt__', _wrap(greater))
+setattr(np_arrays.ndarray, '__ge__', _wrap(greater_equal))
+setattr(np_arrays.ndarray, '__eq__', _wrap(equal))
+setattr(np_arrays.ndarray, '__ne__', _wrap(not_equal))
 
 
 @np_utils.np_doc(np.linspace)

From 9152edc1f09e7420771628fd55ffef4b35b1cbc3 Mon Sep 17 00:00:00 2001
From: Bixia Zheng 
Date: Fri, 19 Jun 2020 11:04:49 -0700
Subject: [PATCH 0634/1390] [TF:TRT] Add flag
 TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION.

The flag defaults to True. When it is set to False, the bridge reports an
error whenever execution would fall back to the native segment of a
TRTEngineOp.

Add test cases.

PiperOrigin-RevId: 317340632
Change-Id: Iacded09b38e63442bbd93076a079d385fb8a77e6
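
For illustration (mirroring the new test below), a user who wants the build to
fail instead of silently running the native TensorFlow segment can set the
environment variable before building the converted model. This is a sketch,
assuming the TF2 TrtGraphConverterV2 workflow:

import os

# Disallow fallback: any attempt to run a TRTEngineOp's native segment now
# raises tf.errors.AbortedError("User disallowed engine native segment
# execution") instead of silently executing the TensorFlow fallback.
os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "False"

# ... converter.convert() / converter.build(input_fn=...) as usual ...

# Restore the default behavior (fallback allowed).
os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"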
---
 .../tf2tensorrt/kernels/trt_engine_op.cc      | 34 ++++++++++++++++---
 .../compiler/tensorrt/trt_convert_test.py     | 33 +++++++++++++++++-
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index ac4a331041d..98d199ca9ab 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
 #if GOOGLE_CUDA
@@ -521,6 +522,17 @@ Status TRTEngineOp::VerifyInputShapes(
   return Status::OK();
 }
 
+static bool AllowEngineNativeSegmentExecution() {
+  bool value;
+  Status status =
+      ReadBoolFromEnvVar("TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION",
+                         /*default_value=*/true, &value);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return value;
+}
+
 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
                                AsyncOpKernel::DoneCallback done) {
   auto helper = new AsyncHelper(done);
@@ -605,17 +617,31 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
 
   EngineContext* engine_context = status.ValueOrDie().first;
   int trt_context_idx = status.ValueOrDie().second;
+  auto may_execute_native_segment = [&] {
+    if (!AllowEngineNativeSegmentExecution()) {
+      ctx->CtxFailure(
+          errors::Aborted("User disallowed engine native segment execution"));
+      return false;
+    }
+    return true;
+  };
   if (!engine_context->cuda_engine) {
-    VLOG(1) << "Engine retrieval for input shapes: "
-            << TensorShapeUtils::ShapeListString(input_concrete_shapes)
-            << " failed. Running native segment for " << name();
-    ExecuteNativeSegment(ctx, helper);
+    LOG_WARNING_WITH_PREFIX
+        << "Engine retrieval for input shapes: "
+        << TensorShapeUtils::ShapeListString(input_concrete_shapes)
+        << " failed. Running native segment for " << name();
+    if (may_execute_native_segment()) {
+      ExecuteNativeSegment(ctx, helper);
+    }
     return;
   }
   Status stat = ExecuteTrtEngine(ctx, engine_context, trt_context_idx);
   if (!stat.ok()) {
     LOG_WARNING_WITH_PREFIX << "Failed to execute engine: " << stat
                             << " Retrying with native segment for " << name();
+    if (!may_execute_native_segment()) {
+      return;
+    }
     // Release any outputs that are allocated, ExecuteNativeSegment will
     // re-allocate them and fail if they are currently allocated.
     for (int i = 0; i < ctx->num_outputs(); i++) {
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
index df21e93f836..05ff6fcaebe 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -439,6 +439,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self,
       input_saved_model_dir,
       input_saved_model_signature_key=_SAVED_MODEL_SIGNATURE_KEY,
+      max_workspace_size_bytes=10 << 20,  # Use a smaller workspace.
       precision_mode=trt_convert.TrtPrecisionMode.FP32,
       is_dynamic_op=True,
       maximum_cached_engines=2):
@@ -446,7 +447,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         input_saved_model_dir=input_saved_model_dir,
         input_saved_model_signature_key=input_saved_model_signature_key,
         conversion_params=trt_convert.DEFAULT_TRT_CONVERSION_PARAMS._replace(
-            max_workspace_size_bytes=10 << 20,  # Use a smaller workspace.
+            max_workspace_size_bytes=max_workspace_size_bytes,
             precision_mode=precision_mode,
             is_dynamic_op=is_dynamic_op,
             maximum_cached_engines=maximum_cached_engines))
@@ -924,6 +925,36 @@ class TrtConvertTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         # to fall back to TF function.
         self._TestRun(sess, 2)
 
+  @test_util.run_v2_only
+  def testTrtGraphConverter_AllowEngineNativeSegmentExecution(self):
+    if not is_tensorrt_enabled():
+      return
+
+    np_input1, np_input2 = self._RandomInput([4, 1, 1])
+
+    # Create a model and save it.
+    input_saved_model_dir = self.mkdtemp()
+    root = self._GetModelForV2()
+    save.save(root, input_saved_model_dir,
+              {_SAVED_MODEL_SIGNATURE_KEY: root.run})
+
+    def _InputFn():
+      yield np_input1, np_input2
+
+    # Run TRT conversion and request an unreasonably large workspace.
+    converter = self._CreateConverterV2(
+        input_saved_model_dir, max_workspace_size_bytes=10 << 40)
+    converter.convert()
+
+    os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "False"
+    with self.assertRaisesRegex(
+        errors.AbortedError,
+        r"User disallowed engine native segment execution"):
+      converter.build(input_fn=_InputFn)
+
+    os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"
+    converter.build(input_fn=_InputFn)
+
   @test_util.run_v2_only
   def testBackwardCompatibility(self):
     """Load and execute a model that was saved in TF2.0."""

From 1129f21360b24fd957a248b86ddefa311d9f5658 Mon Sep 17 00:00:00 2001
From: Robert David 
Date: Fri, 19 Jun 2020 11:16:53 -0700
Subject: [PATCH 0635/1390] Consistently suffix LSTM gate bias variable names
 with _gate_bias.

PiperOrigin-RevId: 317343158
Change-Id: I385ddbad6c1283b84574b2ec0b523ce9f88a4cd3
---
 .../kernels/bidirectional_sequence_lstm.cc    | 20 ++--
 .../bidirectional_sequence_lstm_test.cc       | 48 +++++-----
 tensorflow/lite/kernels/lstm.cc               | 56 ++++++-----
 tensorflow/lite/kernels/lstm_eval.cc          | 64 +++++++------
 tensorflow/lite/kernels/lstm_eval.h           |  8 +-
 tensorflow/lite/kernels/lstm_eval_test.cc     | 93 ++++++++++---------
 tensorflow/lite/kernels/lstm_test.cc          | 70 +++++++-------
 .../lite/kernels/optional_tensor_test.cc      |  8 +-
 .../kernels/unidirectional_sequence_lstm.cc   | 18 ++--
 .../unidirectional_sequence_lstm_test.cc      | 32 +++----
 .../calibration/builtin_logging_ops/lstm.cc   | 13 +--
 11 files changed, 224 insertions(+), 206 deletions(-)

diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
index 439fc94afad..fd60fe573ef 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -318,11 +318,11 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
   TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteFloat32);
 
-  const TfLiteTensor* cell_bias =
+  const TfLiteTensor* cell_gate_bias =
       GetInput(context, node, cell_gate_bias_tensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, cell_bias->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->type, kTfLiteFloat32);
 
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, output_gate_bias_tensor);
@@ -886,7 +886,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kFwInputGateBiasTensor);
   const TfLiteTensor* fw_forget_gate_bias =
       GetInput(context, node, kFwForgetGateBiasTensor);
-  const TfLiteTensor* fw_cell_bias =
+  const TfLiteTensor* fw_cell_gate_bias =
       GetInput(context, node, kFwCellGateBiasTensor);
   const TfLiteTensor* fw_output_gate_bias =
       GetInput(context, node, kFwOutputGateBiasTensor);
@@ -934,7 +934,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kBwInputGateBiasTensor);
   const TfLiteTensor* bw_forget_gate_bias =
       GetInput(context, node, kBwForgetGateBiasTensor);
-  const TfLiteTensor* bw_cell_bias =
+  const TfLiteTensor* bw_cell_gate_bias =
       GetInput(context, node, kBwCellGateBiasTensor);
   const TfLiteTensor* bw_output_gate_bias =
       GetInput(context, node, kBwOutputGateBiasTensor);
@@ -1029,7 +1029,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
-          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
+          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_gate_bias,
           fw_output_gate_bias, fw_projection_weights, fw_projection_bias,
           &lstm_params,
           /*forward_sequence=*/true, time_major, /*output_offset=*/0,
@@ -1049,7 +1049,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
-          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
+          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_gate_bias,
           bw_output_gate_bias, bw_projection_weights, bw_projection_bias,
           &lstm_params,
           /*forward_sequence=*/false, time_major, bw_output_offset,
@@ -1099,7 +1099,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
-          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
+          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_gate_bias,
           fw_output_gate_bias, fw_projection_weights, fw_projection_bias,
           &lstm_params,
           /*forward_sequence=*/true, /*time_major=*/true, /*output_offset=*/0,
@@ -1125,7 +1125,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
-          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
+          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_gate_bias,
           bw_output_gate_bias, bw_projection_weights, bw_projection_bias,
           &lstm_params,
           /*forward_sequence=*/false, /*time_major=*/true, bw_output_offset,
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index 3a52de130e3..778751aa04b 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -89,7 +89,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
       fw_input_gate_bias_ = AddInput(TensorType_FLOAT32);
     }
     fw_forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    fw_cell_bias_ = AddInput(TensorType_FLOAT32);
+    fw_cell_gate_bias_ = AddInput(TensorType_FLOAT32);
     fw_output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
@@ -144,7 +144,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
       bw_input_gate_bias_ = AddInput(TensorType_FLOAT32);
     }
     bw_forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    bw_cell_bias_ = AddInput(TensorType_FLOAT32);
+    bw_cell_gate_bias_ = AddInput(TensorType_FLOAT32);
     bw_output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
@@ -288,8 +288,8 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
   }
 
   void SetCellBias(const std::vector& f) {
-    PopulateTensor(fw_cell_bias_, f);
-    PopulateTensor(bw_cell_bias_, f);
+    PopulateTensor(fw_cell_gate_bias_, f);
+    PopulateTensor(bw_cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(const std::vector& f) {
@@ -364,7 +364,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
 
   int fw_input_gate_bias_;
   int fw_forget_gate_bias_;
-  int fw_cell_bias_;
+  int fw_cell_gate_bias_;
   int fw_output_gate_bias_;
 
   int fw_projection_weights_;
@@ -386,7 +386,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
 
   int bw_input_gate_bias_;
   int bw_forget_gate_bias_;
-  int bw_cell_bias_;
+  int bw_cell_gate_bias_;
   int bw_output_gate_bias_;
 
   int bw_projection_weights_;
@@ -467,7 +467,7 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -490,7 +490,7 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -633,7 +633,7 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -656,7 +656,7 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -796,7 +796,7 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -819,7 +819,7 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -956,7 +956,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -978,7 +978,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -1107,7 +1107,7 @@ TEST(LSTMOpTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -1129,7 +1129,7 @@ TEST(LSTMOpTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -1258,7 +1258,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1280,7 +1280,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1961,7 +1961,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1983,7 +1983,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2667,7 +2667,7 @@ TEST_P(LSTMOpTest, BlackBoxTestWithAuxInputZeroAuxWeight) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -2690,7 +2690,7 @@ TEST_P(LSTMOpTest, BlackBoxTestWithAuxInputZeroAuxWeight) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -2841,7 +2841,7 @@ TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -2864,7 +2864,7 @@ TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index aa6a112a022..b941f2237ca 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -407,7 +407,8 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
   const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* cell_gate_bias =
+      GetInput(context, node, kCellGateBiasTensor);
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
 
@@ -446,10 +447,10 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   int16_t* layer_norm_forget_weight_ptr = nullptr;
   int16_t* layer_norm_cell_weight_ptr = nullptr;
   int16_t* layer_norm_output_weight_ptr = nullptr;
-  int32_t* input_bias_ptr = nullptr;
-  int32_t* forget_bias_ptr = nullptr;
-  int32_t* cell_bias_ptr = nullptr;
-  int32_t* output_bias_ptr = nullptr;
+  int32_t* input_gate_bias_ptr = nullptr;
+  int32_t* forget_gate_bias_ptr = nullptr;
+  int32_t* cell_gate_bias_ptr = nullptr;
+  int32_t* output_gate_bias_ptr = nullptr;
   int32_t* proj_bias_ptr = nullptr;
   int16_t* cell_ptr = nullptr;
   int8_t* output_state_ptr = nullptr;
@@ -497,7 +498,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   if (!use_cifg) {
     input_to_input_weight_ptr = input_to_input_weights->data.int8;
     recurrent_to_input_weight_ptr = recurrent_to_input_weights->data.int8;
-    input_bias_ptr = input_gate_bias->data.i32;
+    input_gate_bias_ptr = input_gate_bias->data.i32;
     input_to_input_weight_scale = input_to_input_weights->params.scale;
     recurrent_to_input_weight_scale = recurrent_to_input_weights->params.scale;
   }
@@ -547,9 +548,9 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale;
   recurrent_to_output_weight_ptr = recurrent_to_output_weights->data.int8;
   recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale;
-  forget_bias_ptr = forget_gate_bias->data.i32;
-  cell_bias_ptr = cell_bias->data.i32;
-  output_bias_ptr = output_gate_bias->data.i32;
+  forget_gate_bias_ptr = forget_gate_bias->data.i32;
+  cell_gate_bias_ptr = cell_gate_bias->data.i32;
+  output_gate_bias_ptr = output_gate_bias->data.i32;
   output_state_ptr = output_state->data.int8;
   cell_ptr = cell_state->data.i16;
   input_scale = input->params.scale;
@@ -875,13 +876,14 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteFloat32);
   }
 
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+  const TfLiteTensor* cell_gate_bias =
+      GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->data[0], n_cell);
   if (is_integer) {
-    TF_LITE_ENSURE_TYPES_EQ(context, cell_bias->type, kTfLiteInt32);
+    TF_LITE_ENSURE_TYPES_EQ(context, cell_gate_bias->type, kTfLiteInt32);
   } else {
-    TF_LITE_ENSURE_TYPES_EQ(context, cell_bias->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_TYPES_EQ(context, cell_gate_bias->type, kTfLiteFloat32);
   }
 
   const TfLiteTensor* output_gate_bias =
@@ -1526,7 +1528,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
   const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* cell_gate_bias =
+      GetInput(context, node, kCellGateBiasTensor);
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
 
@@ -1560,8 +1563,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*aux_input_to_forget_weights=*/nullptr,
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, params, /*forward_sequence=*/true,
+          forget_gate_bias, cell_gate_bias, output_gate_bias,
+          projection_weights, projection_bias, params,
+          /*forward_sequence=*/true,
           /*time_major=*/true,
           /*output_offset=*/0, scratch_buffer, output_state, cell_state,
           output);
@@ -1603,8 +1607,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
             /*aux_input_to_forget_weights=*/nullptr,
             /*aux_input_to_cell_weights=*/nullptr,
             /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
-            forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-            projection_bias, params, /*forward_sequence=*/true,
+            forget_gate_bias, cell_gate_bias, output_gate_bias,
+            projection_weights, projection_bias, params,
+            /*forward_sequence=*/true,
             /*time_major=*/true, /*output_offset=*/0, scratch_buffer,
             scaling_factors, prod_scaling_factors, recovered_cell_weights,
             input_quantized,
@@ -1631,10 +1636,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
               cell_to_output_weights, input_layer_norm_coefficients,
               forget_layer_norm_coefficients, cell_layer_norm_coefficients,
               output_layer_norm_coefficients, input_gate_bias, forget_gate_bias,
-              cell_bias, output_gate_bias, projection_weights, projection_bias,
-              params, &op_data->integer_lstm_param, output_state, cell_state,
-              output, scratch0, scratch1, scratch2, scratch3, scratch4,
-              scratch5, CpuBackendContext::GetFromContext(context));
+              cell_gate_bias, output_gate_bias, projection_weights,
+              projection_bias, params, &op_data->integer_lstm_param,
+              output_state, cell_state, output, scratch0, scratch1, scratch2,
+              scratch3, scratch4, scratch5,
+              CpuBackendContext::GetFromContext(context));
         } else {
           TfLiteTensor* scratch0 = GetTemporary(context, node, /*index=*/0);
           TfLiteTensor* scratch1 = GetTemporary(context, node, /*index=*/1);
@@ -1653,8 +1659,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
               cell_to_output_weights, input_layer_norm_coefficients,
               forget_layer_norm_coefficients, cell_layer_norm_coefficients,
               output_layer_norm_coefficients, input_gate_bias, forget_gate_bias,
-              cell_bias, output_gate_bias, projection_weights, projection_bias,
-              params, output_state, cell_state, output,
+              cell_gate_bias, output_gate_bias, projection_weights,
+              projection_bias, params, output_state, cell_state, output,
               &op_data->integer_lstm_param, scratch0, scratch1, scratch2,
               scratch3, scratch4, scratch5, scratch6, scratch7);
           return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index f45d46762bf..9bdbfa9d48d 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -942,10 +942,10 @@ inline void LstmStepHybrid(
 //   effective_proj_scale_b                  - optional
 //
 // Gate biases of size 'n_cell':
-//   input_bias_ptr                 - optional
-//   forget_bias_ptr
+//   input_gate_bias_ptr                 - optional
+//   forget_gate_bias_ptr
 //   cell_gate_bias_ptr
-//   output_bias_ptr
+//   output_gate_bias_ptr
 //
 // Layer norm coefficients of size 'n_cell', representing diagonal matrices.
 //   layer_norm_input_weight_ptr    - optional
@@ -1031,8 +1031,8 @@ inline void LstmStepInteger(
     int32_t layer_norm_cell_scale_b,
     const int16_t* layer_norm_output_weight_ptr,
     int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b,
-    const int32_t* input_bias_ptr, const int32_t* forget_bias_ptr,
-    const int32_t* cell_gate_bias_ptr, const int32_t* output_bias_ptr,
+    const int32_t* input_gate_bias_ptr, const int32_t* forget_gate_bias_ptr,
+    const int32_t* cell_gate_bias_ptr, const int32_t* output_gate_bias_ptr,
     int16_t quantized_cell_clip, int8_t quantized_proj_clip, int32_t cell_scale,
     int32_t input_variance_guard, int32_t forget_variance_guard,
     int32_t cell_variance_guard, int32_t output_variance_guard,
@@ -1098,7 +1098,7 @@ inline void LstmStepInteger(
 
   if (use_layer_norm) {
     tensor_utils::ApplyLayerNorm(
-        scratch_1_ptr, layer_norm_forget_weight_ptr, forget_bias_ptr,
+        scratch_1_ptr, layer_norm_forget_weight_ptr, forget_gate_bias_ptr,
         layer_norm_forget_scale_a, layer_norm_forget_scale_b,
         forget_variance_guard, n_batch, n_cell, scratch_1_ptr);
   }
@@ -1149,7 +1149,7 @@ inline void LstmStepInteger(
 
     if (use_layer_norm) {
       tensor_utils::ApplyLayerNorm(
-          scratch_0_ptr, layer_norm_input_weight_ptr, input_bias_ptr,
+          scratch_0_ptr, layer_norm_input_weight_ptr, input_gate_bias_ptr,
           layer_norm_input_scale_a, layer_norm_input_scale_b,
           input_variance_guard, n_batch, n_cell, scratch_0_ptr);
     }
@@ -1190,7 +1190,7 @@ inline void LstmStepInteger(
 
   if (use_layer_norm) {
     tensor_utils::ApplyLayerNorm(
-        scratch_3_ptr, layer_norm_output_weight_ptr, output_bias_ptr,
+        scratch_3_ptr, layer_norm_output_weight_ptr, output_gate_bias_ptr,
         layer_norm_output_scale_a, layer_norm_output_scale_b,
         output_variance_guard, n_batch, n_cell, scratch_3_ptr);
   }
@@ -1268,10 +1268,10 @@ inline void LstmStepInteger(
 //   effective_proj_scale_b                  - optional
 //
 // Gate biases of size 'n_cell':
-//   input_bias_ptr                 - optional
-//   forget_bias_ptr
+//   input_gate_bias_ptr                 - optional
+//   forget_gate_bias_ptr
 //   cell_gate_bias_ptr
-//   output_bias_ptr
+//   output_gate_bias_ptr
 //
 // Layer norm coefficients of size 'n_cell', representing diagonal matrices.
 //   layer_norm_input_weight_ptr    - optional
@@ -1358,8 +1358,8 @@ void LstmStepInteger(
     int32_t layer_norm_cell_scale_b,
     const int16_t* layer_norm_output_weight_ptr,
     int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b,
-    const int32_t* input_bias_ptr, const int32_t* forget_bias_ptr,
-    const int32_t* cell_gate_bias_ptr, const int32_t* output_bias_ptr,
+    const int32_t* input_gate_bias_ptr, const int32_t* forget_gate_bias_ptr,
+    const int32_t* cell_gate_bias_ptr, const int32_t* output_gate_bias_ptr,
     const int32_t* proj_bias_ptr, const TfLiteLSTMParams* params,
     const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b,
     const int32_t* intermediate_zp, int16_t quantized_cell_clip,
@@ -1391,7 +1391,8 @@ void LstmStepInteger(
   // Forget gate layer norm.
   tensor_utils::ApplyLayerNormFloat(
       scratch2, layer_norm_forget_weight_ptr, layer_norm_forget_scale_a,
-      layer_norm_forget_scale_b, forget_bias_ptr, n_batch, n_cell, scratch2);
+      layer_norm_forget_scale_b, forget_gate_bias_ptr, n_batch, n_cell,
+      scratch2);
 
   // Forget gate sigmoid.
   tensor_utils::ApplySigmoidFloat(scratch2, n_batch, n_cell, scratch2);
@@ -1444,7 +1445,8 @@ void LstmStepInteger(
   // Output gate with layer norm.
   tensor_utils::ApplyLayerNormFloat(
       scratch4, layer_norm_output_weight_ptr, layer_norm_output_scale_a,
-      layer_norm_output_scale_b, output_bias_ptr, n_batch, n_cell, scratch4);
+      layer_norm_output_scale_b, output_gate_bias_ptr, n_batch, n_cell,
+      scratch4);
 
   // Output gate sigmoid.
   tensor_utils::ApplySigmoidFloat(scratch4, n_batch, n_cell, scratch4);
@@ -1512,7 +1514,7 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* aux_input_to_cell_weights,
     const TfLiteTensor* aux_input_to_output_weights,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
     int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state,
@@ -1595,7 +1597,7 @@ TfLiteStatus EvalFloat(
           GetTensorData(output_layer_norm_coefficients),
           GetTensorData(input_gate_bias),
           GetTensorData(forget_gate_bias),
-          GetTensorData(cell_bias),
+          GetTensorData(cell_gate_bias),
           GetTensorData(output_gate_bias),
           GetTensorData(projection_weights),
           GetTensorData(projection_bias), params, n_batch, n_cell,
@@ -1656,7 +1658,7 @@ TfLiteStatus EvalFloat(
             GetTensorData(output_layer_norm_coefficients),
             GetTensorData(input_gate_bias),
             GetTensorData(forget_gate_bias),
-            GetTensorData(cell_bias),
+            GetTensorData(cell_gate_bias),
             GetTensorData(output_gate_bias),
             GetTensorData(projection_weights),
             GetTensorData(projection_bias), params, /*n_batch=*/1,
@@ -1693,7 +1695,7 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* aux_input_to_cell_weights,
     const TfLiteTensor* aux_input_to_output_weights,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
     int output_offset, TfLiteTensor* scratch_buffer,
@@ -1802,7 +1804,7 @@ TfLiteStatus EvalHybrid(
           GetTensorData(output_layer_norm_coefficients),
           GetTensorData(input_gate_bias),
           GetTensorData(forget_gate_bias),
-          GetTensorData(cell_bias),
+          GetTensorData(cell_gate_bias),
           GetTensorData(output_gate_bias),
           GetTensorData(projection_weights),
           GetTensorScale(projection_weights),
@@ -1888,7 +1890,7 @@ TfLiteStatus EvalHybrid(
             GetTensorData(output_layer_norm_coefficients),
             GetTensorData(input_gate_bias),
             GetTensorData(forget_gate_bias),
-            GetTensorData(cell_bias),
+            GetTensorData(cell_gate_bias),
             GetTensorData(output_gate_bias),
             GetTensorData(projection_weights),
             GetTensorScale(projection_weights),
@@ -1930,7 +1932,7 @@ TfLiteStatus EvalInteger8x8_16(
     const TfLiteTensor* cell_layer_norm_coefficients,
     const TfLiteTensor* output_layer_norm_coefficients,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params,
     const lstm_eval::IntegerLstmParameter* integer_lstm_param,
@@ -2020,7 +2022,7 @@ TfLiteStatus EvalInteger8x8_16(
         integer_lstm_param->layer_norm_output_scale_b,
         GetTensorData(input_gate_bias),
         GetTensorData(forget_gate_bias),
-        GetTensorData(cell_bias),
+        GetTensorData(cell_gate_bias),
         GetTensorData(output_gate_bias),
         integer_lstm_param->quantized_cell_clip,
         integer_lstm_param->quantized_proj_clip, integer_lstm_param->cell_scale,
@@ -2065,7 +2067,7 @@ TfLiteStatus EvalInteger8x8_8(
     const TfLiteTensor* cell_layer_norm_coefficients,
     const TfLiteTensor* output_layer_norm_coefficients,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, TfLiteTensor* output_state,
     TfLiteTensor* cell_state, TfLiteTensor* output,
@@ -2120,10 +2122,12 @@ TfLiteStatus EvalInteger8x8_8(
       GetTensorData(cell_layer_norm_coefficients);
   const int16_t* layer_norm_output_weight_ptr =
       GetTensorData(output_layer_norm_coefficients);
-  const int32_t* input_bias_ptr = GetTensorData(input_gate_bias);
-  const int32_t* forget_bias_ptr = GetTensorData(forget_gate_bias);
-  const int32_t* cell_gate_bias_ptr = GetTensorData(cell_bias);
-  const int32_t* output_bias_ptr = GetTensorData(output_gate_bias);
+  const int32_t* input_gate_bias_ptr = GetTensorData(input_gate_bias);
+  const int32_t* forget_gate_bias_ptr =
+      GetTensorData(forget_gate_bias);
+  const int32_t* cell_gate_bias_ptr = GetTensorData(cell_gate_bias);
+  const int32_t* output_gate_bias_ptr =
+      GetTensorData(output_gate_bias);
   const int32_t* proj_bias_ptr = GetTensorData(projection_bias);
   int16_t* cell_ptr = GetTensorData(cell_state);
   int8_t* output_state_ptr = GetTensorData(output_state);
@@ -2209,8 +2213,8 @@ TfLiteStatus EvalInteger8x8_8(
         integer_lstm_param->layer_norm_output_scale_a,
         integer_lstm_param->layer_norm_output_scale_b,
 
-        input_bias_ptr, forget_bias_ptr, cell_gate_bias_ptr, output_bias_ptr,
-        proj_bias_ptr,
+        input_gate_bias_ptr, forget_gate_bias_ptr, cell_gate_bias_ptr,
+        output_gate_bias_ptr, proj_bias_ptr,
 
         params, integer_lstm_param->intermediate_scale_a,
         integer_lstm_param->intermediate_scale_b,
diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h
index 3c9b4bccf42..9b3bd0c54ec 100644
--- a/tensorflow/lite/kernels/lstm_eval.h
+++ b/tensorflow/lite/kernels/lstm_eval.h
@@ -117,7 +117,7 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* aux_input_to_cell_weights,
     const TfLiteTensor* aux_input_to_output_weights,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
     int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state,
@@ -145,7 +145,7 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* aux_input_to_cell_weights,
     const TfLiteTensor* aux_input_to_output_weights,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
     int output_offset, TfLiteTensor* scratch_buffer,
@@ -174,7 +174,7 @@ TfLiteStatus EvalInteger8x8_16(
     const TfLiteTensor* cell_layer_norm_coefficients,
     const TfLiteTensor* output_layer_norm_coefficients,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params,
     const lstm_eval::IntegerLstmParameter* integer_lstm_param,
@@ -200,7 +200,7 @@ TfLiteStatus EvalInteger8x8_8(
     const TfLiteTensor* cell_layer_norm_coefficients,
     const TfLiteTensor* output_layer_norm_coefficients,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, TfLiteTensor* output_state,
     TfLiteTensor* cell_state, TfLiteTensor* output,
diff --git a/tensorflow/lite/kernels/lstm_eval_test.cc b/tensorflow/lite/kernels/lstm_eval_test.cc
index baf2e5e83df..78459117859 100644
--- a/tensorflow/lite/kernels/lstm_eval_test.cc
+++ b/tensorflow/lite/kernels/lstm_eval_test.cc
@@ -113,10 +113,10 @@ class BaseLstmParam {
     TfLiteIntArrayFree(layer_norm_forget_tensor_.dims);
     TfLiteIntArrayFree(layer_norm_cell_tensor_.dims);
     TfLiteIntArrayFree(layer_norm_output_tensor_.dims);
-    TfLiteIntArrayFree(input_bias_tensor_.dims);
-    TfLiteIntArrayFree(forget_bias_tensor_.dims);
-    TfLiteIntArrayFree(cell_bias_tensor_.dims);
-    TfLiteIntArrayFree(output_bias_tensor_.dims);
+    TfLiteIntArrayFree(input_gate_bias_tensor_.dims);
+    TfLiteIntArrayFree(forget_gate_bias_tensor_.dims);
+    TfLiteIntArrayFree(cell_gate_bias_tensor_.dims);
+    TfLiteIntArrayFree(output_gate_bias_tensor_.dims);
     TfLiteIntArrayFree(projection_tensor_.dims);
     TfLiteIntArrayFree(projection_bias_tensor_.dims);
     TfLiteIntArrayFree(activation_tensor_.dims);
@@ -275,17 +275,17 @@ class BaseLstmParam {
   std::vector layer_norm_output_size_ = {n_cell_};
   TfLiteTensor layer_norm_output_tensor_;
 
-  std::vector input_bias_size_ = {n_cell_};
-  TfLiteTensor input_bias_tensor_;
+  std::vector input_gate_bias_size_ = {n_cell_};
+  TfLiteTensor input_gate_bias_tensor_;
 
-  std::vector forget_bias_size_ = {n_cell_};
-  TfLiteTensor forget_bias_tensor_;
+  std::vector forget_gate_bias_size_ = {n_cell_};
+  TfLiteTensor forget_gate_bias_tensor_;
 
-  std::vector cell_bias_size_ = {n_cell_};
-  TfLiteTensor cell_bias_tensor_;
+  std::vector cell_gate_bias_size_ = {n_cell_};
+  TfLiteTensor cell_gate_bias_tensor_;
 
-  std::vector output_bias_size_ = {n_cell_};
-  TfLiteTensor output_bias_tensor_;
+  std::vector output_gate_bias_size_ = {n_cell_};
+  TfLiteTensor output_gate_bias_tensor_;
 
   // projection_weights.
   std::vector projection_ = {
@@ -350,24 +350,28 @@ class QuantizedLstmParam : public BaseLstmParam {
     return &layer_norm_output_tensor_;
   }
   TfLiteTensor* GetInputBias() {
-    PackWeightToTensor(&input_bias_tensor_, input_bias_, input_bias_size_);
-    input_bias_tensor_.data.i32 = input_bias_.data();
-    return &input_bias_tensor_;
+    PackWeightToTensor(&input_gate_bias_tensor_, input_gate_bias_,
+                       input_gate_bias_size_);
+    input_gate_bias_tensor_.data.i32 = input_gate_bias_.data();
+    return &input_gate_bias_tensor_;
   }
   TfLiteTensor* GetForgetBias() {
-    PackWeightToTensor(&forget_bias_tensor_, forget_bias_, forget_bias_size_);
-    forget_bias_tensor_.data.i32 = forget_bias_.data();
-    return &forget_bias_tensor_;
+    PackWeightToTensor(&forget_gate_bias_tensor_, forget_gate_bias_,
+                       forget_gate_bias_size_);
+    forget_gate_bias_tensor_.data.i32 = forget_gate_bias_.data();
+    return &forget_gate_bias_tensor_;
   }
   TfLiteTensor* GetCellBias() {
-    PackWeightToTensor(&cell_bias_tensor_, cell_bias_, cell_bias_size_);
-    cell_bias_tensor_.data.i32 = cell_bias_.data();
-    return &cell_bias_tensor_;
+    PackWeightToTensor(&cell_gate_bias_tensor_, cell_gate_bias_,
+                       cell_gate_bias_size_);
+    cell_gate_bias_tensor_.data.i32 = cell_gate_bias_.data();
+    return &cell_gate_bias_tensor_;
   }
   TfLiteTensor* GetOutputBias() {
-    PackWeightToTensor(&output_bias_tensor_, output_bias_, output_bias_size_);
-    output_bias_tensor_.data.i32 = output_bias_.data();
-    return &output_bias_tensor_;
+    PackWeightToTensor(&output_gate_bias_tensor_, output_gate_bias_,
+                       output_gate_bias_size_);
+    output_gate_bias_tensor_.data.i32 = output_gate_bias_.data();
+    return &output_gate_bias_tensor_;
   }
   TfLiteTensor* GetProjectionBias() {
     PackWeightToTensor(&projection_bias_tensor_, projection_bias_,
@@ -539,22 +543,22 @@ class QuantizedLstmParam : public BaseLstmParam {
   };
 
   // input_gate_bias.
-  std::vector input_bias_ = {
+  std::vector input_gate_bias_ = {
       16, 4, 5, 6, 1, 1, 3, 4, -5, 6,  //
   };
 
   // forget_gate_bias.
-  std::vector forget_bias_ = {
+  std::vector forget_gate_bias_ = {
       16, 4, 5, 6, 1, 1, 3, 4, -5, 6,  //
   };
 
-  // cell_bias.
-  std::vector cell_bias_ = {
+  // cell_gate_bias.
+  std::vector cell_gate_bias_ = {
       16, 4, 5, 6, 1, 1, 3, 4, -5, 6,  //
   };
 
   // output_gate_bias.
-  std::vector output_bias_ = {
+  std::vector output_gate_bias_ = {
       16, 4, 5, 6, 1, 1, 3, 4, -5, 6,  //
   };
 
@@ -711,27 +715,28 @@ class HybridLstmParam : public BaseLstmParam {
     return &accum_scratch_tensor_;
   }
   TfLiteTensor* GetInputBias() {
-    PackWeightToTensor(&input_bias_tensor_, input_float_bias_,
-                       input_bias_size_);
-    input_bias_tensor_.data.f = input_float_bias_.data();
-    return &input_bias_tensor_;
+    PackWeightToTensor(&input_gate_bias_tensor_, input_float_bias_,
+                       input_gate_bias_size_);
+    input_gate_bias_tensor_.data.f = input_float_bias_.data();
+    return &input_gate_bias_tensor_;
   }
   TfLiteTensor* GetForgetBias() {
-    PackWeightToTensor(&forget_bias_tensor_, forget_float_bias_,
-                       forget_bias_size_);
-    forget_bias_tensor_.data.f = forget_float_bias_.data();
-    return &forget_bias_tensor_;
+    PackWeightToTensor(&forget_gate_bias_tensor_, forget_float_bias_,
+                       forget_gate_bias_size_);
+    forget_gate_bias_tensor_.data.f = forget_float_bias_.data();
+    return &forget_gate_bias_tensor_;
   }
   TfLiteTensor* GetCellBias() {
-    PackWeightToTensor(&cell_bias_tensor_, cell_float_bias_, cell_bias_size_);
-    cell_bias_tensor_.data.f = cell_float_bias_.data();
-    return &cell_bias_tensor_;
+    PackWeightToTensor(&cell_gate_bias_tensor_, cell_float_bias_,
+                       cell_gate_bias_size_);
+    cell_gate_bias_tensor_.data.f = cell_float_bias_.data();
+    return &cell_gate_bias_tensor_;
   }
   TfLiteTensor* GetOutputBias() {
-    PackWeightToTensor(&output_bias_tensor_, output_float_bias_,
-                       output_bias_size_);
-    output_bias_tensor_.data.f = output_float_bias_.data();
-    return &output_bias_tensor_;
+    PackWeightToTensor(&output_gate_bias_tensor_, output_float_bias_,
+                       output_gate_bias_size_);
+    output_gate_bias_tensor_.data.f = output_float_bias_.data();
+    return &output_gate_bias_tensor_;
   }
   TfLiteTensor* GetProjectionBias() {
     PackWeightToTensor(&projection_bias_tensor_, projection_float_bias_,
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index f8594f9adf0..a9023dce371 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -89,7 +89,7 @@ class LSTMOpModel : public SingleOpModel {
       input_gate_bias_ = AddInput(TensorType_FLOAT32);
     }
     forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
+    cell_gate_bias_ = AddInput(TensorType_FLOAT32);
     output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
@@ -211,7 +211,7 @@ class LSTMOpModel : public SingleOpModel {
   }
 
   void SetCellBias(const std::vector& f) {
-    PopulateTensor(cell_bias_, f);
+    PopulateTensor(cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(const std::vector& f) {
@@ -261,7 +261,7 @@ class LSTMOpModel : public SingleOpModel {
 
   int input_gate_bias_;
   int forget_gate_bias_;
-  int cell_bias_;
+  int cell_gate_bias_;
   int output_gate_bias_;
 
   int projection_weights_;
@@ -498,7 +498,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -545,7 +545,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingOmittedLayerNormLstmTest,
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -601,7 +601,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -652,7 +652,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingLstmInt8Test,
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -743,7 +743,7 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
                        {0},       // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -791,7 +791,7 @@ TEST_P(CifgNoPeepholeNoProjectionNoClippingLstmTest,
 
                        {0},       // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -840,7 +840,7 @@ TEST_P(CifgNoPeepholeNoProjectionNoClippingLstmInt8Test,
 
                        {0},       // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -1481,7 +1481,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {n_output, n_cell},  // projection_weight tensor
@@ -1528,7 +1528,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLstmTest,
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {n_output, n_cell},  // projection_weight tensor
@@ -1577,7 +1577,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLstmInt8Test,
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {n_output, n_cell},  // projection_weight tensor
@@ -1689,7 +1689,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1760,7 +1760,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1833,7 +1833,7 @@ TEST_P(NoCifgPeepholeProjectionNoClippingLayerNormLstmInt8Test,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1947,7 +1947,7 @@ TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2018,7 +2018,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2090,7 +2090,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmInt8Test,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2195,8 +2195,8 @@ class LSTMIntegerOpModel : public SingleOpModel {
     }
     forget_gate_bias_ = AddInput({TensorType_INT32, input_shapes[13],
                                   ranges[13].first, ranges[13].second});
-    cell_bias_ = AddInput({TensorType_INT32, input_shapes[14], ranges[14].first,
-                           ranges[14].second});
+    cell_gate_bias_ = AddInput({TensorType_INT32, input_shapes[14],
+                                ranges[14].first, ranges[14].second});
     output_gate_bias_ = AddInput({TensorType_INT32, input_shapes[15],
                                   ranges[15].first, ranges[15].second});
 
@@ -2330,7 +2330,7 @@ class LSTMIntegerOpModel : public SingleOpModel {
   }
 
   void SetCellBias(const std::vector& f) {
-    QuantizeAndPopulate(cell_bias_, f);
+    QuantizeAndPopulate(cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(const std::vector& f) {
@@ -2379,7 +2379,7 @@ class LSTMIntegerOpModel : public SingleOpModel {
 
   int input_gate_bias_;
   int forget_gate_bias_;
-  int cell_bias_;
+  int cell_gate_bias_;
   int output_gate_bias_;
 
   int projection_weights_;
@@ -2473,7 +2473,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionNoPeephole) {
 
       {n_cell},  // input_gate_bias tensor
       {n_cell},  // forget_gate_bias tensor
-      {n_cell},  // cell_bias tensor
+      {n_cell},  // cell_gate_bias tensor
       {n_cell},  // output_gate_bias tensor
 
       {n_output, n_cell},  // projection_weight tensor
@@ -2507,7 +2507,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionNoPeephole) {
 
       {-100, 100},  // input_gate_bias tensor
       {-100, 100},  // forget_gate_bias tensor
-      {-100, 100},  // cell_bias tensor
+      {-100, 100},  // cell_gate_bias tensor
       {-100, 100},  // output_gate_bias tensor
 
       {-0.5, 0.5},  // projection_weight tensor
@@ -2675,7 +2675,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionYesPeephole) {
 
       {n_cell},  // input_gate_bias tensor
       {n_cell},  // forget_gate_bias tensor
-      {n_cell},  // cell_bias tensor
+      {n_cell},  // cell_gate_bias tensor
       {n_cell},  // output_gate_bias tensor
 
       {n_output, n_cell},  // projection_weight tensor
@@ -2709,7 +2709,7 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionYesPeephole) {
 
       {-100, 100},  // input_gate_bias tensor
       {-100, 80},   // forget_gate_bias tensor
-      {-100, 100},  // cell_bias tensor
+      {-100, 100},  // cell_gate_bias tensor
       {-100, 100},  // output_gate_bias tensor
 
       {-0.5, 0.5},  // projection_weight tensor
@@ -2869,8 +2869,8 @@ class LSTMIntegerOpModel8x8_8 : public SingleOpModel {
     }
     forget_gate_bias_ = AddInput({TensorType_INT32, input_shapes[13],
                                   ranges[13].first, ranges[13].second});
-    cell_bias_ = AddInput({TensorType_INT32, input_shapes[14], ranges[14].first,
-                           ranges[14].second});
+    cell_gate_bias_ = AddInput({TensorType_INT32, input_shapes[14],
+                                ranges[14].first, ranges[14].second});
     output_gate_bias_ = AddInput({TensorType_INT32, input_shapes[15],
                                   ranges[15].first, ranges[15].second});
 
@@ -3004,7 +3004,7 @@ class LSTMIntegerOpModel8x8_8 : public SingleOpModel {
   }
 
   void SetCellBias(const std::vector& f) {
-    QuantizeAndPopulate(cell_bias_, f);
+    QuantizeAndPopulate(cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(const std::vector& f) {
@@ -3053,7 +3053,7 @@ class LSTMIntegerOpModel8x8_8 : public SingleOpModel {
 
   int input_gate_bias_;
   int forget_gate_bias_;
-  int cell_bias_;
+  int cell_gate_bias_;
   int output_gate_bias_;
 
   int projection_weights_;
@@ -3148,7 +3148,7 @@ TEST(LSTMIntegerOpModel8x8_8, CifgYesLayerNormNoYesProjectionNoPeephole) {
 
       {0},       // input_gate_bias tensor
       {n_cell},  // forget_gate_bias tensor
-      {n_cell},  // cell_bias tensor
+      {n_cell},  // cell_gate_bias tensor
       {n_cell},  // output_gate_bias tensor
 
       {n_output, n_cell},  // projection_weight tensor
@@ -3182,7 +3182,7 @@ TEST(LSTMIntegerOpModel8x8_8, CifgYesLayerNormNoYesProjectionNoPeephole) {
 
       {-100, 100},  // input_gate_bias tensor
       {-100, 100},  // forget_gate_bias tensor
-      {-100, 100},  // cell_bias tensor
+      {-100, 100},  // cell_gate_bias tensor
       {-100, 100},  // output_gate_bias tensor
 
       {-0.5, 0.5},  // projection_weight tensor
@@ -3303,7 +3303,7 @@ TEST(LSTMOpModel, InvalidTypeTest) {
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
@@ -3338,7 +3338,7 @@ TEST(LSTMOpModel, InvalidTypeTest) {
 
                        {n_cell},  // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
diff --git a/tensorflow/lite/kernels/optional_tensor_test.cc b/tensorflow/lite/kernels/optional_tensor_test.cc
index 26d619276aa..9e83c74da8d 100644
--- a/tensorflow/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/lite/kernels/optional_tensor_test.cc
@@ -78,7 +78,7 @@ class LSTMOpModel : public SingleOpModel {
       input_gate_bias_ = AddInput(TensorType_FLOAT32);
     }
     forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
+    cell_gate_bias_ = AddInput(TensorType_FLOAT32);
     output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
@@ -161,7 +161,7 @@ class LSTMOpModel : public SingleOpModel {
   }
 
   void SetCellBias(std::initializer_list f) {
-    PopulateTensor(cell_bias_, f);
+    PopulateTensor(cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(std::initializer_list f) {
@@ -209,7 +209,7 @@ class LSTMOpModel : public SingleOpModel {
 
   int input_gate_bias_;
   int forget_gate_bias_;
-  int cell_bias_;
+  int cell_gate_bias_;
   int output_gate_bias_;
 
   int projection_weights_;
@@ -256,7 +256,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
 
                        {0},       // input_gate_bias tensor
                        {n_cell},  // forget_gate_bias tensor
-                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // cell_gate_bias tensor
                        {n_cell},  // output_gate_bias tensor
 
                        {0, 0},  // projection_weight tensor
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
index f1c0f9d42a6..0b2cba72369 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
@@ -179,10 +179,10 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
 
-  const TfLiteTensor* cell_bias =
+  const TfLiteTensor* cell_gate_bias =
       GetInput(context, node, lstm::full::kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->data[0], n_cell);
 
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, lstm::full::kOutputGateBiasTensor);
@@ -546,7 +546,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, lstm::full::kInputGateBiasTensor);
   const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, lstm::full::kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias =
+  const TfLiteTensor* cell_gate_bias =
       GetInput(context, node, lstm::full::kCellGateBiasTensor);
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, lstm::full::kOutputGateBiasTensor);
@@ -611,8 +611,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*aux_input_to_forget_weights=*/nullptr,
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, &lstm_params, /*forward_sequence=*/true, time_major,
+          forget_gate_bias, cell_gate_bias, output_gate_bias,
+          projection_weights, projection_bias, &lstm_params,
+          /*forward_sequence=*/true, time_major,
           /*output_offset=*/0, scratch_buffer, output_state, cell_state,
           output);
     }
@@ -648,8 +649,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*aux_input_to_forget_weights=*/nullptr,
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, &lstm_params, /*forward_sequence=*/true, time_major,
+          forget_gate_bias, cell_gate_bias, output_gate_bias,
+          projection_weights, projection_bias, &lstm_params,
+          /*forward_sequence=*/true, time_major,
           /*output_offset=*/0, scratch_buffer, scaling_factors,
           prod_scaling_factors, recovered_cell_weights, input_quantized,
           /*aux_input_quantized=*/nullptr, output_state_quantized,
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index ec20d76ae2e..74584ec9e85 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -85,7 +85,7 @@ class UnidirectionalLSTMOpModel : public SingleOpModel {
       input_gate_bias_ = AddInput(TensorType_FLOAT32);
     }
     forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
+    cell_gate_bias_ = AddInput(TensorType_FLOAT32);
     output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
@@ -187,7 +187,7 @@ class UnidirectionalLSTMOpModel : public SingleOpModel {
   }
 
   void SetCellBias(const std::vector& f) {
-    PopulateTensor(cell_bias_, f);
+    PopulateTensor(cell_gate_bias_, f);
   }
 
   void SetOutputGateBias(const std::vector& f) {
@@ -249,7 +249,7 @@ class UnidirectionalLSTMOpModel : public SingleOpModel {
 
   int input_gate_bias_;
   int forget_gate_bias_;
-  int cell_bias_;
+  int cell_gate_bias_;
   int output_gate_bias_;
 
   int projection_weights_;
@@ -530,7 +530,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -592,7 +592,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -658,7 +658,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -721,7 +721,7 @@ TEST_P(NoCifgNoPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -833,7 +833,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -894,7 +894,7 @@ TEST_P(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -957,7 +957,7 @@ TEST_P(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -1619,7 +1619,7 @@ TEST_F(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1688,7 +1688,7 @@ TEST_P(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -1759,7 +1759,7 @@ TEST_P(NoCifgPeepholeProjectionClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2430,7 +2430,7 @@ TEST_F(NoCifgPeepholeProjectionAndBiasClippingUnidirectionalLstmTest,
 
           {n_cell},  // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {n_output, n_cell},  // projection_weight tensor
@@ -2636,7 +2636,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLayerNormUnidirectionalLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
@@ -2707,7 +2707,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingUnidirectionalLstmTest,
 
           {0},       // input_gate_bias tensor
           {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
+          {n_cell},  // cell_gate_bias tensor
           {n_cell},  // output_gate_bias tensor
 
           {0, 0},  // projection_weight tensor
diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
index 88ea7c1d591..50138442c25 100644
--- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
+++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
@@ -299,7 +299,7 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* aux_input_to_cell_weights,
     const TfLiteTensor* aux_input_to_output_weights,
     const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* cell_gate_bias, const TfLiteTensor* output_gate_bias,
     const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
     const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
     int output_offset, TfLiteTensor* scratch_buffer, TfLiteTensor* output_state,
@@ -384,7 +384,7 @@ TfLiteStatus EvalFloat(
           GetTensorData(output_layer_norm_coefficients),
           GetTensorData(input_gate_bias),
           GetTensorData(forget_gate_bias),
-          GetTensorData(cell_bias),
+          GetTensorData(cell_gate_bias),
           GetTensorData(output_gate_bias),
           GetTensorData(projection_weights),
           GetTensorData(projection_bias), params, n_batch, n_cell,
@@ -446,7 +446,7 @@ TfLiteStatus EvalFloat(
             GetTensorData(output_layer_norm_coefficients),
             GetTensorData(input_gate_bias),
             GetTensorData(forget_gate_bias),
-            GetTensorData(cell_bias),
+            GetTensorData(cell_gate_bias),
             GetTensorData(output_gate_bias),
             GetTensorData(projection_weights),
             GetTensorData(projection_bias), params, /*n_batch=*/1,
@@ -527,7 +527,7 @@ TfLiteStatus lstm_eval(TfLiteContext* context, TfLiteNode* node, Logger* logger,
       context, node, ops::builtin::lstm::full::kInputGateBiasTensor);
   const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, ops::builtin::lstm::full::kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias =
+  const TfLiteTensor* cell_gate_bias =
       GetInput(context, node, ops::builtin::lstm::full::kCellGateBiasTensor);
   const TfLiteTensor* output_gate_bias =
       GetInput(context, node, ops::builtin::lstm::full::kOutputGateBiasTensor);
@@ -570,8 +570,9 @@ TfLiteStatus lstm_eval(TfLiteContext* context, TfLiteNode* node, Logger* logger,
           /*aux_input_to_forget_weights=*/nullptr,
           /*aux_input_to_cell_weights=*/nullptr,
           /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, params, /*forward_sequence=*/true,
+          forget_gate_bias, cell_gate_bias, output_gate_bias,
+          projection_weights, projection_bias, params,
+          /*forward_sequence=*/true,
           /*time_major=*/true,
           /*output_offset=*/0, scratch_buffer, output_state, cell_state, output,
           logger, intermediate_tensor_indexes, error_reporter);

From 7cd6c3115badaf79fff2b8809cb4a0b49a5f9c7c Mon Sep 17 00:00:00 2001
From: Denisa Roberts 
Date: Fri, 19 Jun 2020 14:26:51 -0400
Subject: [PATCH 0636/1390] Allow gradient access to QR input

---
 tensorflow/python/eager/pywrap_gradient_exclusions.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_gradient_exclusions.cc b/tensorflow/python/eager/pywrap_gradient_exclusions.cc
index 7da45e36118..7e9f0b16334 100644
--- a/tensorflow/python/eager/pywrap_gradient_exclusions.cc
+++ b/tensorflow/python/eager/pywrap_gradient_exclusions.cc
@@ -50,7 +50,7 @@ auto OpGradientInfoInit(const T &a) {
 
 absl::optional> OpGradientUnusedInputIndices(
     const tensorflow::string &op_name) {
-  static std::array a = {{
+  static std::array a = {{
       {"Acosh"},
       {"AllToAll", 1, {0}},
       {"ApproximateEqual"},
@@ -222,7 +222,6 @@ absl::optional> OpGradientUnusedInputIndices(
       {"PlaceholderWithDefault"},
       {"PopulationCount"},
       {"PreventGradient"},
-      {"Qr"},
       {"QuantizeAndDequantize"},
       {"QuantizeAndDequantizeV2"},
       {"QuantizeAndDequantizeV3"},

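For context, a minimal usage sketch (assuming the public `tf.linalg.qr` and `tf.GradientTape` APIs; not part of this patch) of what removing `Qr` from the unused-input list enables -- the forward input stays available when computing gradients through a QR decomposition:

import tensorflow as tf

a = tf.random.normal([4, 3])
with tf.GradientTape() as tape:
  tape.watch(a)                # `a` is a plain tensor, so watch it explicitly
  q, r = tf.linalg.qr(a)       # full_matrices=False by default
  loss = tf.reduce_sum(r)
grad = tape.gradient(loss, a)  # gradient with respect to the QR input
print(grad.shape)              # (4, 3)
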
From 9aea03e98d2ef5416dff09f721d6d519ea83e74f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 11:39:56 -0700
Subject: [PATCH 0637/1390] Add StatefulRandom op to TF MLIR.

PiperOrigin-RevId: 317347849
Change-Id: I19b7b1b3157d065cc556cce7aba72b79ed9b776f
---
 .../mlir/tensorflow/ir/tf_generated_ops.td    | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index 3b1f3eec699..dcef99e6971 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -9190,6 +9190,32 @@ def TF_StackV2Op : TF_Op<"StackV2", []> {
   );
 }
 
+def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect]> {
+  let summary = [{
+Outputs deterministic pseudorandom random values from a uniform distribution.
+  }];
+
+  let description = [{
+The generated values follow a uniform distribution in the range `[0, 1)`. The
+lower bound 0 is included in the range, while the upper bound 1 is excluded.
+
+The outputs are a deterministic function of `shape` and `seed`.
+  }];
+
+  let arguments = (ins
+    TF_I32OrI64Tensor:$shape,
+    TF_I32OrI64Tensor:$seed
+  );
+
+  let results = (outs
+    TF_FpTensor:$output
+  );
+
+  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
+  TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>;
+  TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>;
+}
+
 def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> {
   let summary = "Stops gradient computation.";
 

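As a rough illustration (this snippet is not part of the patch; it uses the public `tf.random.stateless_uniform` API, which corresponds to the `StatelessRandomUniform` op defined above), the output is a pure function of `shape` and `seed`:

import tensorflow as tf

seed = tf.constant([7, 3], dtype=tf.int32)
x = tf.random.stateless_uniform(shape=[2, 2], seed=seed)  # values in [0, 1)
y = tf.random.stateless_uniform(shape=[2, 2], seed=seed)
print(bool(tf.reduce_all(tf.equal(x, y))))  # True: same shape and seed give identical values
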
From 02d312b25edc2d8999640d7fbe2a833c2eaab0d8 Mon Sep 17 00:00:00 2001
From: Ran Chen 
Date: Fri, 19 Jun 2020 11:44:48 -0700
Subject: [PATCH 0638/1390] *** Reason for rollback ***

breaks pip test

PiperOrigin-RevId: 317348830
Change-Id: Ia6968b2cd3bf001f6afbc4487234d88123d3d0a8
---
 .../python/distribute/multi_process_runner.py | 205 ++----------------
 .../distribute/multi_process_runner_test.py   |  69 +-----
 2 files changed, 25 insertions(+), 249 deletions(-)

diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py
index ce36287a9da..8699e59b410 100644
--- a/tensorflow/python/distribute/multi_process_runner.py
+++ b/tensorflow/python/distribute/multi_process_runner.py
@@ -67,7 +67,8 @@ except ImportError:
 # exception stack trace info is stored in exc_info to pass on to parent process
 # to be re-raised.
 _ProcessStatusInfo = collections.namedtuple(
-    '_ProcessStatusInfo', ['is_successful', 'exc_info', 'return_value'])
+    '_ProcessStatusInfo',
+    ['task_type', 'is_successful', 'exc_info', 'return_value'])
 
 # Information returned from a successful MultiProcessRunner run.
 MultiProcessRunnerResult = collections.namedtuple('MultiProcessRunnerResult',
@@ -123,7 +124,6 @@ class MultiProcessRunner(object):
                stream_stdout=True,
                list_stdout=False,
                use_dill_for_args=True,
-               daemon=False,
                args=None,
                kwargs=None):
     """Creates a multi-process runner.
@@ -157,7 +157,6 @@ class MultiProcessRunner(object):
       use_dill_for_args: Whether to use dill to pickle `args` and `kwargs`. dill
         can pickle more objects, but doesn't work with types in
         `multiprocessing` library like `Mutex`.
-      daemon: Whether to start processes as daemons.
       args: Positional arguments to be sent to functions run on processes.
       kwargs: Keyword arguments to be sent to functions run on processes.
 
@@ -189,7 +188,6 @@ class MultiProcessRunner(object):
     self._list_stdout = list_stdout
     self._dependence_on_chief = True
     self._use_dill_for_args = use_dill_for_args
-    self._daemon = daemon
     self._args = args or ()
     self._kwargs = kwargs or {}
 
@@ -270,8 +268,7 @@ class MultiProcessRunner(object):
         test_env=test_env,
         target=_ProcFunc(),
         args=(resources, test_env, proc_func, args, kwargs,
-              self._use_dill_for_args),
-        daemon=self._daemon)
+              self._use_dill_for_args))
     p.start()
     self._processes[(task_type, task_id)] = p
     self._outstanding_subprocess_count += 1
@@ -571,6 +568,7 @@ class _ProcFunc(object):
         time.sleep(0.1)
     self._resources.process_status_queue.put(
         _ProcessStatusInfo(
+            task_type=task_type,
             is_successful=True,
             exc_info=None,
             return_value=None))
@@ -630,9 +628,17 @@ class _ProcFunc(object):
     if test_env.v2_enabled:
       v2_compat.enable_v2_behavior()
 
-    with self._runtime_mode(test_env.executing_eagerly):
-      info = _run_contained(proc_func, args, kwargs)
-      self._resources.process_status_queue.put(info)
+    try:
+      with self._runtime_mode(test_env.executing_eagerly):
+        return_value = proc_func(*args, **kwargs)
+        is_successful = True
+        exc_info = None
+
+    except Exception:  # pylint: disable=broad-except
+      # Capture all exceptions to be reported to parent process.
+      return_value = None
+      is_successful = False
+      exc_info = sys.exc_info()
 
       # Re-raise the exception in addition to reporting it to the parent
       # process, so that even if `--test_timeout` flag is set and the
@@ -641,181 +647,16 @@ class _ProcFunc(object):
       # instead of silently suppressing the error due to early bazel
       # timeout. Raising an error in the subprocess produces stack trace in
       # the log, but the program continues running.
-      if not info.is_successful:
-        six.reraise(*info.exc_info)
+      raise
 
-      self._close_streaming()
-
-
-class MultiProcessPoolRunner(object):
-  """A utility class to start a process pool to simulate a cluster.
-
-  It's similar to MultiProcessRunner, but uses a pool of processes to avoid the
-  expensive initialization cost of Tensorflow.
-  """
-
-  def __init__(self, cluster_spec, initializer=None):
-    """Creates a multi-process pool runner.
-
-    Args:
-      cluster_spec: Dict for cluster spec. The following is an example of
-        cluster with three workers.
-        {"worker": ["worker0.example.com:2222",
-                    "worker1.example.com:2222",
-                    "worker2.example.com:2222"]}
-      initializer: a callable to called at the startup of worker processes.
-
-    Raises:
-      RuntimeError: if `multi_process_runner.test_main()` is not called.
-      ValueError: if there are more than one chief in the `cluster_spec`.
-    """
-    self._cluster_spec = cluster_spec
-    self._initializer = initializer
-    self._conn = {}
-    self._runner = None
-
-  def __del__(self):
-    self._reset()
-
-  def _reset(self):
-    for conn in self._conn.values():
-      conn.close()
-    self._conn = {}
-    if self._runner is not None:
-      self._runner.join()
-      self._runner = None
-
-  def _start(self):
-    """Starts the worker pool."""
-    # We need different arguments for different processes so we're passing a
-    # no-op proc_func here and use start_single_process instead.
-    #
-    # We also need to start the process pool as daemon, so that they don't block
-    # the program from exiting. Note that __del__ may not get called when
-    # there's an exception. The user may also store a pool runner in a global
-    # object to share across test cases
-    self._runner = MultiProcessRunner(
-        proc_func=lambda: None,
-        cluster_spec=self._cluster_spec,
-        use_dill_for_args=False,
-        daemon=True)
-    if self._initializer:
-      initializer = dill.dumps(self._initializer, dill.HIGHEST_PROTOCOL)
-    else:
-      initializer = None
-    for task_type, addresses in self._cluster_spec.items():
-      for task_id, _ in enumerate(addresses):
-        conn1, conn2 = multiprocessing.Pipe(duplex=True)
-        self._conn[(task_type, task_id)] = conn1
-        self._runner.start_single_process(
-            task_type,
-            task_id,
-            proc_func=_pool_runner_worker,
-            args=(initializer, conn2))
-
-  def run(self, proc_func, args=None, kwargs=None):
-    """Runs `proc_func` with `args` and `kwargs` on all jobs.
-
-    Args:
-      proc_func: The function to be run.
-      args: Optional positional arguments to be supplied in `proc_func`.
-      kwargs: Optional keyword arguments to be supplied in `proc_func`.
-
-    Returns:
-      A list of return values.
-    """
-    if self._runner is None:
-      self._start()
-
-    # Since we start the processes as daemon they're going to be killed by
-    # SIGTERM when the program exits. We only turn on streaming during run() to
-    # avoid printing the stacktrace caused by the SIGTERM.
-    self._runner._stream_stdout = True  # pylint: disable=protected-access
-
-    try:
-      proc_func = dill.dumps(proc_func, dill.HIGHEST_PROTOCOL)
-      for conn in self._conn.values():
-        conn.send((proc_func, args or [], kwargs or {}))
-
-      process_statuses = []
-      for (task_type, task_id), conn in self._conn.items():
-        logging.info('Waiting for the result from %s-%d', task_type, task_id)
-        try:
-          process_statuses.append(conn.recv())
-        except EOFError:
-          # This shouldn't happen due to exceptions in proc_func. This usually
-          # means bugs in the runner.
-          self._reset()
-          raise RuntimeError('Unexpected EOF. Worker process may have died. '
-                             'Please report a bug')
-
-      return_values = []
-      for process_status in process_statuses:
-        assert isinstance(process_status, _ProcessStatusInfo)
-        if not process_status.is_successful:
-          six.reraise(*process_status.exc_info)
-        if process_status.return_value is not None:
-          return_values.append(process_status.return_value)
-
-      return return_values
     finally:
-      self._runner._stream_stdout = False  # pylint: disable=protected-access
-
-
-def _pool_runner_worker(initializer, conn):
-  """Function that runs on the workers in a pool.
-
-  It listens for callables to run and returns the result until `conn` is closed.
-  It captures the exceptions during executing the callable and return it through
-  `conn`.
-
-  Args:
-    initializer: A callable to execute during startup.
-    conn: A multiprocessing.Connection object to listen for tasks and send
-      results.
-  """
-  if initializer:
-    initializer = dill.loads(initializer)
-    initializer()
-  while True:
-    try:
-      proc_func, args, kwargs = conn.recv()
-    except EOFError:
-      break
-    proc_func = dill.loads(proc_func)
-    info = _run_contained(proc_func, args, kwargs)
-    sys.stdout.flush()
-    sys.stderr.flush()
-    conn.send(info)
-
-
-def _run_contained(proc_func, args, kwargs):
-  """Runs `proc_func` with `args` and `kwargs`.
-
-  The function returns _ProcessStatusInfo which captures the return value and
-  the exception.
-
-  Args:
-    proc_func: The function to be run.
-    args: Optional positional arguments to be supplied in `proc_func`.
-    kwargs: Optional keyword arguments to be supplied in `proc_func`.
-
-  Returns:
-    a _ProcessStatusInfo.
-  """
-  try:
-    return_value = proc_func(*args, **kwargs)
-    is_successful = True
-    exc_info = None
-  except Exception:  # pylint: disable=broad-except
-    return_value = None
-    is_successful = False
-    exc_info = sys.exc_info()
-  finally:
-    return _ProcessStatusInfo(  # pylint: disable=lost-exception
-        is_successful=is_successful,
-        exc_info=exc_info,
-        return_value=return_value)
+      info = _ProcessStatusInfo(
+          task_type=test_env.task_type,
+          is_successful=is_successful,
+          exc_info=exc_info,
+          return_value=return_value)
+      self._resources.process_status_queue.put(info)
+      self._close_streaming()
 
 
 class SubprocessTimeoutError(RuntimeError):
diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py
index d76ef5a5a3c..aeba43b6b7c 100644
--- a/tensorflow/python/distribute/multi_process_runner_test.py
+++ b/tensorflow/python/distribute/multi_process_runner_test.py
@@ -22,8 +22,6 @@ import json
 import os
 import threading
 import time
-import unittest
-
 from absl import logging
 
 from tensorflow.python.distribute import multi_process_runner
@@ -47,7 +45,7 @@ def proc_func_that_adds_simple_return_data():
   return 'dummy_data'
 
 
-def proc_func_that_returns_args_and_kwargs(*args, **kwargs):
+def proc_func_that_return_args_and_kwargs(*args, **kwargs):
   return list(args) + list(kwargs.items())
 
 
@@ -55,20 +53,6 @@ def proc_func_with_barrier():
   return multi_process_runner.barrier()
 
 
-def proc_func_that_returns_pid():
-  return os.getpid()
-
-
-V = None
-
-
-def proc_func_that_sets_global(val):
-  global V
-  old_val = V
-  V = val
-  return old_val
-
-
 class MultiProcessRunnerTest(test.TestCase):
 
   def _worker_idx(self):
@@ -111,7 +95,7 @@ class MultiProcessRunnerTest(test.TestCase):
 
   def test_multi_process_runner_args_passed_correctly(self):
     return_value = multi_process_runner.run(
-        proc_func_that_returns_args_and_kwargs,
+        proc_func_that_return_args_and_kwargs,
         multi_worker_test_base.create_cluster_spec(num_workers=1),
         args=('a', 'b'),
         kwargs={
@@ -341,54 +325,5 @@ class MultiProcessRunnerTest(test.TestCase):
                 for line in list_to_assert))
 
 
-class MultiProcessPoolRunnerTest(test.TestCase):
-
-  def test_same_process_across_runs(self):
-    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
-    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
-    pid = runner.run(proc_func_that_returns_pid)
-    for _ in range(3):
-      self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid)
-
-  def test_exceptions_in_sub_process(self):
-    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
-    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
-    pid = runner.run(proc_func_that_returns_pid)
-    with self.assertRaisesRegexp(ValueError, 'This is an error.'):
-      runner.run(proc_func_that_errors)
-    self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid)
-
-  def test_tf_config(self):
-    cluster_spec = multi_worker_test_base.create_cluster_spec(
-        has_chief=True, num_workers=2)
-    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
-    result = runner.run(proc_func_that_adds_task_type_in_return_data)
-
-    job_count_dict = {'worker': 2, 'chief': 1}
-    for data in result:
-      job_count_dict[data] -= 1
-
-    self.assertEqual(job_count_dict['worker'], 0)
-    self.assertEqual(job_count_dict['chief'], 0)
-
-  @unittest.expectedFailure
-  def test_exception_in_main_process(self):
-    # When there's an exception in the main process, __del__() is not called.
-    # This test is to verify MultiProcessPoolRunner can cope with __del__() not
-    # being called.
-    cluster_spec = multi_worker_test_base.create_cluster_spec(
-        has_chief=True, num_workers=2)
-    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
-    runner.run(proc_func_that_returns_pid)
-    raise ValueError('failure')
-
-  def test_initializer(self):
-    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
-    runner = multi_process_runner.MultiProcessPoolRunner(
-        cluster_spec, initializer=lambda: proc_func_that_sets_global(1))
-    result = runner.run(proc_func_that_sets_global, args=(2,))
-    self.assertAllEqual(result, [1, 1])
-
-
 if __name__ == '__main__':
   multi_process_runner.test_main()

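For readers following the rollback, here is a condensed Python sketch of the status-reporting pattern that `_ProcFunc` keeps (field names follow the diff; the `run_and_report` helper is hypothetical): the user function runs, any exception is captured, and a `_ProcessStatusInfo` is always reported to the parent before the exception is re-raised in the subprocess:

import collections
import sys

_ProcessStatusInfo = collections.namedtuple(
    '_ProcessStatusInfo',
    ['task_type', 'is_successful', 'exc_info', 'return_value'])


def run_and_report(proc_func, task_type, status_queue, *args, **kwargs):
  try:
    return_value = proc_func(*args, **kwargs)
    is_successful, exc_info = True, None
  except Exception:  # pylint: disable=broad-except
    return_value, is_successful, exc_info = None, False, sys.exc_info()
    raise  # re-raise so the failure also shows up in the subprocess log
  finally:
    status_queue.put(_ProcessStatusInfo(
        task_type=task_type,
        is_successful=is_successful,
        exc_info=exc_info,
        return_value=return_value))
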
From 2417a15bf186cde76b601a53324ebbc2c9193124 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 11:46:26 -0700
Subject: [PATCH 0639/1390] Clean up post fusion using DCE.

PiperOrigin-RevId: 317349120
Change-Id: I479d9967323d86e924315d2b1302bafd01ed4151
---
 tensorflow/compiler/xla/service/BUILD                  | 1 +
 tensorflow/compiler/xla/service/multi_output_fusion.cc | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index acd35cbc153..2fd457e8e47 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1679,6 +1679,7 @@ cc_library(
     hdrs = ["multi_output_fusion.h"],
     deps = [
         ":hlo",
+        ":hlo_dce",
         ":hlo_pass",
         ":hlo_reachability",
         "//tensorflow/compiler/xla:debug_options_flags",
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index b95b27d6291..a21cec538d1 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
@@ -126,6 +127,10 @@ StatusOr MultiOutputFusion::Run(HloModule* module) {
   candidates_index_.clear();
   all_fusion_candidates_.clear();
   reachability_.reset();
+  if (changed) {
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
   return changed;
 }
 

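Conceptually (a toy Python sketch, not the XLA `HloDCE` pass itself), dead-code elimination after fusion drops instructions whose results can no longer reach any root output:

def dce(instructions, operands_of, roots):
  """Keeps only instructions reachable from `roots` through `operands_of`."""
  live, worklist = set(roots), list(roots)
  while worklist:
    inst = worklist.pop()
    for operand in operands_of.get(inst, ()):
      if operand not in live:
        live.add(operand)
        worklist.append(operand)
  return [inst for inst in instructions if inst in live]


# A fused op ('fusion') replaces 'mul'; 'mul' becomes dead and is removed.
print(dce(['a', 'b', 'mul', 'fusion'],
          {'mul': ['a', 'b'], 'fusion': ['a', 'b']},
          roots=['fusion']))  # ['a', 'b', 'fusion']
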
From f805153a25b00d12072bd728e91bb1621bfcf1b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 11:48:22 -0700
Subject: [PATCH 0640/1390] Fix regression that omitted -DNDEBUG in opt builds
 after toolchain refactoring.

Not adding -DNDEBUG leaves assertions enabled, which often makes LLVM 10x slower.

PiperOrigin-RevId: 317349500
Change-Id: I52df6ab5013ad5a02101dd96f18220e054c3e94c
---
 third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
index 082ed950b04..eb320a94201 100644
--- a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
+++ b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
@@ -583,7 +583,11 @@ def _features(cpu, compiler, ctx):
                     ),
                 ],
             ),
-            feature(name = "opt"),
+            feature(name = "disable-assertions"),
+            feature(
+                name = "opt",
+                implies = ["disable-assertions"],
+            ),
             feature(name = "fastbuild"),
             feature(name = "dbg"),
             feature(name = "supports_dynamic_linker", enabled = True),

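For intuition only (a loose analogy, not the toolchain change itself): just as -DNDEBUG compiles out C/C++ assert() checks, Python strips `assert` statements when run with `python -O`, so optimized builds avoid paying for validation work on every call:

# Run as `python demo.py` (asserts checked) vs `python -O demo.py` (asserts stripped).
def checked_mean(values):
  assert len(values) > 0, 'empty input'                       # dropped under -O
  assert all(isinstance(v, (int, float)) for v in values)     # O(n) extra work
  return sum(values) / len(values)

print(checked_mean([1.0, 2.0, 3.0]))  # 2.0
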
From f53e1aac6576b3f5e575f870da1fbdb286094444 Mon Sep 17 00:00:00 2001
From: Lu Wang 
Date: Fri, 19 Jun 2020 11:50:15 -0700
Subject: [PATCH 0641/1390] Stamp the version number for all metadata
 components simultaneously

PiperOrigin-RevId: 317349859
Change-Id: Ica912c3fb310889185c026e6d73ce4c69a9f0505
---
 .../lite/experimental/support/metadata/BUILD  | 24 ++++++++++-
 .../support/metadata/build_defs.bzl           | 43 +++++++++++++++++++
 .../experimental/support/metadata/cc/BUILD    | 13 +++++-
 .../metadata/cc/metadata_parser.h.template    | 28 ++++++++++++
 .../support/metadata/cc/test/BUILD            |  9 ++++
 .../metadata/cc/test/metadata_parser_test.cc  | 33 ++++++++++++++
 .../metadata/cc/test/metadata_version_test.cc |  2 +-
 .../experimental/support/metadata/java/BUILD  |  8 +++-
 .../support/metadata/MetadataExtractor.java   |  8 +---
 .../lite/support/metadata/MetadataParser.java | 27 ++++++++++++
 .../metadata/metadata_parser.py.template      | 26 +++++++++++
 .../support/metadata/metadata_parser_test.py  | 38 ++++++++++++++++
 .../support/metadata/metadata_schema.fbs      | 29 ++++++++-----
 13 files changed, 267 insertions(+), 21 deletions(-)
 create mode 100644 tensorflow/lite/experimental/support/metadata/build_defs.bzl
 create mode 100644 tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template
 create mode 100644 tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc
 create mode 100644 tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java
 create mode 100644 tensorflow/lite/experimental/support/metadata/metadata_parser.py.template
 create mode 100644 tensorflow/lite/experimental/support/metadata/metadata_parser_test.py

diff --git a/tensorflow/lite/experimental/support/metadata/BUILD b/tensorflow/lite/experimental/support/metadata/BUILD
index 4621c8c55d2..ba410d914c7 100644
--- a/tensorflow/lite/experimental/support/metadata/BUILD
+++ b/tensorflow/lite/experimental/support/metadata/BUILD
@@ -1,5 +1,6 @@
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_android_library", "flatbuffer_cc_library", "flatbuffer_java_library", "flatbuffer_py_library")
+load("//tensorflow/lite/experimental/support/metadata:build_defs.bzl", "stamp_metadata_parser_version")
 
 package(
     default_visibility = [
@@ -51,9 +52,19 @@ flatbuffer_android_library(
     custom_package = "org.tensorflow.lite.support.metadata.schema",
 )
 
+# TODO(b/157813075): move the metadata python library to metadata/python/ when migrating to the new repo.
+stamp_metadata_parser_version(
+    name = "metadata_parser_py",
+    srcs = ["metadata_parser.py.template"],
+    outs = ["metadata_parser.py"],
+)
+
 py_library(
     name = "metadata",
-    srcs = ["metadata.py"],
+    srcs = [
+        "metadata.py",
+        ":metadata_parser_py",
+    ],
     data = [
         "//tensorflow/lite/experimental/support/metadata:metadata_schema.fbs",
     ],
@@ -89,3 +100,14 @@ py_test(
         "@six_archive//:six",
     ],
 )
+
+py_test(
+    name = "metadata_parser_test",
+    srcs = ["metadata_parser_test.py"],
+    python_version = "PY3",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":metadata",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/lite/experimental/support/metadata/build_defs.bzl b/tensorflow/lite/experimental/support/metadata/build_defs.bzl
new file mode 100644
index 00000000000..3ea945770e0
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/build_defs.bzl
@@ -0,0 +1,43 @@
+"""Build rules to generate metadata schema versions."""
+
+METADATA_SCHEMA_FILE = "//tensorflow/lite/experimental/support/metadata:metadata_schema.fbs"
+
+def stamp_metadata_parser_version(
+        name,
+        srcs,
+        outs):
+    """Stamps the latest metadata parser version into the srcs files.
+
+    Replaces all the occurrences of "{LATEST_METADATA_PARSER_VERSION}" in the
+    srcs files with the metadata schema version extracted from
+    METADATA_SCHEMA_FILE and then outputs the generated file into outs,
+    respectively. The number of srcs files needs to match the number of outs
+    files.
+
+    Args:
+        name: Rule name. (required)
+        srcs: List of source files. (required)
+        outs: List of output files. (required)
+    """
+    if len(srcs) != len(outs):
+        fail(("The number of srcs files (%d) does not match that of the outs" +
+              " files (%d).") %
+             (len(srcs), len(outs)))
+
+    for i in range(0, len(srcs)):
+        native.genrule(
+            name = "%s_file%d" % (name, i),
+            srcs = [srcs[i]],
+            outs = [outs[i]],
+            tools = [METADATA_SCHEMA_FILE],
+            # Gets the metadata schema version from the file, and stamps it
+            # into the srcs file.
+            cmd = "version=$$(sed -n -e '/Schema Semantic version/ s/.*\\: *//p' $(location %s));" %
+                  METADATA_SCHEMA_FILE +
+                  'sed "s/{LATEST_METADATA_PARSER_VERSION}/$$version/" $< > $@',
+        )
+
+    native.filegroup(
+        name = name,
+        srcs = outs,
+    )
diff --git a/tensorflow/lite/experimental/support/metadata/cc/BUILD b/tensorflow/lite/experimental/support/metadata/cc/BUILD
index 832e2edb56d..8febc7a2237 100644
--- a/tensorflow/lite/experimental/support/metadata/cc/BUILD
+++ b/tensorflow/lite/experimental/support/metadata/cc/BUILD
@@ -1,12 +1,23 @@
+load("//tensorflow/lite/experimental/support/metadata:build_defs.bzl", "stamp_metadata_parser_version")
+
 package(
     default_visibility = ["//tensorflow/lite/experimental/support:users"],
     licenses = ["notice"],  # Apache 2.0
 )
 
+stamp_metadata_parser_version(
+    name = "metadata_parser_h",
+    srcs = ["metadata_parser.h.template"],
+    outs = ["metadata_parser.h"],
+)
+
 cc_library(
     name = "metadata_version",
     srcs = ["metadata_version.cc"],
-    hdrs = ["metadata_version.h"],
+    hdrs = [
+        "metadata_version.h",
+        ":metadata_parser_h",
+    ],
     deps = [
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/experimental/support/metadata:metadata_schema_cc",
diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template b/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template
new file mode 100644
index 00000000000..dfb62d0de81
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_
+
+namespace tflite {
+namespace metadata {
+
+// The version of the metadata parser that this metadata versioning library is
+// depending on.
+inline constexpr char kMatadataParserVersion[] = "{LATEST_METADATA_PARSER_VERSION}";
+
+}  // namespace metadata
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_
diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/BUILD b/tensorflow/lite/experimental/support/metadata/cc/test/BUILD
index fd829124c73..f9d78567d70 100644
--- a/tensorflow/lite/experimental/support/metadata/cc/test/BUILD
+++ b/tensorflow/lite/experimental/support/metadata/cc/test/BUILD
@@ -13,3 +13,12 @@ cc_test(
         "@flatbuffers",
     ],
 )
+
+cc_test(
+    name = "metadata_parser_test",
+    srcs = ["metadata_parser_test.cc"],
+    deps = [
+        "//tensorflow/lite/experimental/support/metadata/cc:metadata_version",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc
new file mode 100644
index 00000000000..af7b8791fe8
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc
@@ -0,0 +1,33 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h"
+
+#include 
+#include 
+
+namespace tflite {
+namespace metadata {
+namespace {
+
+using ::testing::MatchesRegex;
+
+TEST(MetadataParserTest, MatadataParserVersionIsWellFormed) {
+  // Validates that the version is well-formed (x.y.z).
+  EXPECT_THAT(kMatadataParserVersion, MatchesRegex("[0-9]+\\.[0-9]+\\.[0-9]+"));
+}
+
+}  // namespace
+}  // namespace metadata
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc
index 02ecfdbd232..03f4d3bf28b 100644
--- a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc
+++ b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc
@@ -44,7 +44,7 @@ TEST(MetadataVersionTest,
                                             builder.GetSize(), &min_version),
             kTfLiteOk);
   // Validates that the version is well-formed (x.y.z).
-  EXPECT_THAT(min_version, MatchesRegex("[0-9]*\\.[0-9]*\\.[0-9]"));
+  EXPECT_THAT(min_version, MatchesRegex("[0-9]+\\.[0-9]+\\.[0-9]+"));
 }
 
 TEST(MetadataVersionTest,
diff --git a/tensorflow/lite/experimental/support/metadata/java/BUILD b/tensorflow/lite/experimental/support/metadata/java/BUILD
index c208752ae24..00d10bcca56 100644
--- a/tensorflow/lite/experimental/support/metadata/java/BUILD
+++ b/tensorflow/lite/experimental/support/metadata/java/BUILD
@@ -9,9 +9,13 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
+METADATA_SRCS = glob(
+    ["src/java/org/tensorflow/lite/support/metadata/**/*.java"],
+)
+
 android_library(
     name = "tensorflow-lite-support-metadata",
-    srcs = glob(["src/java/org/tensorflow/lite/support/metadata/**/*.java"]),
+    srcs = METADATA_SRCS,
     manifest = "AndroidManifest.xml",
     deps = [
         "//tensorflow/lite/experimental/support/metadata:metadata_schema_fbs_android",
@@ -22,7 +26,7 @@ android_library(
 
 java_library(
     name = "tensorflow-lite-support-metadata-lib",
-    srcs = glob(["src/java/org/tensorflow/lite/support/metadata/**/*.java"]),
+    srcs = METADATA_SRCS,
     javacopts = JAVACOPTS,
     resource_jars = [
         "//tensorflow/lite/experimental/support/metadata:libmetadata_schema_java.jar",
diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java
index 9da5b59cf46..9bf5ae93138 100644
--- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java
+++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java
@@ -52,10 +52,6 @@ import org.tensorflow.lite.support.metadata.schema.TensorMetadata;
  * MetadataExtractor} omits subgraph index as an input in its methods.
  */
 public class MetadataExtractor {
-  // TODO(b/156539454): remove the hardcode versioning number and populate the version through
-  // genrule.
-  /** The version of the metadata parser that this {@link MetadataExtractor} library depends on. */
-  public static final String METADATA_PARSER_VERSION = "1.0.1";
 
   /** The helper class to load metadata from TFLite model FlatBuffer. */
   private final ModelInfo modelInfo;
@@ -85,7 +81,7 @@ public class MetadataExtractor {
         System.err.printf(
             " Some fields in the metadata belong to a future schema. The minimum parser"
                 + " version required is %s, but the version of the current metadata parser is %s",
-            metadataInfo.getMininumParserVersion(), METADATA_PARSER_VERSION);
+            metadataInfo.getMininumParserVersion(), MetadataParser.VERSION);
       }
 
       checkArgument(
@@ -290,7 +286,7 @@ public class MetadataExtractor {
     if (minVersion == null) {
       return true;
     }
-    return compareVersions(minVersion, METADATA_PARSER_VERSION) <= 0;
+    return compareVersions(minVersion, MetadataParser.VERSION) <= 0;
   }
 
   /**
diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java
new file mode 100644
index 00000000000..195a330462b
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java
@@ -0,0 +1,27 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.support.metadata;
+
+/** Information about the metadata parser that this metadata extractor library is depending on. */
+public final class MetadataParser {
+  /**
+   * The version of the metadata parser that this metadata extractor library is depending on. The
+   * value should match the value of "Schema Semantic version" in metadata_schema.fbs.
+   */
+  public static final String VERSION = "1.0.1";
+
+  private MetadataParser() {}
+}
diff --git a/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template b/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template
new file mode 100644
index 00000000000..a41ac06969c
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template
@@ -0,0 +1,26 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Information about the metadata parser that this python library depends on."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class MetadataParser(object):
+  """Information about the metadata parser."""
+
+  # The version of the metadata parser.
+  VERSION = "{LATEST_METADATA_PARSER_VERSION}"
diff --git a/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py b/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py
new file mode 100644
index 00000000000..3b1d19278cd
--- /dev/null
+++ b/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py
@@ -0,0 +1,38 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.lite.experimental.support.metadata.metadata_parser."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from tensorflow.lite.experimental.support.metadata import metadata_parser
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class MetadataParserTest(test_util.TensorFlowTestCase):
+
+  def test_version_wellFormedSemanticVersion(self):
+    # Validates that the version is well-formed (x.y.z).
+    self.assertTrue(
+        re.match('[0-9]+\\.[0-9]+\\.[0-9]+',
+                 metadata_parser.MetadataParser.VERSION))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs
index 53c26b3e079..a88225f1960 100644
--- a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs
+++ b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs
@@ -29,18 +29,31 @@ namespace tflite;
 // generate the model interface. It is recommended to fill in at least those
 // enties to boost the codegen performance.
 
-// LINT.IfChange
-
-// The Metadata schema is versioned by the Semantic versioning number, which
-// tracks the schema changes according to the Semantic versioning rules.
+// The Metadata schema is versioned by the Semantic versioning number, such as
+// MAJOR.MINOR.PATCH. It tracks the schema changes according to the rules below:
+//  * Bump up the MAJOR number when making potentially backwards incompatible
+//    changes. It must be incremented if the new changes break the backwards
+//    compatibility. It may also include minor and patch level changes as
+//    needed. The true backwards compatibility is indicated by the file
+//    identifier.
+//  * Bump up the MINOR number when making backwards compatible updates for
+//    major features, such as supporting new content types or adding new
+//    processing units.
+//  * Bump up the PATCH number when making small backwards compatible changes,
+//    such as adding a new fields or deprecating certain fields (not deleting
+//    them).
 //
 // ModelMetadata.min_parser_version indicates the minimum necessary metadata
 // parser version to fully understand all fields in a given metadata flatbuffer.
 //
-// New fields and types will have associated comments with the schema version for
-// which they were added.
+// New fields and types will have associated comments with the schema version
+// for which they were added.
 //
+// LINT.IfChange
 // Schema Semantic version: 1.0.1
+// LINT.ThenChange(//tensorflow/lite/experimental/\
+//.    support/metadata/java/src/java/org/tensorflow/lite/support/metadata/\
+//.    MetadataParser.java)
 
 // This indicates the flatbuffer compatibility. The number will bump up when a
 // break change is applied to the schema, such as removing fields or adding new
@@ -53,10 +66,6 @@ file_identifier "M001";
 // File extension of any written files.
 file_extension "tflitemeta";
 
-// LINT.ThenChange(//tensorflow/lite/experimental/\
-//     /support/metadata/java/src/java/org/tensorflow/lite/support/metadata/\
-//     MetadataExtractor.java)
-
 // LINT.IfChange
 enum AssociatedFileType : byte {
   UNKNOWN = 0,

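A rough Python equivalent (for illustration only; the `stamp_version` helper below is hypothetical and mirrors what the genrule's sed pipeline does) of extracting the "Schema Semantic version" from the schema and stamping it into a template:

import re

def stamp_version(schema_text, template_text):
  version = re.search(r'Schema Semantic version: *(\S+)', schema_text).group(1)
  return template_text.replace('{LATEST_METADATA_PARSER_VERSION}', version)

schema = '// Schema Semantic version: 1.0.1\n'
template = 'VERSION = "{LATEST_METADATA_PARSER_VERSION}"\n'
print(stamp_version(schema, template))  # VERSION = "1.0.1"
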
From a8456eae42648e81f8a8f75c793ad53473bebc70 Mon Sep 17 00:00:00 2001
From: Yunxing Dai 
Date: Fri, 19 Jun 2020 11:59:51 -0700
Subject: [PATCH 0642/1390] Check the value shape of set dimension size in
 shape inference.

PiperOrigin-RevId: 317351738
Change-Id: Ia185e7745753711ca9ebf657522ef7422c9696ca
---
 tensorflow/compiler/xla/client/xla_builder.cc |  6 +++--
 .../compiler/xla/service/hlo_verifier.cc      |  3 ++-
 .../compiler/xla/service/shape_inference.cc   |  7 +++++-
 .../compiler/xla/service/shape_inference.h    |  5 ++++-
 .../xla/service/shape_inference_test.cc       | 22 +++++++++++++++++++
 5 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 03ae23ea18b..56e9aba6112 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -2762,9 +2762,11 @@ XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
+    TF_ASSIGN_OR_RETURN(const Shape* val_shape, GetShapePtr(val));
 
-    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSetDimensionSizeShape(
-                                         *operand_shape, dimension));
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferSetDimensionSizeShape(
+                            *operand_shape, *val_shape, dimension));
     // Setting an op's dynamic dimension to the static size is a noop.
     TF_ASSIGN_OR_RETURN(const HloInstructionProto* val_proto,
                         LookUpInstruction(val));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 4661b8fd9e3..d8baebd6fdd 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -1123,7 +1123,8 @@ Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) {
 Status ShapeVerifier::HandleSetDimensionSize(HloInstruction* set_size) {
   return CheckShape(set_size,
                     ShapeInference::InferSetDimensionSizeShape(
-                        set_size->operand(0)->shape(), set_size->dimension()));
+                        set_size->operand(0)->shape(),
+                        set_size->operand(1)->shape(), set_size->dimension()));
 }
 
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 75a80747c1d..bb4a38ded1e 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -2248,12 +2248,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 }
 
 /* static */ StatusOr ShapeInference::InferSetDimensionSizeShape(
-    const Shape& shape, int64 dimension) {
+    const Shape& shape, const Shape& val_shape, int64 dimension) {
   if (dimension < 0 || dimension >= shape.rank()) {
     return InvalidArgument("SetDimensionSize dimension out of bounds: %d.",
                            dimension);
   }
 
+  if (val_shape.rank() != 0 || val_shape.element_type() != S32) {
+    return InvalidArgument(
+        "SetDimensionSize's value has to be S32 scalar, got %s",
+        val_shape.ToString());
+  }
   // TODO(b/119580730): Remove this restriction when very large dimension size
   // is needed.
   if (shape.dimensions(dimension) > std::numeric_limits::max()) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 2cb5930d098..d47d96ab52d 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -303,10 +303,13 @@ class ShapeInference {
       const Shape& updates_shape, const ProgramShape& to_apply_shape,
       const ScatterDimensionNumbers& scatter_dim_numbers);
 
+  // Helper that validates the given input shape to GetDimensionSize.
   static StatusOr InferGetDimensionSizeShape(const Shape& shape,
                                                     int64 dimension);
 
-  static StatusOr InferSetDimensionSizeShape(const Shape& shape,
+  // Helper that validates the given input shape to SetDimensionSize.
+  static StatusOr InferSetDimensionSizeShape(const Shape& operand_shape,
+                                                    const Shape& val_shape,
                                                     int64 dimension);
 
   // Helper function for creating a Window proto from user-supplied data.
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index b5ecf6e583e..916d3ab15c8 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1365,6 +1365,28 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) {
   EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), output_shape));
 }
 
+TEST_F(ShapeInferenceTest, ErrorSetDimensionSize) {
+  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  Shape val_shape = ShapeUtil::MakeShape(S32, {1});
+  auto inferred_status = ShapeInference::InferSetDimensionSizeShape(
+      arg_shape, val_shape, /*dimension=*/0);
+
+  EXPECT_FALSE(inferred_status.ok());
+  EXPECT_THAT(inferred_status.status().error_message(),
+              HasSubstr("value has to be S32 scalar"));
+}
+
+TEST_F(ShapeInferenceTest, ErrorSetDimensionSizeWrongType) {
+  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  Shape val_shape = ShapeUtil::MakeShape(U32, {});
+  auto inferred_status = ShapeInference::InferSetDimensionSizeShape(
+      arg_shape, val_shape, /*dimension=*/0);
+
+  EXPECT_FALSE(inferred_status.ok());
+  EXPECT_THAT(inferred_status.status().error_message(),
+              HasSubstr("value has to be S32 scalar"));
+}
+
 // BatchMatMul with different batch dimension sizes fails.
 TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimSizesFails) {
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});

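Restating the new check as a small Python sketch (illustration only, not the XLA C++): the `val` operand of SetDimensionSize must be a rank-0 (scalar) tensor of type S32, and `dimension` must index into the operand's shape:

def validate_set_dimension_size(operand_rank, dimension, val_rank, val_type):
  if dimension < 0 or dimension >= operand_rank:
    raise ValueError('SetDimensionSize dimension out of bounds: %d' % dimension)
  if val_rank != 0 or val_type != 'S32':
    raise ValueError("SetDimensionSize's value has to be S32 scalar")


validate_set_dimension_size(2, 0, 0, 'S32')    # ok
# validate_set_dimension_size(2, 0, 1, 'S32')  # raises: value must be a scalar
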
From 2229ae89c927b46355a15e8af22365d24afc25bf Mon Sep 17 00:00:00 2001
From: Jiho Choi 
Date: Fri, 19 Jun 2020 12:06:36 -0700
Subject: [PATCH 0643/1390] Use group_id as step_id.

PiperOrigin-RevId: 317353238
Change-Id: If52b2b4872c92d3f65af8f6ce1651e8c6da7dae7
---
 tensorflow/core/profiler/convert/BUILD        |  1 +
 .../convert/xplane_to_memory_profile.cc       | 59 +++++++------------
 .../convert/xplane_to_memory_profile.h        |  1 +
 .../convert/xplane_to_memory_profile_test.cc  |  3 +-
 .../profiler/protobuf/memory_profile.proto    |  4 +-
 5 files changed, 25 insertions(+), 43 deletions(-)

diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD
index abf0176bf6f..06594b1aeaf 100644
--- a/tensorflow/core/profiler/convert/BUILD
+++ b/tensorflow/core/profiler/convert/BUILD
@@ -525,6 +525,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core/profiler/protobuf:memory_profile_proto_cc",
         "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
+        "//tensorflow/core/profiler/utils:group_events",
         "//tensorflow/core/profiler/utils:xplane_builder",
         "//tensorflow/core/profiler/utils:xplane_schema",
         "//tensorflow/core/profiler/utils:xplane_test_utils",
diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc
index d039ca8da32..d7104c2bbf5 100644
--- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc
@@ -42,6 +42,8 @@ namespace profiler {
 
 namespace {
 
+constexpr int64 kInvalidStepId = -1;
+
 // Index of the time-sorted memory_profile_snapshots list, and the
 // MemoryActivityMetadata proto it contains.
 using IndexMetaPair = std::pair;
@@ -63,7 +65,7 @@ struct ActivityMetadata {
   int64 allocation_bytes = 0;
   uint64 address = 0;
   absl::string_view tf_op_name;
-  int64 step_id = -1;
+  int64 step_id = kInvalidStepId;
   absl::string_view region_type;
   int64 data_type = 0;
   absl::string_view tensor_shape;
@@ -129,7 +131,6 @@ void UpdateProfileSummary(const AggregationStats& stats, int64 time_offset_ps,
 MemoryProfile GenerateMemoryProfile(const XPlane* host_trace) {
   XPlaneVisitor plane = CreateTfXPlaneVisitor(host_trace);
   MemoryProfile memory_profile;
-  auto* step_count = memory_profile.mutable_step_count();
   // Iterate over all XEvents in the XPlane, and add the XStats to a new
   // MemoryProfileSnapshot if the EventType is kMemoryAllocation or
   // kMemoryDeallocation.
@@ -181,9 +182,8 @@ MemoryProfile GenerateMemoryProfile(const XPlane* host_trace) {
           case StatType::kTfOp:
             metadata.tf_op_name = stat.StrOrRefValue();
             break;
-          case StatType::kStepId:
+          case StatType::kGroupId:
             metadata.step_id = stat.IntValue();
-            if (metadata.step_id != 0) (*step_count)[metadata.step_id]++;
             break;
           case StatType::kRegionType:
             metadata.region_type = stat.StrOrRefValue();
@@ -214,40 +214,21 @@ MemoryProfile GenerateMemoryProfile(const XPlane* host_trace) {
   return memory_profile;
 }
 
-// Sequentialize step ids for the memory profile.
-void UpdateStepId(const tensorflow::protobuf::Map<
-                      tensorflow::protobuf_int64 /*orig_step_id*/,
-                      tensorflow::protobuf_int64 /*count*/>& step_count,
-                  PerAllocatorMemoryProfile* memory_profile) {
-  // Map from original random step id to sequential step id.
-  absl::flat_hash_map step_map;
-  constexpr int kUnknownStep = -2;
-  constexpr double kStepFilterRatio = 0.1;  // Magic number for filtering.
-  tensorflow::protobuf_int64 max_step_count = 0;
-  for (const auto& step_and_count : step_count) {
-    max_step_count = std::max(max_step_count, step_and_count.second);
-  }
-  // Filter out noisy and incomplete original step ids.
-  for (const auto& step_and_count : step_count) {
-    if (static_cast(step_and_count.second) / max_step_count >
-        kStepFilterRatio) {
-      step_map[step_and_count.first] = kUnknownStep;
-    }
-  }
-
-  // Update the step ids in memory_profile for this allocator.
-  int64 step_id = -1;
+// Fix invalid step ids of snapshots at the beginning/end of the profile or at
+// the step boundaries. The snapshots with invalid step ids at the beginning get
+// 0 for their step ids. Those at the step boundaries or at the end get the
+// previous snapshot's step id + 1.
+void UpdateStepId(PerAllocatorMemoryProfile* memory_profile) {
+  int64 last_valid_step_id = -1;
+  // Snapshots are already sorted in time.
   for (auto& snapshot : *memory_profile->mutable_memory_profile_snapshots()) {
     DCHECK(snapshot.has_activity_metadata());
-    // Convert the random step id to sequential step id.
-    int64 orig_step_id = snapshot.activity_metadata().step_id();
-    if (step_map.contains(orig_step_id) &&
-        step_map[orig_step_id] == kUnknownStep) {
-      step_map[orig_step_id] = ++step_id;
+    if (snapshot.mutable_activity_metadata()->step_id() == kInvalidStepId) {
+      snapshot.mutable_activity_metadata()->set_step_id(last_valid_step_id + 1);
+    } else {
+      last_valid_step_id = snapshot.mutable_activity_metadata()->step_id();
     }
-    snapshot.mutable_activity_metadata()->set_step_id(step_id);
   }
-  VLOG(2) << "Max sequential step id in profile: " << step_id;
 }
 
 // Update the MemoryActivityMetadata for each deallocation event by copying from
@@ -481,14 +462,14 @@ void ProcessMemoryProfileProto(int64 max_num_snapshots,
       return a.time_offset_ps() < b.time_offset_ps();
     });
 
-    UpdateStepId(memory_profile->step_count(), allocator_memory_profile);
+    UpdateStepId(allocator_memory_profile);
     UpdateDeallocation(allocator_memory_profile);
 
-    int64 peak_bytes_profile = allocator_memory_profile->profile_summary()
-                                   .peak_stats()
-                                   .peak_bytes_in_use();
     int64 peak_step_id =
-        GetPeakMemoryStep(peak_bytes_profile, allocator_memory_profile);
+        GetPeakMemoryStep(allocator_memory_profile->profile_summary()
+                              .peak_stats()
+                              .peak_bytes_in_use(),
+                          allocator_memory_profile);
     ProcessActiveAllocations(peak_step_id, allocator_memory_profile);
     SampleSnapshots(max_num_snapshots, snapshots);
   }
diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.h b/tensorflow/core/profiler/convert/xplane_to_memory_profile.h
index 873ac800aa5..6eddaeeec71 100644
--- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.h
+++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.h
@@ -25,6 +25,7 @@ namespace profiler {
 
 // Process the host threads XPlane and generate MemoryProfile result; at most
 // max_num_snapshots will be displayed on the UI.
+// REQUIRED: host_plane should have been grouped by calling GroupTfEvents().
 MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane,
                                            int64 max_num_snapshots = 1000);
 
diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc
index 5ddcbcfc75d..c334318dcfe 100644
--- a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/protobuf/memory_profile.pb.h"
 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
+#include "tensorflow/core/profiler/utils/group_events.h"
 #include "tensorflow/core/profiler/utils/xplane_builder.h"
 #include "tensorflow/core/profiler/utils/xplane_schema.h"
 #include "tensorflow/core/profiler/utils/xplane_test_utils.h"
@@ -84,11 +85,11 @@ TEST(ConvertXPlaneToMemoryProfile, OneAllocatorMultiActivitiesTest) {
                 {StatType::kRegionType, "temp"},
                 {StatType::kTensorShapes, "[1, 2]"}});
 
+  tensorflow::profiler::GroupTfEvents(&space, nullptr);
   MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane);
   EXPECT_EQ(memory_profile.memory_profile_per_allocator().size(), 1);
   EXPECT_EQ(memory_profile.num_hosts(), 1);
   EXPECT_EQ(memory_profile.memory_ids_size(), 1);
-  EXPECT_EQ(memory_profile.step_count().size(), 1);
   EXPECT_EQ(memory_profile.memory_profile_per_allocator().begin()->first,
             "GPU_0_bfc");
   const auto& allocator_memory_profile =
diff --git a/tensorflow/core/profiler/protobuf/memory_profile.proto b/tensorflow/core/profiler/protobuf/memory_profile.proto
index 7a5272c60b2..4d492a56255 100644
--- a/tensorflow/core/profiler/protobuf/memory_profile.proto
+++ b/tensorflow/core/profiler/protobuf/memory_profile.proto
@@ -122,7 +122,5 @@ message MemoryProfile {
   // Ids for profiled memory allocators, used to populate memory selection list
   // at front end.
   repeated string memory_ids = 3;
-  // Map of original random int64 step id to the count of memory activity events
-  // assigned with it.
-  map step_count = 4;
+  reserved 4;
 }

From 2ec0214b48878b94763c4f41e095b4579d78b58f Mon Sep 17 00:00:00 2001
From: Kuangyuan Chen 
Date: Fri, 19 Jun 2020 12:13:11 -0700
Subject: [PATCH 0644/1390] Internal rollback for saved model importer.

PiperOrigin-RevId: 317354577
Change-Id: I6fdc05921b2f1d85b34b1751ed00f65525d45ad3
---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   4 +-
 .../mlir/tensorflow/ir/tf_saved_model.cc      |  25 ---
 .../mlir/tensorflow/ir/tf_saved_model_ops.td  |  24 ---
 .../tests/tf_saved_model/common_v1.py         |   1 -
 .../tests/tf_saved_model/hash_table_v1.py     |  92 -----------
 .../tensorflow/tests/tf_saved_model_ops.mlir  |   5 -
 .../tests/tf_saved_model_ops_invalid.mlir     |  33 ----
 .../mlir/tensorflow/translate/import_model.cc | 149 +++++-------------
 8 files changed, 43 insertions(+), 290 deletions(-)
 delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 7c0d427e87b..bcabb13d301 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -661,9 +661,7 @@ cc_library(
         ":tensorflow_types",
         ":translate_utils",
         "//tensorflow/cc/saved_model:bundle_v2",
-        "//tensorflow/cc/saved_model:constants",
         "//tensorflow/cc/saved_model:loader_lite",
-        "//tensorflow/cc/saved_model:loader_util",
         "//tensorflow/compiler/jit:shape_inference_helpers",
         "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "//tensorflow/compiler/tf2xla:functionalize_control_flow",
@@ -675,7 +673,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/utils:transitive_fanin",
-        "//tensorflow/core/platform:protobuf_internal",
         "//tensorflow/core/platform:types",
         "//tensorflow/stream_executor/lib",
         "@com_google_absl//absl/algorithm:container",
@@ -685,6 +682,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:StandardOps",
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index ef248379d2e..7db0eed7713 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -76,23 +76,6 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) {
   return success();
 }
 
-static LogicalResult Verify(SessionInitializerOp session_initializer) {
-  mlir::SymbolTable symbol_table(
-      session_initializer.getParentOfType<ModuleOp>());
-
-  auto init_func_op =
-      symbol_table.lookup<FuncOp>(session_initializer.initializer());
-  if (!init_func_op)
-    return session_initializer.emitOpError()
-           << "the initializer function does not exist";
-
-  if (!init_func_op.getType().getResults().empty())
-    return session_initializer.emitOpError()
-           << "the initializer function should have no output";
-
-  return success();
-}
-
 #define GET_OP_CLASSES
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc"
 
@@ -249,14 +232,6 @@ static LogicalResult VerifySavedModelModule(
       }
     }
   }
-
-  auto session_initializers = module.getOps<SessionInitializerOp>();
-  if (std::distance(session_initializers.begin(), session_initializers.end()) >
-      1) {
-    return (*++session_initializers.begin()).emitError()
-           << "there must be no more than one session_initializer op";
-  }
-
   SymbolTable symbol_table(module);
   auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion());
   if (!symbol_uses.hasValue()) {
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
index 497f4d90cb9..4431a160edf 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
@@ -128,28 +128,4 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> {
   let verifier = [{ return Verify(*this); }];
 }
 
-def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> {
-  let summary = "Initializes TensorFlow session state.";
-  let description = [{
-    Represents a session initializer function initializes TensorFlow session
-    state. It is used to initialize resources in the saved model before calling
-    any exported functions. There must be no more than one session initializer
-    in a saved model.
-
-    The `initializer` represents the initialization function. The function have
-    no output and this function should be only called once.
-
-    This is used, for example, to initialize hash tables stored in resources and
-    accessed by resource name (rather than as resource handles or bound inputs
-    which is how `global_tensor`s are referenced)
-  }];
-
-  let arguments = (ins
-    FlatSymbolRefAttr:$initializer
-  );
-
-
-  let verifier = [{ return Verify(*this); }];
-}
-
 #endif // SAVED_MODEL_DIALECT
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
index 51ccbeb1fbd..7171f63bb05 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
@@ -84,7 +84,6 @@ def do_test(signature_def_map, show_debug_info=False):
     builder.add_meta_graph_and_variables(
         sess, [tf.saved_model.tag_constants.SERVING],
         signature_def_map,
-        main_op=tf.tables_initializer(),
         strip_default_attrs=True)
     builder.save()
 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
deleted file mode 100644
index 64847434b82..00000000000
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# RUN: %p/hash_table_v1 | FileCheck %s
-
-# pylint: disable=missing-docstring,line-too-long
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow.compat.v1 as tf
-from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1
-
-# Verify that the tf.versions attribute exists. It is difficult to enforce
-# contents, since the version numbers change over time. The conversion logic
-# itself is verified in the common graphdef converter, so here just assert
-# it is being invoked.
-# CHECK: module
-# CHECK-SAME: tf.versions
-# CHECK-SAME: bad_consumers
-# CHECK-SAME: min_consumer
-# CHECK-SAME: producer
-
-# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> ()
-# CHECK: "tf_saved_model.global_tensor"()
-
-# CHECK:      func {{@[a-zA-Z_0-9]+}}(
-# CHECK-SAME: [[ARG0:%.*]]: tensor
-# CHECK-SAME: [[ARG1:%.*]]: tensor ()
-
   // Representation for constants: (immutable) global tensor.
   // CHECK: tf_saved_model.global_tensor
   "tf_saved_model.global_tensor"() {
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
index f04e1a60b36..7287fcf66c8 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
@@ -261,39 +261,6 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
-module attributes {tf_saved_model.semantics} {
-
-  // expected-error@+1 {{the initializer function does not exist}}
-  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
-}
-
-// -----
-
-module attributes {tf_saved_model.semantics} {
-
-  // expected-error@+1 {{the initializer function should have no output}}
-  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
-  func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} {
-    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
-    return %0 : tensor<1xf32>
-  }
-}
-
-// -----
-
-module attributes {tf_saved_model.semantics} {
-
-  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
-  // expected-error@+1 {{there must be no more than one session_initializer op}}
-  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
-  func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} {
-    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
-    return %0 : tensor<1xf32>
-  }
-}
-
-// -----
-
 module attributes {tf_saved_model.semantics} {
 
   // expected-error@+1 {{exported function @f should be public}}
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
index 3cff4217215..820d0ce31fb 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
@@ -60,8 +60,6 @@ limitations under the License.
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Verifier.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "tensorflow/cc/saved_model/constants.h"
-#include "tensorflow/cc/saved_model/loader_util.h"
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"
@@ -101,7 +99,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/protobuf_internal.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/graph_debug_info.pb.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -119,7 +116,6 @@ using mlir::NamedAttrList;
 using mlir::TensorType;
 using mlir::TF::VarHandleOp;
 using mlir::tf_saved_model::GlobalTensorOp;
-using mlir::tf_saved_model::SessionInitializerOp;
 using stream_executor::port::StatusOr;
 
 namespace {
@@ -2959,13 +2955,6 @@ void SortSavedModelModule(mlir::ModuleOp module) {
     named_global_tensor.global_tensor.getOperation()->moveBefore(
         &module.getBody()->front());
   }
-
-  auto initializers = module.getOps<SessionInitializerOp>();
-  if (!initializers.empty()) {
-    (*initializers.begin())
-        .getOperation()
-        ->moveBefore(&module.getBody()->front());
-  }
 }
 
 Status CreateSavedModelIR(
@@ -3252,29 +3241,17 @@ class SavedModelSignatureDefImporter {
                                 absl::Span<std::string> exported_names,
                                  mlir::MLIRContext* context)
       : bundle_(bundle),
-        flib_def_(OpRegistry::Global(), graph_def().library()),
-        debug_info_(),
         exported_names_(exported_names),
-        module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {
-    // debug_info might not be loaded with loader_lite.
-    if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info;
-  }
+        module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {}
 
   // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function
   // for each signature.
   StatusOr<mlir::OwningModuleRef> ConvertSignatures();
-  Status ConvertSignature(const std::string& sig_def_key,
-                          const SignatureDef& signature_def);
-
-  // Converts the initialization graph in the SavedModel to an MLIR function.
-  Status ConvertInitializer();
-
-  // Converts a graph with feeds and fetches to an MLIR function.
-  StatusOr<mlir::OwningModuleRef> ConvertGraph(
-      const std::string& name,
-      const std::vector<std::pair<std::string, TensorInfo>>& inputs,
-      const std::vector<std::pair<std::string, TensorInfo>>& outputs,
-      const std::vector<std::string> control_outputs);
+  Status ConvertSignature(const GraphDef& graphdef,
+                          const std::string& sig_def_key,
+                          const SignatureDef& signature_def,
+                          const GraphDebugInfo& debug_info,
+                          const FunctionLibraryDefinition& flib_def);
 
   // Creates GlobalTensorOp for each variable and moves each VarHandle op to
   // the enclosing function's arguments.
@@ -3296,62 +3273,18 @@ class SavedModelSignatureDefImporter {
   GraphImportConfig::InputArrays ParseInputArrays(
       const std::vector<std::pair<std::string, TensorInfo>>& inputs);
 
-  const GraphDef& graph_def() const {
-    return bundle_.meta_graph_def.graph_def();
-  }
-  const FunctionLibraryDefinition& flib_def() const { return flib_def_; }
-  const GraphDebugInfo& debug_info() const { return debug_info_; }
-
   const SavedModelBundle& bundle_;
-  FunctionLibraryDefinition flib_def_;
-  GraphDebugInfo debug_info_;
   absl::Span<std::string> exported_names_;
   mlir::OwningModuleRef module_;
 };
 
-Status SavedModelSignatureDefImporter::ConvertInitializer() {
-  std::vector<AssetFileDef> asset_file_defs;
-  TF_RETURN_IF_ERROR(
-      internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs));
-
-  if (!asset_file_defs.empty())
-    return errors::Unimplemented(
-        absl::StrCat("Assets are not supported in signaturedef importer"));
-
-  std::string init_node_name;
-  TF_RETURN_IF_ERROR(
-      internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name));
-
-  if (init_node_name.empty()) return Status::OK();
-
-  TF_ASSIGN_OR_RETURN(auto sub_module,
-                      ConvertGraph(init_node_name, {}, {}, {init_node_name}));
-
-  mlir::SymbolTable symbol_table(*sub_module);
-
-  auto init_func_op = symbol_table.lookup<mlir::FuncOp>(init_node_name);
-
-  init_func_op.removeAttr("tf.entry_function");
-
-  mlir::OpBuilder builder(module_->getBodyRegion());
-
-  builder.create<SessionInitializerOp>(
-      module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName()));
-
-  // Move the converted functions to top level MLIR module.
-  auto* block = module_->getBody();
-  auto* sub_block = sub_module->getBody();
-  block->getOperations().splice(
-      mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(),
-      sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator()));
-
-  return Status::OK();
-}
-
 StatusOr<mlir::OwningModuleRef>
 SavedModelSignatureDefImporter::ConvertSignatures() {
   const auto& signatures = bundle_.GetSignatures();
-  PopulateTfVersions(module_.get(), graph_def().versions());
+  const auto& graphdef = bundle_.meta_graph_def.graph_def();
+  PopulateTfVersions(module_.get(), graphdef.versions());
+
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library());
 
   // debug_info might not be loaded with loader_lite.
   GraphDebugInfo debug_info;
@@ -3374,10 +3307,9 @@ SavedModelSignatureDefImporter::ConvertSignatures() {
       continue;
     }
 
-    TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def));
+    TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def,
+                                        debug_info, flib_def));
   }
-
-  TF_RETURN_IF_ERROR(ConvertInitializer());
   TF_RETURN_IF_ERROR(LiftVariables());
 
   mlir::OpBuilder builder(module_->getBodyRegion());
@@ -3388,32 +3320,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() {
   return std::move(module_);
 }
 
-StatusOr<mlir::OwningModuleRef> SavedModelSignatureDefImporter::ConvertGraph(
-    const std::string& name,
-    const std::vector<std::pair<std::string, TensorInfo>>& inputs,
-    const std::vector<std::pair<std::string, TensorInfo>>& outputs,
-    const std::vector<std::string> control_outputs) {
-  GraphImportConfig specs;
-  specs.prune_unused_nodes = true;
-  specs.inputs = ParseInputArrays(inputs);
-  for (auto& output : outputs) specs.outputs.push_back(output.second.name());
-  specs.control_outputs = control_outputs;
-
-  // Convert sub-graphdef to sub-graph.
-  GraphConstructorOptions options;
-  options.allow_internal_ops = true;
-  options.add_default_attributes = true;
-  Graph graph(OpRegistry::Global());
-
-  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph));
-
-  // Convert sub-graph to MLIR module.
-  return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(),
-                                   flib_def(), specs, name);
-}
-
 Status SavedModelSignatureDefImporter::ConvertSignature(
-    const std::string& sig_def_key, const SignatureDef& signature_def) {
+    const GraphDef& graphdef, const std::string& sig_def_key,
+    const SignatureDef& signature_def, const GraphDebugInfo& debug_info,
+    const FunctionLibraryDefinition& flib_def) {
   // Create local vectors for the input and output and sort them to be
   // deterministic. We don't want anyone to really depend on the order, client
   // should lookup argument/result mapping by attribute name.
@@ -3429,9 +3339,34 @@ Status SavedModelSignatureDefImporter::ConvertSignature(
     return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first;
   });
 
+  GraphImportConfig specs;
+  specs.prune_unused_nodes = true;
+  specs.inputs = ParseInputArrays(inputs);
+  for (auto& output : outputs) specs.outputs.push_back(output.second.name());
+
+  // Remove unused nodes and create sub-graphdef.
+  GraphDef sub_graph_def;
+  TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph(
+      graphdef, &sub_graph_def,
+      /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()}));
+
+  // Set the function library definitions in the pruned graphdef.
+  *sub_graph_def.mutable_library() = flib_def.ToProto();
+
+  // Convert sub-graphdef to sub-graph.
+  GraphConstructorOptions options;
+  options.allow_internal_ops = true;
+  options.add_default_attributes = true;
+  Graph sub_graph(OpRegistry::Global());
+
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph));
+
   // Convert sub-graph to MLIR module.
-  TF_ASSIGN_OR_RETURN(auto sub_module,
-                      ConvertGraph(sig_def_key, inputs, outputs, {}));
+  TF_ASSIGN_OR_RETURN(
+      auto sub_module,
+      GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info,
+                                flib_def, specs, sig_def_key));
   mlir::OpBuilder builder(sub_module->getBodyRegion());
 
   // Find the FuncOp which corresponds to current SignatureDef.

From 08a99df84299b4c55f40800c148f55773f3d6198 Mon Sep 17 00:00:00 2001
From: Jonah Kohn <51345541+jonah-kohn@users.noreply.github.com>
Date: Fri, 19 Jun 2020 12:34:16 -0700
Subject: [PATCH 0645/1390] Implement format_dtype as intended.

---
 tensorflow/python/keras/utils/vis_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 21de8014c2a..49407d92ce3 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -197,7 +197,7 @@ def model_to_dot(model,
           else:
             return str(dtype)
           
-      label = '%s|%s' % (label, dtype)
+      label = '%s|%s' % (label, format_dtype(dtype))
 
     # Rebuild the label as a table including input/output shapes.
     if show_shapes:

From eeb6ccf2cf72a758db2ef41c925b03998229b3a8 Mon Sep 17 00:00:00 2001
From: Jonah Kohn <51345541+jonah-kohn@users.noreply.github.com>
Date: Fri, 19 Jun 2020 12:48:52 -0700
Subject: [PATCH 0646/1390] Correctly access layer attributes.

---
 tensorflow/python/keras/utils/vis_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 49407d92ce3..aa2a4c978cb 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -197,7 +197,7 @@ def model_to_dot(model,
           else:
             return str(dtype)
           
-      label = '%s|%s' % (label, format_dtype(dtype))
+      label = '%s|%s' % (label, format_dtype(layer.dtype))
 
     # Rebuild the label as a table including input/output shapes.
     if show_shapes:

From 0869ff0af5cacc9d9ea85f6cfc0027166b964f4e Mon Sep 17 00:00:00 2001
From: Robert David 
Date: Fri, 19 Jun 2020 12:50:42 -0700
Subject: [PATCH 0647/1390] Unabbreviate "proj" to "projection" in a few places,
 to be consistent across the codebase.

PiperOrigin-RevId: 317361865
Change-Id: I72fedd0bda16a5668fb7b880128f36d2595f0042
---
 tensorflow/lite/kernels/lstm.cc      | 20 +++++++--------
 tensorflow/lite/kernels/lstm_eval.cc | 38 +++++++++++++++-------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index b941f2237ca..803fbba4eae 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -182,7 +182,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16(
   float input_to_output_weight_scale = default_scale;
   float recurrent_to_output_weight_scale = default_scale;
   float cell_to_output_weight_scale = default_scale;
-  float proj_weight_scale = default_scale;
+  float projection_weight_scale = default_scale;
   float layer_norm_input_scale = default_scale;
   float layer_norm_forget_scale = default_scale;
   float layer_norm_cell_scale = default_scale;
@@ -229,7 +229,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16(
   }
 
   if (use_projection) {
-    proj_weight_scale = projection_weights->params.scale;
+    projection_weight_scale = projection_weights->params.scale;
   }
   output_state_scale = output_state->params.scale;
 
@@ -276,7 +276,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_16(
       std::pow(2, -15) / intermediate_scale[4] * std::pow(2, -15);
 
   effective_proj_scale =
-      proj_weight_scale * intermediate_scale[4] / output_state_scale;
+      projection_weight_scale * intermediate_scale[4] / output_state_scale;
 
   if (use_peephole) {
     if (!use_cifg) {
@@ -442,7 +442,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   int8_t* input_to_output_weight_ptr = nullptr;
   int8_t* recurrent_to_output_weight_ptr = nullptr;
   int8_t* cell_to_output_weight_ptr = nullptr;
-  int8_t* proj_weight_ptr = nullptr;
+  int8_t* projection_weight_ptr = nullptr;
   int16_t* layer_norm_input_weight_ptr = nullptr;
   int16_t* layer_norm_forget_weight_ptr = nullptr;
   int16_t* layer_norm_cell_weight_ptr = nullptr;
@@ -451,7 +451,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   int32_t* forget_gate_bias_ptr = nullptr;
   int32_t* cell_gate_bias_ptr = nullptr;
   int32_t* output_gate_bias_ptr = nullptr;
-  int32_t* proj_bias_ptr = nullptr;
+  int32_t* projection_bias_ptr = nullptr;
   int16_t* cell_ptr = nullptr;
   int8_t* output_state_ptr = nullptr;
 
@@ -469,7 +469,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   float input_to_output_weight_scale = default_scale;
   float recurrent_to_output_weight_scale = default_scale;
   float cell_to_output_weight_scale = default_scale;
-  float proj_weight_scale = default_scale;
+  float projection_weight_scale = default_scale;
   float layer_norm_input_scale = default_scale;
   float layer_norm_forget_scale = default_scale;
   float layer_norm_cell_scale = default_scale;
@@ -528,10 +528,10 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
   }
 
   if (use_projection) {
-    proj_weight_ptr = projection_weights->data.int8;
-    proj_weight_scale = projection_weights->params.scale;
+    projection_weight_ptr = projection_weights->data.int8;
+    projection_weight_scale = projection_weights->params.scale;
     if (projection_bias) {
-      proj_bias_ptr = projection_bias->data.i32;
+      projection_bias_ptr = projection_bias->data.i32;
     }
   }
   output_state_scale = output_state->params.scale;
@@ -593,7 +593,7 @@ TfLiteStatus PopulateQuantizedLstmParams8x8_8(
                                         output_state_scale /
                                         intermediate_scale[11];
   effective_proj_scale =
-      proj_weight_scale * std::pow(2, -15) / output_state_scale;
+      projection_weight_scale * std::pow(2, -15) / output_state_scale;
 
   if (use_peephole) {
     if (!use_cifg) {
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 9bdbfa9d48d..f38fdc95f3e 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -919,7 +919,7 @@ inline void LstmStepHybrid(
 //   cell_to_output_weights              - optional
 //
 // Quantized projection weights of size 'n_output * n_cell'
-//   proj_weight_ptr                     - optional
+//   projection_weight_ptr                     - optional
 //
 // Weight scales (scalars) for each of the weights above.
 //   effective_input_to_input_scale_a    - optional
@@ -1019,10 +1019,10 @@ inline void LstmStepInteger(
     int32_t effective_cell_to_forget_scale_b,
     const int16_t* cell_to_output_weight_ptr,
     int32_t effective_cell_to_output_scale_a,
-    int32_t effective_cell_to_output_scale_b, const int8_t* proj_weight_ptr,
-    int32_t effective_proj_scale_a, int32_t effective_proj_scale_b,
-    int32_t hidden_zp, int32_t effective_hidden_scale_a,
-    int32_t effective_hidden_scale_b,
+    int32_t effective_cell_to_output_scale_b,
+    const int8_t* projection_weight_ptr, int32_t effective_proj_scale_a,
+    int32_t effective_proj_scale_b, int32_t hidden_zp,
+    int32_t effective_hidden_scale_a, int32_t effective_hidden_scale_b,
     const int16_t* layer_norm_input_weight_ptr,
     int32_t layer_norm_input_scale_a, int32_t layer_norm_input_scale_b,
     const int16_t* layer_norm_forget_weight_ptr,
@@ -1055,7 +1055,7 @@ inline void LstmStepInteger(
   const bool use_cifg = (input_to_input_weight_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weight_ptr != nullptr);
   const bool use_layer_norm = (layer_norm_forget_weight_ptr != nullptr);
-  const bool use_projection = (proj_weight_ptr != nullptr);
+  const bool use_projection = (projection_weight_ptr != nullptr);
 
   // Check for nullptrs.
   TFLITE_DCHECK(input_to_forget_effective_bias);
@@ -1208,7 +1208,7 @@ inline void LstmStepInteger(
   if (use_projection) {
     std::fill_n(output_ptr, n_batch * n_output, 0);
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        scratch_4_ptr, projection_effective_bias, proj_weight_ptr,
+        scratch_4_ptr, projection_effective_bias, projection_weight_ptr,
         effective_proj_scale_a, effective_proj_scale_b, n_batch, n_cell,
         n_output, output_state_zp, scratch_5_ptr, output_ptr, context);
     if (quantized_proj_clip > 0) {
@@ -1245,7 +1245,7 @@ inline void LstmStepInteger(
 //   cell_to_output_weights              - optional
 //
 // Quantized projection weights of size 'n_output * n_cell'
-//   proj_weight_ptr                     - optional
+//   projection_weight_ptr                     - optional
 //
 // Weight scales (scalars) for each of the weights above.
 //   effective_input_to_input_scale_a    - optional
@@ -1348,9 +1348,9 @@ void LstmStepInteger(
     int32_t effective_cell_to_forget_scale_b,
     const int8_t* cell_to_output_weight_ptr,
     int32_t effective_cell_to_output_scale_a,
-    int32_t effective_cell_to_output_scale_b, const int8_t* proj_weight_ptr,
-    int32_t effective_proj_scale_a, int32_t effective_proj_scale_b,
-    const int16_t* layer_norm_input_weight_ptr,
+    int32_t effective_cell_to_output_scale_b,
+    const int8_t* projection_weight_ptr, int32_t effective_proj_scale_a,
+    int32_t effective_proj_scale_b, const int16_t* layer_norm_input_weight_ptr,
     int32_t layer_norm_input_scale_a, int32_t layer_norm_input_scale_b,
     const int16_t* layer_norm_forget_weight_ptr,
     int32_t layer_norm_forget_scale_a, int32_t layer_norm_forget_scale_b,
@@ -1360,7 +1360,7 @@ void LstmStepInteger(
     int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b,
     const int32_t* input_gate_bias_ptr, const int32_t* forget_gate_bias_ptr,
     const int32_t* cell_gate_bias_ptr, const int32_t* output_gate_bias_ptr,
-    const int32_t* proj_bias_ptr, const TfLiteLSTMParams* params,
+    const int32_t* projection_bias_ptr, const TfLiteLSTMParams* params,
     const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b,
     const int32_t* intermediate_zp, int16_t quantized_cell_clip,
     int8_t quantized_proj_clip, int n_batch, int n_cell, int n_input,
@@ -1476,8 +1476,9 @@ void LstmStepInteger(
 
   // Projection.
   tensor_utils::MatrixBatchVectorMultiply(
-      scratch3, proj_weight_ptr, effective_proj_scale_a, effective_proj_scale_b,
-      proj_bias_ptr, n_batch, n_cell, n_output, output_state_zp, output_ptr);
+      scratch3, projection_weight_ptr, effective_proj_scale_a,
+      effective_proj_scale_b, projection_bias_ptr, n_batch, n_cell, n_output,
+      output_state_zp, output_ptr);
 
   // Projection clipping.
   if (quantized_proj_clip > 0) {
@@ -2113,7 +2114,8 @@ TfLiteStatus EvalInteger8x8_8(
       GetTensorData<int8_t>(recurrent_to_output_weights);
   const int8_t* cell_to_output_weight_ptr =
       GetTensorData<int8_t>(cell_to_output_weights);
-  const int8_t* proj_weight_ptr = GetTensorData<int8_t>(projection_weights);
+  const int8_t* projection_weight_ptr =
+      GetTensorData<int8_t>(projection_weights);
   const int16_t* layer_norm_input_weight_ptr =
       GetTensorData<int16_t>(input_layer_norm_coefficients);
   const int16_t* layer_norm_forget_weight_ptr =
@@ -2128,7 +2130,7 @@ TfLiteStatus EvalInteger8x8_8(
   const int32_t* cell_gate_bias_ptr = GetTensorData<int32_t>(cell_gate_bias);
   const int32_t* output_gate_bias_ptr =
       GetTensorData<int32_t>(output_gate_bias);
-  const int32_t* proj_bias_ptr = GetTensorData<int32_t>(projection_bias);
+  const int32_t* projection_bias_ptr = GetTensorData<int32_t>(projection_bias);
   int16_t* cell_ptr = GetTensorData<int16_t>(cell_state);
   int8_t* output_state_ptr = GetTensorData<int8_t>(output_state);
   int8_t* output_ptr = nullptr;
@@ -2195,7 +2197,7 @@ TfLiteStatus EvalInteger8x8_8(
         integer_lstm_param->effective_cell_to_output_scale_a,
         integer_lstm_param->effective_cell_to_output_scale_b,
 
-        proj_weight_ptr, integer_lstm_param->effective_proj_scale_a,
+        projection_weight_ptr, integer_lstm_param->effective_proj_scale_a,
         integer_lstm_param->effective_proj_scale_b,
 
         layer_norm_input_weight_ptr,
@@ -2214,7 +2216,7 @@ TfLiteStatus EvalInteger8x8_8(
         integer_lstm_param->layer_norm_output_scale_b,
 
         input_gate_bias_ptr, forget_gate_bias_ptr, cell_gate_bias_ptr,
-        output_gate_bias_ptr, proj_bias_ptr,
+        output_gate_bias_ptr, projection_bias_ptr,
 
         params, integer_lstm_param->intermediate_scale_a,
         integer_lstm_param->intermediate_scale_b,

From bd98ba765aac3e20c9db89131aa3016964749133 Mon Sep 17 00:00:00 2001
From: Haoyu Zhang 
Date: Fri, 19 Jun 2020 13:00:32 -0700
Subject: [PATCH 0648/1390] Fix cancellation race condition in
 BaseRendezvousMgr::RegisterCall

PiperOrigin-RevId: 317363743
Change-Id: Ide89dd360a9885b5e8f67b12f362cbce8cb85d80
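
The hunks below collapse the two mutexes (init_mu_ and active_mu_) into a
single mu_, so that checking the abort status, registering the cancellation
callback, and inserting the call into the active-call map can no longer
interleave with StartAbort(). A minimal standalone sketch of that single-lock
pattern (illustrative names only, not the TensorFlow classes):

    #include <functional>
    #include <mutex>
    #include <unordered_map>

    struct Call {
      void StartAbort(const char* reason) { (void)reason; /* notify waiter */ }
    };

    class CallRegistry {
     public:
      // The status check and the bookkeeping happen under the same lock, so
      // an abort cannot slip in between the two steps.
      void Register(Call* call, bool already_cancelled) {
        std::lock_guard<std::mutex> l(mu_);
        if (aborted_) {
          call->StartAbort("rendezvous aborted");
          return;
        }
        if (already_cancelled) {
          call->StartAbort("request cancelled");
        } else {
          active_.emplace(call, [] {});  // placeholder deregistration callback
        }
      }

      void Abort() {
        std::lock_guard<std::mutex> l(mu_);
        aborted_ = true;
        for (auto& entry : active_) entry.first->StartAbort("aborted");
        active_.clear();
      }

     private:
      std::mutex mu_;
      bool aborted_ = false;
      std::unordered_map<Call*, std::function<void()>> active_;
    };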
---
 .../base_rendezvous_mgr.cc                    | 67 +++++++++----------
 .../distributed_runtime/base_rendezvous_mgr.h | 16 ++---
 .../rpc/rpc_rendezvous_mgr.cc                 |  1 +
 3 files changed, 37 insertions(+), 47 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 7849e094cb9..4b398e4ecef 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -139,7 +139,7 @@ Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
   CHECK_NE(session, nullptr) << "session must not be null!";
   std::vector<DeferredCall> deferred_calls;
   {
-    mutex_lock l(init_mu_);
+    mutex_lock l(mu_);
     if (session_ != nullptr) {
       if (session_->worker_name() == session->worker_name()) {
         VLOG(1) << "Skipping rendezvous re-initialization.";
@@ -161,12 +161,12 @@ Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
 }
 
 WorkerSession* BaseRemoteRendezvous::session() {
-  tf_shared_lock l(init_mu_);
+  tf_shared_lock l(mu_);
   return session_;
 }
 
 bool BaseRemoteRendezvous::is_initialized() {
-  tf_shared_lock l(init_mu_);
+  tf_shared_lock l(mu_);
   return is_initialized_locked();
 }
 
@@ -176,7 +176,7 @@ Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
   VLOG(1) << "BaseRemoteRendezvous Send " << this << " " << parsed.FullKey();
   WorkerSession* sess = nullptr;
   {
-    tf_shared_lock l(init_mu_);
+    tf_shared_lock l(mu_);
     if (!status_.ok()) return status_;
     DCHECK(is_initialized_locked());
     sess = session_;
@@ -198,7 +198,7 @@ Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed,
   // (e.g. calling session())
   WorkerSession* sess = nullptr;
   {
-    tf_shared_lock l(init_mu_);
+    tf_shared_lock l(mu_);
     if (!status_.ok()) return status_;
     if (!is_initialized_locked()) {
       return errors::Internal("ValidateDevices called before initialization.");
@@ -345,7 +345,7 @@ void BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed,
   // Test whether the rendezvous is initialized using a shared lock, to avoid
   // the need for exclusive access in the common case.
   if (TF_PREDICT_FALSE(!is_initialized())) {
-    mutex_lock l(init_mu_);
+    mutex_lock l(mu_);
     if (!is_initialized_locked()) {
       // RecvLocalAsync can be called (due to an incoming RecvTensor RPC from a
       // remote worker) before the RunStep (or PartialRunStep) RPC from the
@@ -386,8 +386,7 @@ void BaseRemoteRendezvous::StartAbort(const Status& s) {
   local_->StartAbort(derived_status);
   {
     // Aborts all active RecvTensor calls.
-    mutex_lock l(init_mu_);
-    mutex_lock l2(active_mu_);
+    mutex_lock l(mu_);
     if (status_.ok()) {
       status_ = derived_status;
       for (auto& entry : active_) {
@@ -402,42 +401,36 @@ void BaseRemoteRendezvous::StartAbort(const Status& s) {
 void BaseRemoteRendezvous::RegisterCall(BaseRecvTensorCall* call,
                                         const Rendezvous::Args& args) {
   CancellationManager* cm = args.cancellation_manager;
-  Status captured_status;
-  {
-    tf_shared_lock l(init_mu_);
-    if (!status_.ok()) {
-      captured_status = status_;
-    }
-  }
-  if (!captured_status.ok()) {
-    call->StartAbort(captured_status);
-    return;
-  }
-
   bool already_cancelled = false;
   InactiveCallback callback = [] {};
-  if (cm != nullptr) {
-    auto token = cm->get_cancellation_token();
-    already_cancelled = !cm->RegisterCallback(token, [this, call] {
-      {
-        tf_shared_lock l(active_mu_);
-        if (active_.find(call) == active_.end()) return;
-      }
+  {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      call->StartAbort(status_);
+      return;
+    }
+    if (cm != nullptr) {
+      auto token = cm->get_cancellation_token();
+      already_cancelled = !cm->RegisterCallback(token, [this, call] {
+        {
+          mutex_lock l(mu_);
+          if (active_.find(call) == active_.end()) return;
+          call->StartAbort(
+              errors::Cancelled("RecvFromRemoteAsync is cancelled."));
+        }
+      });
+      callback = [cm, token] { cm->TryDeregisterCallback(token); };
+    }
+    if (already_cancelled) {
       call->StartAbort(errors::Cancelled("RecvFromRemoteAsync is cancelled."));
-    });
-    callback = [cm, token] { cm->TryDeregisterCallback(token); };
-  }
-
-  if (already_cancelled) {
-    call->StartAbort(errors::Cancelled("RecvFromRemoteAsync is cancelled."));
-  } else {
-    mutex_lock l(active_mu_);
-    CHECK(active_.emplace(call, callback).second);
+    } else {
+      CHECK(active_.emplace(call, callback).second);
+    }
   }
 }
 
 void BaseRemoteRendezvous::DeregisterCall(BaseRecvTensorCall* call) {
-  mutex_lock l(active_mu_);
+  mutex_lock l(mu_);
   auto it = active_.find(call);
   if (it != active_.end()) {
     // Deregister the cancellation callback, if one was registered.
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index afa0f74ea2c..63409a31549 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -174,14 +174,12 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
  private:
   Rendezvous* local_;  // Owns a Ref on this object.
 
-  // Guards mutable state that is read-mostly after this rendezvous is
-  // initialized.
-  mutable mutex init_mu_;
+  mutable mutex mu_;
 
   // Status given by StartAbort() if any.
-  Status status_ TF_GUARDED_BY(init_mu_);
+  Status status_ TF_GUARDED_BY(mu_);
 
-  WorkerSession* session_ TF_GUARDED_BY(init_mu_);  // Not owned.
+  WorkerSession* session_ TF_GUARDED_BY(mu_);  // Not owned.
 
   // Data structures to handle calls when partially initialized.
   struct DeferredCall {
@@ -190,16 +188,14 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
 
     DeferredCall(const ParsedKey& parsed, DoneCallback done);
   };
-  std::vector<DeferredCall> deferred_calls_ TF_GUARDED_BY(init_mu_);
+  std::vector<DeferredCall> deferred_calls_ TF_GUARDED_BY(mu_);
 
   typedef std::function<void()> InactiveCallback;
 
-  // Active outstanding RecvTensor calls.
-  mutex active_mu_;
   std::unordered_map<BaseRecvTensorCall*, InactiveCallback> active_
-      TF_GUARDED_BY(active_mu_);
+      TF_GUARDED_BY(mu_);
 
-  bool is_initialized_locked() TF_SHARED_LOCKS_REQUIRED(init_mu_) {
+  bool is_initialized_locked() TF_SHARED_LOCKS_REQUIRED(mu_) {
     return session_ != nullptr;
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 512c17fcfcf..89fe6ced725 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -282,6 +282,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // callback.
     call->ReleaseWorker(sess->worker_cache());
     call->done()(call->status(), Args(), Args(), Tensor(), false);
+    DeregisterCall(call);
     get_call_freelist()->Release(call);
     return;
   }

From 5ed030734c63118c3fe09470a1d2da6af204a0eb Mon Sep 17 00:00:00 2001
From: Andrew Audibert 
Date: Fri, 19 Jun 2020 13:03:24 -0700
Subject: [PATCH 0649/1390] [tf.data service] Avoid holding locks during RPC
 calls.

This CL fixes a deadlock where a worker holds its lock while making an RPC to the master, and the master holds its lock while making an RPC to the worker. Serving those RPCs requires taking the same locks, so the two end up deadlocked.

We can avoid this by never holding a lock while performing RPCs. This CL modifies the master locking to release the lock when making the `ProcessTask` RPC to the worker.

This change shouldn't affect any functionality - it should only reduce the scope of some locking.

PiperOrigin-RevId: 317364346
Change-Id: I21e5ed8cdaced1192a89ffda4f8f93418e5dc4a5
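
A minimal sketch of the locking discipline this change adopts (illustrative
names, not the actual master/worker classes): copy the shared state while
holding the mutex, release it, and only then issue the blocking RPC, so two
services that call into each other can never deadlock on their locks.

    #include <memory>
    #include <mutex>
    #include <string>
    #include <vector>

    struct Worker {
      // Stand-in for the blocking ProcessTask RPC to a remote worker.
      void ProcessTaskRpc(const std::string& task) { (void)task; }
    };

    class Master {
     public:
      void AssignTask(const std::string& task) {
        std::vector<std::shared_ptr<Worker>> workers;
        {
          std::lock_guard<std::mutex> l(mu_);
          workers = workers_;  // snapshot the worker list under the lock
        }
        // The lock is released here, so the RPCs below cannot deadlock
        // against a worker that is simultaneously calling back into us.
        for (const auto& worker : workers) worker->ProcessTaskRpc(task);
      }

     private:
      std::mutex mu_;
      std::vector<std::shared_ptr<Worker>> workers_;
    };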
---
 tensorflow/core/data/service/master_impl.cc | 131 ++++++++++++--------
 tensorflow/core/data/service/master_impl.h  |  19 ++-
 2 files changed, 94 insertions(+), 56 deletions(-)

diff --git a/tensorflow/core/data/service/master_impl.cc b/tensorflow/core/data/service/master_impl.cc
index 37a884d540e..5c7917b4154 100644
--- a/tensorflow/core/data/service/master_impl.cc
+++ b/tensorflow/core/data/service/master_impl.cc
@@ -61,7 +61,8 @@ Status DataServiceMasterImpl::RegisterWorker(
   VLOG(3) << "Received register worker request";
   mutex_lock l(mu_);
   int64 worker_id = next_worker_id_++;
-  workers_.emplace_back(worker_id, request->worker_address());
+  workers_.push_back(
+      std::make_shared<Worker>(worker_id, request->worker_address()));
   response->set_worker_id(worker_id);
 
   // Allocate tasks to the worker.
@@ -70,17 +71,18 @@ Status DataServiceMasterImpl::RegisterWorker(
     if (job->finished()) {
       continue;
     }
-    int64 task_id = CreateTask(job.get(), request->worker_address());
+    const Task& task = CreateTaskLocked(job.get(), request->worker_address());
 
     TaskDef* task_def = response->add_tasks();
     *task_def->mutable_dataset() =
         datasets_by_id_[job->dataset_id()]->dataset_def();
     task_def->set_dataset_id(job->dataset_id());
     task_def->set_job_id(job->job_id());
-    task_def->set_task_id(task_id);
+    task_def->set_task_id(task.task_id());
   }
 
-  VLOG(1) << "Registered worker " << workers_.back().DebugString();
+  VLOG(1) << "Registered worker at address " << request->worker_address()
+          << " with id " << worker_id;
   return Status::OK();
 }
 
@@ -145,7 +147,6 @@ Status DataServiceMasterImpl::CreateJob(const CreateJobRequest* request,
   VLOG(3) << "Received create job request for dataset id "
           << request->dataset_id();
   ProcessingMode processing_mode = ProcessingMode(request->processing_mode());
-  mutex_lock l(mu_);
   int64 job_id;
   TF_RETURN_IF_ERROR(CreateJob(request->dataset_id(), processing_mode,
                                absl::optional(), &job_id));
@@ -161,25 +162,30 @@ Status DataServiceMasterImpl::GetOrCreateJob(
   VLOG(3) << "Received get or create job request for dataset id "
           << request->dataset_id() << " with name " << request->job_name()
           << " and index " << request->job_name_index();
-  mutex_lock l(mu_);
   NamedJobKey key(request->job_name(), request->job_name_index());
   ProcessingMode requested_processing_mode =
       ProcessingMode(request->processing_mode());
-  std::shared_ptr<Job>* job = gtl::FindOrNull(named_jobs_, key);
-  if (job != nullptr) {
-    TF_RETURN_IF_ERROR(ValidateMatchingJob(**job, requested_processing_mode,
-                                           request->dataset_id()));
-    int64 job_id = (*job)->job_id();
-    response->set_job_id(job_id);
-    VLOG(3) << "Found existing job for name=" << request->job_name()
-            << ", index=" << request->job_name_index()
-            << ". job_id: " << job_id;
-    return Status::OK();
+  {
+    mutex_lock l(mu_);
+    std::shared_ptr<Job>* job = gtl::FindOrNull(named_jobs_, key);
+    if (job != nullptr) {
+      TF_RETURN_IF_ERROR(ValidateMatchingJob(**job, requested_processing_mode,
+                                             request->dataset_id()));
+      int64 job_id = (*job)->job_id();
+      response->set_job_id(job_id);
+      VLOG(3) << "Found existing job for name=" << request->job_name()
+              << ", index=" << request->job_name_index()
+              << ". job_id: " << job_id;
+      return Status::OK();
+    }
   }
   int64 job_id;
   TF_RETURN_IF_ERROR(CreateJob(request->dataset_id(), requested_processing_mode,
                                request->job_name(), &job_id));
-  named_jobs_[key] = jobs_[job_id];
+  {
+    mutex_lock l(mu_);
+    named_jobs_[key] = jobs_[job_id];
+  }
   response->set_job_id(job_id);
   VLOG(3) << "Created job " << job_id << " for dataset "
           << request->dataset_id() << " and name " << request->job_name();
@@ -211,8 +217,7 @@ Status DataServiceMasterImpl::ValidateMatchingJob(
 Status DataServiceMasterImpl::CreateJob(int64 dataset_id,
                                         ProcessingMode processing_mode,
                                         absl::optional<std::string> job_name,
-                                        int64* out_job_id)
-    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                        int64* out_job_id) LOCKS_EXCLUDED(mu_) {
   switch (processing_mode) {
     case ProcessingMode::PARALLEL_EPOCHS:
       break;
@@ -225,41 +230,64 @@ Status DataServiceMasterImpl::CreateJob(int64 dataset_id,
                                    ProcessingModeToString(processing_mode),
                                    " not recognized");
   }
-  if (!datasets_by_id_.contains(dataset_id)) {
-    return errors::NotFound("Dataset id: <", dataset_id, "> not found.");
+  std::shared_ptr<Job> job;
+  std::vector<std::shared_ptr<Worker>> workers;
+  {
+    mutex_lock l(mu_);
+    if (!datasets_by_id_.contains(dataset_id)) {
+      return errors::NotFound("Dataset id: <", dataset_id, "> not found.");
+    }
+
+    int64 job_id = next_job_id_++;
+    DCHECK(!jobs_.contains(job_id));
+    job = std::make_shared<Job>(job_id, dataset_id, processing_mode, job_name);
+    jobs_[job_id] = job;
+
+    // Copy workers_ so that we can iterate through the workers without holding
+    // the lock. When a new worker is added in `RegisterWorker`, we iterate
+    // through the jobs in `jobs_` and give it a task for each job. So even if a
+    // new worker is registered after we release the lock, because this job has
+    // been added to `jobs_`, it will still receive a task for this job.
+    workers = workers_;
+    const Dataset& dataset = *datasets_by_id_[dataset_id];
+    if (VLOG_IS_ON(1)) {
+      VLOG(1) << "Sending tasks to workers for job " << job->job_id()
+              << ". Dataset id: " << dataset_id
+              << ". Dataset fingerprint: " << dataset.fingerprint()
+              << ". Dataset definition size: "
+              << datasets_by_id_[dataset_id]->dataset_def().ByteSizeLong();
+    }
   }
 
-  int64 job_id = next_job_id_++;
-  DCHECK(!jobs_.contains(job_id));
-  auto job =
-      std::make_shared<Job>(job_id, dataset_id, processing_mode, job_name);
-  jobs_[job_id] = job;
-
-  for (auto& worker : workers_) {
-    int64 task_id = CreateTask(job.get(), worker.address());
-
-    // TODO(aaudibert): perform these calls asynchronously.
-    // TODO(aaudibert): clean up in case some calls succeed, but later calls
-    // fail
-    TF_RETURN_IF_ERROR(AllocateTaskToWorker(tasks_.at(task_id), &worker));
+  for (auto& worker : workers) {
+    const Task& task = CreateTask(job.get(), worker->address());
+    Status s = AllocateTaskToWorker(task, worker.get());
+    if (!s.ok()) {
+      LOG(WARNING) << "Failed to allocate task with id " << task.task_id()
+                   << " to worker at address " << worker->address() << ": "
+                   << s.error_message();
+    }
   }
+  VLOG(1) << "Done sending tasks to workers for job " << job->job_id();
 
-  *out_job_id = job_id;
+  *out_job_id = job->job_id();
   return Status::OK();
 }
 
-int64 DataServiceMasterImpl::CreateTask(Job* job,
-                                        const std::string& worker_address)
-    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+const DataServiceMasterImpl::Task& DataServiceMasterImpl::CreateTask(
+    Job* job, const std::string& worker_address) LOCKS_EXCLUDED(mu_) {
+  mutex_lock l(mu_);
+  return CreateTaskLocked(job, worker_address);
+}
+
+const DataServiceMasterImpl::Task& DataServiceMasterImpl::CreateTaskLocked(
+    Job* job, const std::string& worker_address) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
   int64 task_id = next_task_id_++;
   DCHECK(!tasks_.contains(task_id));
-  auto result =
-      tasks_.emplace(std::piecewise_construct, std::forward_as_tuple(task_id),
-                     std::forward_as_tuple(task_id, job->job_id(),
-                                           job->dataset_id(), worker_address));
+  tasks_.insert({task_id, Task(task_id, job->job_id(), job->dataset_id(),
+                               worker_address)});
   job->add_task_id(task_id);
-  DCHECK(result.second);
-  return task_id;
+  return tasks_.at(task_id);
 }
 
 Status DataServiceMasterImpl::EnsureWorkerStubInitialized(Worker* worker) {
@@ -273,14 +301,17 @@ Status DataServiceMasterImpl::EnsureWorkerStubInitialized(Worker* worker) {
 
 Status DataServiceMasterImpl::AllocateTaskToWorker(const Task& task,
                                                    Worker* worker)
-    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    LOCKS_EXCLUDED(mu_) {
   TF_RETURN_IF_ERROR(EnsureWorkerStubInitialized(worker));
   grpc::ClientContext client_ctx;
   ProcessTaskRequest req;
   req.mutable_task()->set_dataset_id(task.dataset_id());
-  DCHECK(datasets_by_id_.contains(task.dataset_id()));
-  *req.mutable_task()->mutable_dataset() =
-      datasets_by_id_.at(task.dataset_id())->dataset_def();
+  {
+    mutex_lock l(mu_);
+    DCHECK(datasets_by_id_.contains(task.dataset_id()));
+    *req.mutable_task()->mutable_dataset() =
+        datasets_by_id_.at(task.dataset_id())->dataset_def();
+  }
   req.mutable_task()->set_task_id(task.task_id());
   ProcessTaskResponse resp;
   grpc::Status s = worker->stub()->ProcessTask(&client_ctx, req, &resp);
@@ -321,8 +352,8 @@ Status DataServiceMasterImpl::GetWorkers(const GetWorkersRequest* request,
   VLOG(3) << "Enter GetWorkers";
   for (auto& worker : workers_) {
     WorkerInfo* info = response->add_workers();
-    info->set_address(worker.address());
-    info->set_id(worker.worker_id());
+    info->set_address(worker->address());
+    info->set_id(worker->worker_id());
   }
   VLOG(3) << "Returning list of " << workers_.size()
           << " workers from GetWorkers";
diff --git a/tensorflow/core/data/service/master_impl.h b/tensorflow/core/data/service/master_impl.h
index 0dc049a389c..67df2613118 100644
--- a/tensorflow/core/data/service/master_impl.h
+++ b/tensorflow/core/data/service/master_impl.h
@@ -177,16 +177,23 @@ class DataServiceMasterImpl {
   };
 
   // Registers a dataset with the given fingerprint, returning a new dataset id.
-  int64 RegisterDataset(uint64 fingerprint, const DatasetDef& dataset);
+  int64 RegisterDataset(uint64 fingerprint, const DatasetDef& dataset)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
   // Initializes a workers stub, if it hasn't been initialized already.
   Status EnsureWorkerStubInitialized(Worker* worker);
   // Instructs a worker to begin processing a task.
-  Status AllocateTaskToWorker(const Task& task_id, Worker* worker);
+  Status AllocateTaskToWorker(const Task& task_id, Worker* worker)
+      LOCKS_EXCLUDED(mu_);
   // Creates a job and stores its job_id in `*job_id`.
   Status CreateJob(int64 dataset_id, ProcessingMode processing_mode,
-                   absl::optional<std::string> job_name, int64* out_job_id);
-  // Creates a new task for a job, returning the new task's id.
-  int64 CreateTask(Job* job, const std::string& worker_address);
+                   absl::optional<std::string> job_name, int64* out_job_id)
+      LOCKS_EXCLUDED(mu_);
+  // Creates a new task for a job, returning a reference to the task.
+  const Task& CreateTask(Job* job, const std::string& worker_address)
+      LOCKS_EXCLUDED(mu_);
+  // Same as `CreateTask`, but expects that the master lock is already held.
+  const Task& CreateTaskLocked(Job* job, const std::string& worker_address)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
   // Validates that an existing job matches the given processing_mode and
   // dataset_id, returning an error status describing any difference.
   Status ValidateMatchingJob(const Job& job, ProcessingMode processing_mode,
@@ -202,7 +209,7 @@ class DataServiceMasterImpl {
   int64 next_task_id_ TF_GUARDED_BY(mu_) = 0;
 
   // Registered workers.
-  std::vector<Worker> workers_ TF_GUARDED_BY(mu_);
+  std::vector<std::shared_ptr<Worker>> workers_ TF_GUARDED_BY(mu_);
   // Registered datasets, keyed by dataset ids.
   absl::flat_hash_map<int64, std::shared_ptr<Dataset>> datasets_by_id_
       TF_GUARDED_BY(mu_);

From 3ae2cf96105c32a71db7796b901e3c8ae5d8f71a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 13:05:16 -0700
Subject: [PATCH 0650/1390] Suppress Clang error: unused variable 'row_base'
 when building with MSAN instrumentation

PiperOrigin-RevId: 317364738
Change-Id: Ic55fdfa9520f341d254aa42333da67ae5d340680
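
As a hedged illustration only (the actual two lines added to
eigen_contraction_kernel.h may use a different macro or guard), a common way
to silence an unused-variable diagnostic for a value that is only read on
some build configurations is an explicit void cast:

    void PackRow(const float* data, int stride, int row) {
      const float* row_base = data + row * stride;
      (void)row_base;  // referenced only on non-MSAN code paths
    }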
---
 tensorflow/core/kernels/eigen_contraction_kernel.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 7233020c1c1..ef4b9dbc012 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -171,6 +171,7 @@ struct mkldnn_gemm_kernel
Date: Fri, 19 Jun 2020 13:06:52 -0700
Subject: [PATCH 0651/1390] Remove `run_deprecated_v1` annotations from
 collective ops tests.

PiperOrigin-RevId: 317365063
Change-Id: Ibf13ad8629947becd40038d41ee213d3466b6292
---
 tensorflow/python/ops/collective_ops_test.py | 380 ++++++++++---------
 1 file changed, 203 insertions(+), 177 deletions(-)

diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 9727593a1c5..8e3a95d7dbf 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -104,39 +104,42 @@ class CollectiveOpTest(test.TestCase):
     for i in range(group_size * num_instances):
       self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testCollectiveReduce(self):
-    self._testCollectiveReduce(
-        inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
-                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
-        expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
-        set_graph_key=True)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
+          expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
+          set_graph_key=True)
 
-  @test_util.run_deprecated_v1
   def testCollectiveAutoGraphKey(self):
-    self._testCollectiveReduce(
-        inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
-                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
-        expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
-        set_graph_key=False)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
+          expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
+          set_graph_key=False)
 
-  @test_util.run_deprecated_v1
   def testFp16Reduce(self):
-    self._testCollectiveReduce(
-        inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
-                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
-        expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
-        set_graph_key=True,
-        fp16=True)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
+          expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
+          set_graph_key=True,
+          fp16=True)
 
-  @test_util.run_deprecated_v1
   def testCollectiveMultipleConcurrentReduce(self):
-    self._testMultipleConcurrentCollectiveReduce(
-        [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
-        [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
-        [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2])
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testMultipleConcurrentCollectiveReduce(
+          [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+          [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
+          [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2])
 
-  @test_util.run_deprecated_v1
   def testCollectiveTimeoutV1(self):
     timeout = 4.5
     kwargs = dict(
@@ -145,14 +148,17 @@ class CollectiveOpTest(test.TestCase):
         set_graph_key=True,
         timeout=timeout)
 
-    self._testCollectiveReduce(**kwargs)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(**kwargs)
 
     start_time = time.time()
-    with self.assertRaisesRegex(
-        errors.DeadlineExceededError,
-        'Collective has timed out waiting for other workers'):
-      self._testCollectiveReduce(
-          reported_group_size=len(kwargs['inputs']) + 1, **kwargs)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegex(
+          errors.DeadlineExceededError,
+          'Collective has timed out waiting for other workers'):
+        self._testCollectiveReduce(
+            reported_group_size=len(kwargs['inputs']) + 1, **kwargs)
     elapsed = time.time() - start_time
     self.assertAllGreaterEqual(elapsed, timeout)
 
@@ -199,17 +205,18 @@ class CollectiveOpTest(test.TestCase):
     elapsed = time.time() - start_time
     self.assertAllGreaterEqual(elapsed, timeout)
 
-  @test_util.run_deprecated_v1
   def testNcclHintFallbackToRingReduce(self):
     """Tests that setting `communication_hint=nccl` works on non-GPU builds."""
     if kernels.get_registered_kernels_for_op('NcclAllReduce'):
       self.skipTest('Run only on non-GPU environments')
-    self._testCollectiveReduce(
-        inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
-                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
-        expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
-        set_graph_key=False,
-        communication_hint='nccl')
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]],
+          expected=[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2],
+          set_graph_key=False,
+          communication_hint='nccl')
 
   def _testWhile(self, num_vars, num_iterations, key_base):
     group_size = 2
@@ -262,15 +269,16 @@ class CollectiveOpTest(test.TestCase):
           [((1 << (num_iterations + v)) * 1.) for v in range(num_vars)]
           for _ in range(group_size)])
 
-  @test_util.run_deprecated_v1
   def testSimpleWhile(self):
-    self._testWhile(num_vars=1, num_iterations=4, key_base=20)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testWhile(num_vars=1, num_iterations=4, key_base=20)
 
-  @test_util.run_deprecated_v1
   def testWhileMultipleAllReduce(self):
-    self._testWhile(num_vars=2, num_iterations=4, key_base=20)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testWhile(num_vars=2, num_iterations=4, key_base=20)
 
-  @test_util.run_deprecated_v1
   def testWhileWithScopedAllocator(self):
     group_size = 2
     group_key = 1
@@ -284,47 +292,52 @@ class CollectiveOpTest(test.TestCase):
     del rewrite_options.scoped_allocator_opts.enable_op[:]
     rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
 
-    with self.session(config=config) as sess:
-      run_ops = []
-      for i in range(group_size):
-        with ops.device('CPU:%d' % i):
-          constant = constant_op.constant(0.)
-          cond = lambda i: math_ops.less(i, 10.)
-          body = lambda i: math_ops.add(i, 1.)
-          input0 = control_flow_ops.while_loop(cond, body, [constant])
-          input1 = math_ops.add(constant, 5)
-          colred0 = collective_ops.all_reduce(input0, group_size, group_key,
-                                              instance_key0, 'Add', 'Id')
-          colred1 = collective_ops.all_reduce(input1, group_size, group_key,
-                                              instance_key1, 'Add', 'Id')
-          run_ops.append(math_ops.add_n([colred0, colred1]))
-      results = sess.run(run_ops)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      with self.session(config=config) as sess:
+        run_ops = []
+        for i in range(group_size):
+          with ops.device('CPU:%d' % i):
+            constant = constant_op.constant(0.)
+            cond = lambda i: math_ops.less(i, 10.)
+            body = lambda i: math_ops.add(i, 1.)
+            input0 = control_flow_ops.while_loop(cond, body, [constant])
+            input1 = math_ops.add(constant, 5)
+            colred0 = collective_ops.all_reduce(input0, group_size, group_key,
+                                                instance_key0, 'Add', 'Id')
+            colred1 = collective_ops.all_reduce(input1, group_size, group_key,
+                                                instance_key1, 'Add', 'Id')
+            run_ops.append(math_ops.add_n([colred0, colred1]))
+        results = sess.run(run_ops)
       self.assertEqual(results, [30., 30.])
 
-  @test_util.run_deprecated_v1
   def testCollectiveReduceScalar(self):
-    self._testCollectiveReduce(inputs=[0.1, 0.3], expected=0.2,
-                               set_graph_key=True)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(inputs=[0.1, 0.3], expected=0.2,
+                                 set_graph_key=True)
 
-  @test_util.run_deprecated_v1
   def testCollectiveReduceMaximum(self):
-    self._testCollectiveReduce(
-        inputs=[[1., 20., 3., 40., 5.], [10., 2., 30., 4., 50.]],
-        expected=[10., 20., 30., 40., 50.],
-        set_graph_key=True,
-        instance_key=30,
-        merge_op='Max',
-        final_op='Id')
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[1., 20., 3., 40., 5.], [10., 2., 30., 4., 50.]],
+          expected=[10., 20., 30., 40., 50.],
+          set_graph_key=True,
+          instance_key=30,
+          merge_op='Max',
+          final_op='Id')
 
-  @test_util.run_deprecated_v1
   def testCollectiveReduceMinimum(self):
-    self._testCollectiveReduce(
-        inputs=[[1., 20., 3., 40., 5.], [10., 2., 30., 4., 50.]],
-        expected=[1., 2., 3., 4., 5.],
-        set_graph_key=True,
-        instance_key=40,
-        merge_op='Min',
-        final_op='Id')
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveReduce(
+          inputs=[[1., 20., 3., 40., 5.], [10., 2., 30., 4., 50.]],
+          expected=[1., 2., 3., 4., 5.],
+          set_graph_key=True,
+          instance_key=40,
+          merge_op='Min',
+          final_op='Id')
 
   def _testCollectiveBroadcast(self, in_val):
     group_key = 1
@@ -345,13 +358,15 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], in_val, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], in_val, rtol=1e-5, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testCollectiveBroadcast(self):
-    self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
-  @test_util.run_deprecated_v1
   def testCollectiveBroadcastBool(self):
-    self._testCollectiveBroadcast([True, False])
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveBroadcast([True, False])
 
   def _testCollectiveGather(self, t0, t1, expected, set_graph_key):
     group_key = 1
@@ -371,94 +386,101 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testCollectiveGather(self):
-    self._testCollectiveGather([0, 1, 2, 3, 4, 5, 6, 7],
-                               [10, 11, 12, 13, 14, 15, 16, 17],
-                               [0, 1, 2, 3, 4, 5, 6, 7,
-                                10, 11, 12, 13, 14, 15, 16, 17],
-                               True)
-    self._testCollectiveGather([[0, 1, 2, 3], [4, 5, 6, 7]],
-                               [[10, 11, 12, 13], [14, 15, 16, 17]],
-                               [[0, 1, 2, 3], [4, 5, 6, 7],
-                                [10, 11, 12, 13], [14, 15, 16, 17]],
-                               True)
-    self._testCollectiveGather([[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
-                               [[[10, 11], [12, 13]], [[14, 15], [16, 17]]],
-                               [[[0, 1], [2, 3]], [[4, 5], [6, 7]],
-                                [[10, 11], [12, 13]], [[14, 15], [16, 17]]],
-                               True)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      self._testCollectiveGather([0, 1, 2, 3, 4, 5, 6, 7],
+                                 [10, 11, 12, 13, 14, 15, 16, 17],
+                                 [0, 1, 2, 3, 4, 5, 6, 7,
+                                  10, 11, 12, 13, 14, 15, 16, 17],
+                                 True)
+      self._testCollectiveGather([[0, 1, 2, 3], [4, 5, 6, 7]],
+                                 [[10, 11, 12, 13], [14, 15, 16, 17]],
+                                 [[0, 1, 2, 3], [4, 5, 6, 7],
+                                  [10, 11, 12, 13], [14, 15, 16, 17]],
+                                 True)
+      self._testCollectiveGather([[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
+                                 [[[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                                 [[[0, 1], [2, 3]], [[4, 5], [6, 7]],
+                                  [[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                                 True)
 
-  @test_util.run_deprecated_v1
   def testCollectiveGatherShapeMismatch(self):
     group_key = 1
     instance_key = 1
     t0 = [1, 2, 3, 4]
     t1 = [5, 6, 7, 8]
     t2 = [9, 10]
-    with self.session(
-        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
-      with ops.device('/CPU:0'):
-        in0 = constant_op.constant(t0)
-        c0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
-      with ops.device('/CPU:1'):
-        in1 = constant_op.constant(t1)
-        in2 = constant_op.constant(t2)
-        c1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
-        c2 = collective_ops.all_gather(in2, 2, group_key, instance_key)
-      run_options = config_pb2.RunOptions()
-      run_options.experimental.collective_graph_key = 1
-      sess.run([c0, c1], options=run_options)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   'Shape mismatch'):
-        sess.run([c0, c2], options=run_options)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      with self.session(
+          config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
+        with ops.device('/CPU:0'):
+          in0 = constant_op.constant(t0)
+          c0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
+        with ops.device('/CPU:1'):
+          in1 = constant_op.constant(t1)
+          in2 = constant_op.constant(t2)
+          c1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
+          c2 = collective_ops.all_gather(in2, 2, group_key, instance_key)
+        run_options = config_pb2.RunOptions()
+        run_options.experimental.collective_graph_key = 1
+        sess.run([c0, c1], options=run_options)
+        with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                     'Shape mismatch'):
+          sess.run([c0, c2], options=run_options)
 
-  @test_util.run_deprecated_v1
   def testCollectiveGatherShapeMismatchAcrossDevices(self):
     group_key = 1
     instance_key = 1
     t0 = [1, 2, 3, 4]
     t1 = [5, 6]
-    with self.session(
-        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
-      with ops.device('/CPU:0'):
-        in0 = constant_op.constant(t0)
-        c0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
-      with ops.device('/CPU:1'):
-        in1 = constant_op.constant(t1)
-        c1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
-      run_options = config_pb2.RunOptions()
-      run_options.experimental.collective_graph_key = 1
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   'Shape mismatch'):
-        sess.run([c0, c1], options=run_options)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      with self.session(
+          config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
+        with ops.device('/CPU:0'):
+          in0 = constant_op.constant(t0)
+          c0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
+        with ops.device('/CPU:1'):
+          in1 = constant_op.constant(t1)
+          c1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
+        run_options = config_pb2.RunOptions()
+        run_options.experimental.collective_graph_key = 1
+        with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                     'Shape mismatch'):
+          sess.run([c0, c1], options=run_options)
 
-  @test_util.run_deprecated_v1
   def testCollectiveGatherPolymorphicShape(self):
     t0 = [0, 1, 2, 3, 4, 5, 6, 7]
     t1 = [10, 11, 12, 13, 14, 15, 16, 17]
     group_size = 2
     group_key = 1
     instance_key = 123
-    with self.session(
-        config=config_pb2.ConfigProto(
-            device_count={'CPU': group_size})) as sess:
-      with ops.device('/CPU:0'):
-        in0 = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
-        c0 = collective_ops.all_gather(in0, group_size, group_key, instance_key)
-      with ops.device('/CPU:1'):
-        in1 = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
-        c1 = collective_ops.all_gather(in1, group_size, group_key, instance_key)
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      with self.session(
+          config=config_pb2.ConfigProto(
+              device_count={'CPU': group_size})) as sess:
+        with ops.device('/CPU:0'):
+          in0 = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
+          c0 = collective_ops.all_gather(in0, group_size, group_key,
+                                         instance_key)
+        with ops.device('/CPU:1'):
+          in1 = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
+          c1 = collective_ops.all_gather(in1, group_size, group_key,
+                                         instance_key)
 
-      results = sess.run([c0, c1], feed_dict={in0: t0, in1: t1})
-      expected_output = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17]
-      self.assertAllClose(results[0], expected_output, rtol=1e-5, atol=1e-5)
-      self.assertAllClose(results[1], expected_output, rtol=1e-5, atol=1e-5)
+        results = sess.run([c0, c1], feed_dict={in0: t0, in1: t1})
+        results_ = sess.run([c0, c1], feed_dict={in0: t0[1:], in1: t1[1:]})
 
-      results_ = sess.run([c0, c1], feed_dict={in0: t0[1:], in1: t1[1:]})
-      expected_output_ = [1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17]
-      self.assertAllClose(results_[0], expected_output_, rtol=1e-5, atol=1e-5)
-      self.assertAllClose(results_[1], expected_output_, rtol=1e-5, atol=1e-5)
+    expected_output = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17]
+    self.assertAllClose(results[0], expected_output, rtol=1e-5, atol=1e-5)
+    self.assertAllClose(results[1], expected_output, rtol=1e-5, atol=1e-5)
+
+    expected_output_ = [1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17]
+    self.assertAllClose(results_[0], expected_output_, rtol=1e-5, atol=1e-5)
+    self.assertAllClose(results_[1], expected_output_, rtol=1e-5, atol=1e-5)
 
   @test_util.run_v2_only
   def testCollectiveGroupSizeMismatch(self):
@@ -492,8 +514,17 @@ class CollectiveOpTest(test.TestCase):
                                  'but that group has size'):
       run_all_reduce()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v2_only
   def testCollectiveTensorsHaveNoDeviceSpecified(self):
+    context._reset_context()
+    cpus = config.list_physical_devices('CPU')
+    self.assertEqual(len(cpus), 1)
+    config.set_logical_device_configuration(cpus[0], [
+        context.LogicalDeviceConfiguration(),
+        context.LogicalDeviceConfiguration()
+    ])
+    context.ensure_initialized()
+
     group_size = 2
     group_key = 1
     instance_key = 1
@@ -517,20 +548,12 @@ class CollectiveOpTest(test.TestCase):
 
       return results
 
-    with self.session(config=config_pb2.ConfigProto(
-        device_count={'CPU': 2})) as sess:
-      with ops.device('/CPU:0'):
-        in0 = constant_op.constant(1)
-      with ops.device('/CPU:1'):
-        in1 = constant_op.constant(3)
-
-      result_op = fn([in0, in1])
-
-      run_options = config_pb2.RunOptions()
-      run_options.experimental.collective_graph_key = 1
-      result = sess.run(result_op, options=run_options)
-
-      self.assertAllClose(result, [2, 2])
+    with ops.device('/CPU:0'):
+      in0 = constant_op.constant(1)
+    with ops.device('/CPU:1'):
+      in1 = constant_op.constant(3)
+    result = fn([in0, in1])
+    self.assertAllClose(result, [2, 2])
 
   @test_util.run_v2_only
   def testCollectiveGroupSizeOne(self):
@@ -548,7 +571,6 @@ class CollectiveOpTest(test.TestCase):
         in_tensor, group_size, group_key, instance_key)
     self.assertAllEqual(in_value, gathered_tensor.numpy())
 
-  @test_util.run_deprecated_v1
   def testConstantWithScopedAllocator(self):
     group_size = 2
     group_key = 1
@@ -565,21 +587,25 @@ class CollectiveOpTest(test.TestCase):
     del rewrite_options.scoped_allocator_opts.enable_op[:]
     rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
 
-    with self.session(config=cfg) as sess:
-      run_ops = []
-      for i in range(group_size):
-        with ops.device('CPU:%d' % i):
-          constant = constant_op.constant(i + 1.)
-          input_tensor1 = array_ops.identity(constant)
-          input_tensor2 = array_ops.identity(constant)
-          reduced_tensor1 = collective_ops.all_reduce(
-              input_tensor1, group_size, group_key, instance_key1, 'Add', 'Id')
-          reduced_tensor2 = collective_ops.all_reduce(
-              input_tensor2, group_size, group_key, instance_key2, 'Add', 'Id')
-          run_ops.append(array_ops.identity(reduced_tensor1))
-          run_ops.append(array_ops.identity(reduced_tensor2))
-      results = sess.run(run_ops)
-      self.assertEqual(results, [3., 3., 3., 3.])
+    # Tests that execute collectives need to be enclosed in graph or tf.function
+    with ops.Graph().as_default():
+      with self.session(config=cfg) as sess:
+        run_ops = []
+        for i in range(group_size):
+          with ops.device('CPU:%d' % i):
+            constant = constant_op.constant(i + 1.)
+            input_tensor1 = array_ops.identity(constant)
+            input_tensor2 = array_ops.identity(constant)
+            reduced_tensor1 = collective_ops.all_reduce(
+                input_tensor1, group_size, group_key, instance_key1, 'Add',
+                'Id')
+            reduced_tensor2 = collective_ops.all_reduce(
+                input_tensor2, group_size, group_key, instance_key2, 'Add',
+                'Id')
+            run_ops.append(array_ops.identity(reduced_tensor1))
+            run_ops.append(array_ops.identity(reduced_tensor2))
+        results = sess.run(run_ops)
+    self.assertEqual(results, [3., 3., 3., 3.])
 
   @test_util.run_v2_only
   def testMultipleGroups(self):

From 72d30dfb8bc58be931604f853bd161a11b7c9fcc Mon Sep 17 00:00:00 2001
From: Haoyu Zhang 
Date: Fri, 19 Jun 2020 13:19:39 -0700
Subject: [PATCH 0652/1390] Raise error type corresponding to status code
 instead of generic RuntimeError.

PiperOrigin-RevId: 317367352
Change-Id: I35378b88a33269ac225632ae848398b819c694a1
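
For illustration only (not part of this patch): the Python side already exposes the
status-code-to-exception mapping that this change now also uses from the C++ tensor
paths, so failures surface as the specific tf.errors class instead of RuntimeError.
A minimal sketch, assuming the errors_impl helper named below:

    import tensorflow as tf
    from tensorflow.python.framework import errors_impl

    # Assumption: exception_type_from_error_code is the Python-side helper that
    # maps a TF status code to the matching tf.errors class.
    exc_cls = errors_impl.exception_type_from_error_code(
        tf.errors.INVALID_ARGUMENT)
    assert exc_cls is tf.errors.InvalidArgumentError
    print(exc_cls.__name__)  # InvalidArgumentError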
---
 tensorflow/python/eager/pywrap_tensor.cc | 32 ++++++++++++++++--------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 031545531f1..0789eab6270 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -180,6 +180,15 @@ int ConvertDeviceName(PyObject* obj, const char** dst) {
   return 1;
 }
 
+void RaiseExceptionTypeFromTFStatus(TF_Status* status) {
+  TF_Code code = TF_GetCode(status);
+  PyObject* exception = tensorflow::PyExceptionRegistry::Lookup(code);
+  PyErr_SetObject(exception,
+                  pybind11::make_tuple(pybind11::none(), pybind11::none(),
+                                       TF_Message(status))
+                      .ptr());
+}
+
 }  // namespace
 
 namespace tensorflow {
@@ -305,13 +314,7 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
                                                     device_name, status.get()));
     const TF_Code code = TF_GetCode(status.get());
     if (code != TF_OK) {
-      // Instead of raising a generic RuntimeError, raise an exception type
-      // based on the status error code.
-      PyObject* exception = PyExceptionRegistry::Lookup(code);
-      PyErr_SetObject(exception,
-                      pybind11::make_tuple(pybind11::none(), pybind11::none(),
-                                           TF_Message(status.get()))
-                          .ptr());
+      RaiseExceptionTypeFromTFStatus(status.get());
       return nullptr;
     }
   }
@@ -512,7 +515,9 @@ static PyObject* EagerTensor_datatype_enum(EagerTensor* self) {
 static PyObject* EagerTensor_shape_tuple(EagerTensor* self) {
   auto handle = self->handle;
   int n = TFE_TensorHandleNumDims(handle, &self->status);
-  if (MaybeRaiseExceptionFromTFStatus(&self->status, nullptr)) {
+  TF_Code code = TF_GetCode(&self->status);
+  if (code != TF_OK) {
+    RaiseExceptionTypeFromTFStatus(&self->status);
     // Cleanup self->status before returning.
     self->status.status = tensorflow::Status::OK();
     return nullptr;
@@ -522,13 +527,18 @@ static PyObject* EagerTensor_shape_tuple(EagerTensor* self) {
   for (int i = 0; i < n; ++i) {
     PyObject* dim =
         PyLong_FromLongLong(TFE_TensorHandleDim(handle, i, &self->status));
-    if (MaybeRaiseExceptionFromTFStatus(&self->status, nullptr) ||
-        dim == nullptr || PyTuple_SetItem(shape, i, dim) != 0) {
+    code = TF_GetCode(&self->status);
+    if (code != TF_OK || dim == nullptr ||
+        PyTuple_SetItem(shape, i, dim) != 0) {
+      if (code != TF_OK) {
+        RaiseExceptionTypeFromTFStatus(&self->status);
+      } else {
+        PyErr_SetString(PyExc_RuntimeError, "Error while creating shape");
+      }
       // Cleanup self->status before returning.
       self->status.status = tensorflow::Status::OK();
       Py_DECREF(shape);
       if (dim != nullptr) Py_DECREF(dim);
-      PyErr_SetString(PyExc_RuntimeError, "Error while creating shape");
       return nullptr;
     }
   }

From 4b3576c08171aee7edc80e929d2babdd899d67b7 Mon Sep 17 00:00:00 2001
From: Frank Chen 
Date: Fri, 19 Jun 2020 14:01:00 -0700
Subject: [PATCH 0653/1390] Change quick_exit() to exit() to fix Mac OS nightly
 build problems

PiperOrigin-RevId: 317375174
Change-Id: Ia0adc40f0d952cb478118fc07189ccd0f2d9a073
---
 tensorflow/core/tpu/kernels/tpu_compile_op_common.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
index 92d1fa1337e..79556cfa544 100644
--- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
+++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
@@ -353,7 +353,7 @@ Status TpuCompileOpKernelCommon::CompileTFFunctionToHlo(
     return;
   }
 
-  std::quick_exit(42);
+  std::exit(42);
 }
 
 /* static */ Status TpuCompileOpKernelCommon::GetDynamicShapes(

From 83fe1bad15e65e8db5e546d683d0ad591f19fad7 Mon Sep 17 00:00:00 2001
From: Andrew Audibert 
Date: Fri, 19 Jun 2020 14:02:55 -0700
Subject: [PATCH 0654/1390] [tf.data service] Add test that different workers
 use independent shuffle orders.

If shuffle seeds are unspecified, the shuffle order should be chosen non-deterministically on each worker.

PiperOrigin-RevId: 317375549
Change-Id: I35e32cfbbfb8558451a079875b495708347a23bf
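
For illustration only (not part of this patch): the new test recovers the per-pass
element order from the interleaved output of the two workers and requires that the two
orders differ. A minimal, self-contained sketch of that check on hypothetical data:

    # Hypothetical merged output of two passes over range(4), one per worker.
    output = [3, 0, 1, 3, 2, 0, 2, 1]

    first_order, second_order = {}, {}
    for element in output:
        if element in first_order:
            second_order[element] = len(second_order)  # second occurrence
        else:
            first_order[element] = len(first_order)    # first occurrence

    # Identical shuffle orders on both workers would make these equal.
    assert first_order != second_order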
---
 .../kernel_tests/data_service_ops_test.py     | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py
index 796ab328980..488bf97f184 100644
--- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.eager import def_function
 from tensorflow.python.framework import combinations
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
@@ -78,6 +79,28 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
     results = [elem.numpy() for elem in ds]
     self.assertEqual(list(range(num_elements)), results)
 
+  @combinations.generate(test_base.eager_only_combinations())
+  def testDifferentShuffleOrders(self):
+    random_seed.set_random_seed(None)
+    num_elements = 100
+    master_address = self.create_cluster(2)
+    ds = dataset_ops.Dataset.range(num_elements)
+    ds = ds.shuffle(num_elements)
+    ds = _make_distributed_dataset(ds, master_address)
+    output = [elem.numpy() for elem in ds]
+
+    # The output will be two sequences of range(num_elements)
+    # non-deterministically interleaved together. If the orders of the elements
+    # were the same, first_order and second_order computed below would be equal.
+    first_order = {}
+    second_order = {}
+    for element in output:
+      if element in first_order:
+        second_order[element] = len(second_order)
+      else:
+        first_order[element] = len(first_order)
+    self.assertNotEqual(first_order, second_order)
+
   @combinations.generate(test_base.eager_only_combinations())
   def testMultipleEpochs(self):
     num_elements = 3

From 94bf57d06c546df46b2af36cfa11dc62dd35aebb Mon Sep 17 00:00:00 2001
From: Sanjoy Das 
Date: Fri, 19 Jun 2020 14:07:16 -0700
Subject: [PATCH 0655/1390] Do not try to compile trivially dead branches in
 the Case tf2xla lowering

This is important for the upcoming DeviceIndex op which can be used to select
one of many implementations depending on the device, and some of them may not be
compilable by tf2xla.

PiperOrigin-RevId: 317376420
Change-Id: I6428df6f4da238e5d2bc3618d51c579e34454945
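
For illustration only (not part of this patch), mirroring the new case_test.py below:
with a compile-time-constant branch index, only the selected branch has to be lowered
to HLO, so an unselected branch may contain ops XLA cannot compile. A minimal sketch
using the public switch_case API (constants here are arbitrary):

    import tensorflow as tf

    @tf.function(experimental_compile=True)
    def pick():
      branch_index = tf.constant(0)  # known at compile time

      def taken():
        return tf.constant(17)

      def not_taken():
        # With a constant branch_index this branch is pruned before
        # compilation, so it could hold ops that XLA cannot lower.
        return tf.constant(31)

      return tf.switch_case(branch_index, branch_fns={0: taken, 1: not_taken})

    print(pick().numpy())  # 17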
---
 tensorflow/compiler/tests/BUILD               | 20 +++++
 tensorflow/compiler/tests/case_test.py        | 87 +++++++++++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 tensorflow/compiler/tf2xla/kernels/case_op.cc | 46 ++++++++--
 tensorflow/compiler/tf2xla/kernels/case_op.h  | 11 ++-
 5 files changed, 156 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/compiler/tests/case_test.py

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 034ec82de10..42353451408 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1453,6 +1453,26 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "case_test",
+    size = "small",
+    srcs = ["case_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    python_version = "PY3",
+    tags = [
+        "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
+    ],
+    use_xla_device = False,  # Uses tf.function(experimental_compile=True)
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "gather_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/case_test.py b/tensorflow/compiler/tests/case_test.py
new file mode 100644
index 00000000000..3b2dff537da
--- /dev/null
+++ b/tensorflow/compiler/tests/case_test.py
@@ -0,0 +1,87 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for while loops in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+
+
+class CaseTest(xla_test.XLATestCase):
+
+  def testCaseBasic(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test(branch_index):
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        return array_ops.constant(31)
+
+      def f3():
+        return array_ops.constant(-1)
+
+      return control_flow_ops.switch_case(
+          branch_index, branch_fns={
+              0: f1,
+              1: f2
+          }, default=f3)
+
+    with ops.device(self.device):
+      self.assertEqual(switch_case_test(array_ops.constant(0)).numpy(), 17)
+      self.assertEqual(switch_case_test(array_ops.constant(1)).numpy(), 31)
+      self.assertEqual(switch_case_test(array_ops.constant(2)).numpy(), -1)
+      self.assertEqual(switch_case_test(array_ops.constant(3)).numpy(), -1)
+
+  def testBranchIsPruned(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test():
+      branch_index = array_ops.constant(0)
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        # Some operations that XLA cannot compile.
+        image_ops.decode_image(io_ops.read_file('/tmp/bmp'))
+        return array_ops.constant(31)
+
+      # This tests that we do not try to compile all branches if the branch
+      # index in trivially constant.
+      return control_flow_ops.switch_case(
+          branch_index, branch_fns={
+              0: f1,
+              1: f2
+          }, default=f2)
+
+    with ops.device(self.device):
+      self.assertEqual(switch_case_test().numpy(), 17)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index bfdfe38305b..bdaeeafd295 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -316,6 +316,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc
index 1b15c09f7e3..fbd54f1ef39 100644
--- a/tensorflow/compiler/tf2xla/kernels/case_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc
@@ -21,13 +21,14 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
 XlaCaseOp::XlaCaseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branches_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &unpruned_branches_));
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_types_));
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_types_));
   if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &token_input_nodes_).ok()) {
@@ -41,12 +42,29 @@ XlaCaseOp::XlaCaseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
   }
 }
 
+std::pair<std::vector<NameAttrList>, xla::XlaOp>
+XlaCaseOp::GetPrunedBranchesAndIndex(XlaOpKernelContext* ctx) {
+  xla::Literal branch_index_literal;
+  bool branch_index_is_constant =
+      ctx->ConstantInput(0, &branch_index_literal).ok();
+
+  if (!branch_index_is_constant) {
+    return {unpruned_branches_, ctx->Input(0)};
+  }
+
+  int32 branch_index = branch_index_literal.Get<int32>({});
+  if (branch_index < 0 || branch_index >= unpruned_branches_.size()) {
+    branch_index = unpruned_branches_.size() - 1;
+  }
+
+  std::vector<NameAttrList> pruned_branch = {unpruned_branches_[branch_index]};
+  return {pruned_branch, xla::ZerosLike(ctx->Input(0))};
+}
+
 // TODO(b/35949885): There is duplication here with the handling of the
 // while_op/if_op. Refactor the common code out/rework.
 void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
-  xla::XlaBuilder* b = ctx->builder();
-  int num_branches = branches_.size();
-  OP_REQUIRES(ctx, num_branches >= 1,
+  OP_REQUIRES(ctx, !unpruned_branches_.empty(),
               errors::InvalidArgument("Must provide at least one case branch"));
   OP_REQUIRES(ctx, input_type(0) == DT_INT32,
               errors::InvalidArgument(
@@ -55,6 +73,18 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
               errors::InvalidArgument(
                   "branch_index argument must be scalar for XLA compilation"));
 
+  xla::XlaBuilder* b = ctx->builder();
+
+  // We opportunistically prune out branches if the branch index is a
+  // compile-time constant.  This is important in the context of the DeviceIndex
+  // ops (and other such ops that may come later) since we may have a Case with
+  // trivially unselected branches that cannot be compiled into HLO.
+  std::vector<NameAttrList> branches;
+  xla::XlaOp branch_index;
+  std::tie(branches, branch_index) = GetPrunedBranchesAndIndex(ctx);
+
+  int num_branches = branches.size();
+
   VLOG(1) << "Building Case: " << input_types_.size() << " inputs";
 
   std::vector<XlaCompiler::Argument> arguments(input_types_.size());
@@ -94,7 +124,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
     std::vector<const FunctionBody*> case_bodies(num_branches);
     for (int branch_idx = 0; branch_idx < num_branches; branch_idx++) {
       OP_REQUIRES_OK(ctx, FindMustBeConstNodes(
-                              ctx, branches_[branch_idx],
+                              ctx, branches[branch_idx],
                               &case_branch_must_be_const_nodes[branch_idx],
                               &case_bodies[branch_idx]));
     }
@@ -133,7 +163,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
   std::vector<XlaCompiler::CompilationResult*> branch_results_p(num_branches);
   for (int j = 0; j < num_branches; ++j) {
     OP_REQUIRES_OK(ctx,
-                   compiler->CompileFunction(options, branches_[j], arguments,
+                   compiler->CompileFunction(options, branches[j], arguments,
                                              &branch_results[j]));
     branch_results_p[j] = &branch_results[j];
   }
@@ -171,7 +201,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
     for (int j = 0; j < num_branches; ++j) {
       branch_results[j] = {};
       OP_REQUIRES_OK(ctx,
-                     compiler->CompileFunction(options, branches_[j], arguments,
+                     compiler->CompileFunction(options, branches[j], arguments,
                                                &branch_results[j]));
     }
   }
@@ -277,7 +307,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
   auto input_tuple = xla::Tuple(b, inputs);
 
   xla::XlaOp outputs =
-      xla::Conditional(ctx->Input(0), absl::MakeSpan(result_computations),
+      xla::Conditional(branch_index, absl::MakeSpan(result_computations),
                        std::vector<xla::XlaOp>(num_branches, input_tuple));
   // Sets non-variable outputs.
   for (int i = 0; i < output_types_.size(); ++i) {
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h
index 4a61707864e..4d22a3db830 100644
--- a/tensorflow/compiler/tf2xla/kernels/case_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.h
@@ -50,7 +50,16 @@ class XlaCaseOp : public XlaOpKernel {
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCaseOp);
 
-  std::vector<NameAttrList> branches_;
+  // If the branch_index input is a constant: prunes out all but the branch
+  // corresponding to that constant branch index, and returns that branch and
+  // the literal 0 (as the first and second component of the pair).
+  //
+  // If the branch_index input is not a constant: returns unpruned_branches_ and
+  // the branch_index input.
+  std::pair<std::vector<NameAttrList>, xla::XlaOp> GetPrunedBranchesAndIndex(
+      XlaOpKernelContext* ctx);
+
+  std::vector<NameAttrList> unpruned_branches_;
   DataTypeVector input_types_;
   DataTypeVector output_types_;
   bool has_token_input_output_;

From f60f6f0c1f68d729b2e501d5a0a668466acb7cda Mon Sep 17 00:00:00 2001
From: Michael Banfield 
Date: Fri, 19 Jun 2020 14:11:21 -0700
Subject: [PATCH 0656/1390] Support uploading only new data when a file in GCS
 Filesystem is Flush()'d multiple times.

This uses the GCS compose API to avoid reuploading the entire file.

Also add some VLOG statements for profiling GCS write paths.

PiperOrigin-RevId: 317377298
Change-Id: I3f36fd684c44070331ba1d9e6689efd0f74bfc0e
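
For illustration only (not part of this patch): the new behavior is opt-in through the
GCS_APPEND_MODE environment variable introduced below; with the value "compose", a
repeated Flush() uploads only the bytes written since the previous flush and composes
them onto the existing object. A minimal sketch (the bucket path is hypothetical, and
the variable must be set before the GCS filesystem is first used):

    import os
    os.environ["GCS_APPEND_MODE"] = "compose"  # read when the filesystem is created

    import tensorflow as tf

    with tf.io.gfile.GFile("gs://my-bucket/logs/run.txt", "w") as f:
      f.write("first chunk\n")
      f.flush()   # first sync uploads the whole file
      f.write("second chunk\n")
      f.flush()   # later syncs upload only the new bytes and compose them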
---
 .../core/platform/cloud/gcs_file_system.cc    |  150 +-
 .../core/platform/cloud/gcs_file_system.h     |    6 +-
 .../platform/cloud/gcs_file_system_test.cc    | 1620 ++++++++++-------
 3 files changed, 1065 insertions(+), 711 deletions(-)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 5d395f3d821..1bd4d86eef6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -14,14 +14,18 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
+
 #include 
 #include 
+
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+
+#include "tensorflow/core/platform/strcat.h"
 #ifdef _WIN32
 #include <io.h>  // for _mktemp
 #endif
@@ -128,6 +132,15 @@ constexpr char kAllowedBucketLocations[] = "GCS_ALLOWED_BUCKET_LOCATIONS";
 // is running in and restricts to buckets in that region.
 constexpr char kDetectZoneSentinelValue[] = "auto";
 
+// How to upload new data when Flush() is called multiple times.
+// By default the entire file is reuploaded.
+constexpr char kAppendMode[] = "GCS_APPEND_MODE";
+// If GCS_APPEND_MODE=compose, the new data is instead uploaded to a
+// temporary object and composed with the original object. This is disabled by
+// default because the multiple API calls required add a risk of stranding
+// temporary objects.
+constexpr char kComposeAppend[] = "compose";
+
 Status GetTmpFilename(string* filename) {
   *filename = io::GetTempFilename("");
   return Status::OK();
@@ -379,15 +392,18 @@ class GcsWritableFile : public WritableFile {
                   GcsFileSystem* filesystem,
                   GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
-                  RetryConfig retry_config)
+                  RetryConfig retry_config, bool compose_append)
       : bucket_(bucket),
         object_(object),
         filesystem_(filesystem),
         timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
-        retry_config_(retry_config) {
+        retry_config_(retry_config),
+        compose_append_(compose_append),
+        start_offset_(0) {
     // TODO: to make it safer, outfile_ should be constructed from an FD
+    VLOG(3) << "GcsWritableFile: " << GetGcsPath();
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
       outfile_.open(tmp_content_filename_,
                     std::ofstream::binary | std::ofstream::app);
@@ -403,14 +419,18 @@ class GcsWritableFile : public WritableFile {
                   GcsFileSystem* filesystem, const string& tmp_content_filename,
                   GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
-                  RetryConfig retry_config)
+                  RetryConfig retry_config, bool compose_append)
       : bucket_(bucket),
         object_(object),
         filesystem_(filesystem),
         timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
-        retry_config_(retry_config) {
+        retry_config_(retry_config),
+        compose_append_(compose_append),
+        start_offset_(0) {
+    VLOG(3) << "GcsWritableFile: " << GetGcsPath() << "with existing file "
+            << tmp_content_filename;
     tmp_content_filename_ = tmp_content_filename;
     outfile_.open(tmp_content_filename_,
                   std::ofstream::binary | std::ofstream::app);
@@ -423,6 +443,7 @@ class GcsWritableFile : public WritableFile {
 
   Status Append(StringPiece data) override {
     TF_RETURN_IF_ERROR(CheckWritable());
+    VLOG(3) << "Append: " << GetGcsPath() << " size " << data.length();
     sync_needed_ = true;
     outfile_ << data;
     if (!outfile_.good()) {
@@ -433,6 +454,7 @@ class GcsWritableFile : public WritableFile {
   }
 
   Status Close() override {
+    VLOG(3) << "Close:" << GetGcsPath();
     if (outfile_.is_open()) {
       Status sync_status = Sync();
       if (sync_status.ok()) {
@@ -443,18 +465,23 @@ class GcsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  Status Flush() override { return Sync(); }
+  Status Flush() override {
+    VLOG(3) << "Flush:" << GetGcsPath();
+    return Sync();
+  }
 
   Status Name(StringPiece* result) const override {
     return errors::Unimplemented("GCSWritableFile does not support Name()");
   }
 
   Status Sync() override {
+    VLOG(3) << "Sync started:" << GetGcsPath();
     TF_RETURN_IF_ERROR(CheckWritable());
     if (!sync_needed_) {
       return Status::OK();
     }
     Status status = SyncImpl();
+    VLOG(3) << "Sync finished " << GetGcsPath();
     if (status.ok()) {
       sync_needed_ = false;
     }
@@ -483,11 +510,26 @@ class GcsWritableFile : public WritableFile {
           "Could not write to the internal temporary file.");
     }
     string session_uri;
-    TF_RETURN_IF_ERROR(CreateNewUploadSession(&session_uri));
+    uint64 start_offset = 0;
+    string object_to_upload = object_;
+    bool should_compose = false;
+    if (compose_append_) {
+      start_offset = start_offset_;
+      // Only compose if the object has already been uploaded to GCS
+      should_compose = start_offset > 0;
+      if (should_compose) {
+        object_to_upload =
+            strings::StrCat(io::Dirname(object_), "/.tmpcompose/",
+                            io::Basename(object_), ".", start_offset_);
+      }
+    }
+    TF_RETURN_IF_ERROR(
+        CreateNewUploadSession(&session_uri, start_offset, object_to_upload));
     uint64 already_uploaded = 0;
     bool first_attempt = true;
     const Status upload_status = RetryingUtils::CallWithRetries(
-        [&first_attempt, &already_uploaded, &session_uri, this]() {
+        [&first_attempt, &already_uploaded, &session_uri, &start_offset,
+         this]() {
           if (!first_attempt) {
             bool completed;
             TF_RETURN_IF_ERROR(RequestUploadSessionStatus(
@@ -502,7 +544,7 @@ class GcsWritableFile : public WritableFile {
             }
           }
           first_attempt = false;
-          return UploadToSession(session_uri, already_uploaded);
+          return UploadToSession(session_uri, start_offset, already_uploaded);
         },
         retry_config_);
     if (upload_status.code() == errors::Code::NOT_FOUND) {
@@ -512,6 +554,12 @@ class GcsWritableFile : public WritableFile {
           strings::StrCat("Upload to gs://", bucket_, "/", object_,
                           " failed, caused by: ", upload_status.ToString()));
     }
+    if (upload_status.ok()) {
+      if (should_compose) {
+        TF_RETURN_IF_ERROR(AppendObject(object_to_upload));
+      }
+      TF_RETURN_IF_ERROR(GetCurrentFileSize(&start_offset_));
+    }
     return upload_status;
   }
 
@@ -534,7 +582,8 @@ class GcsWritableFile : public WritableFile {
   }
 
   /// Initiates a new resumable upload session.
-  Status CreateNewUploadSession(string* session_uri) {
+  Status CreateNewUploadSession(string* session_uri, uint64 start_offset,
+                                string object_to_upload) {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
@@ -542,10 +591,11 @@ class GcsWritableFile : public WritableFile {
     std::unique_ptr<HttpRequest> request;
     TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
 
-    request->SetUri(strings::StrCat(
-        kGcsUploadUriBase, "b/", bucket_,
-        "/o?uploadType=resumable&name=", request->EscapeString(object_)));
-    request->AddHeader("X-Upload-Content-Length", std::to_string(file_size));
+    request->SetUri(strings::StrCat(kGcsUploadUriBase, "b/", bucket_,
+                                    "/o?uploadType=resumable&name=",
+                                    request->EscapeString(object_to_upload)));
+    request->AddHeader("X-Upload-Content-Length",
+                       std::to_string(file_size - start_offset));
     request->SetPostEmptyBody();
     request->SetResultBuffer(&output_buffer);
     request->SetTimeouts(timeouts_->connect, timeouts_->idle,
@@ -561,6 +611,37 @@ class GcsWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  /// Appends the data of append_object to the original object and deletes
+  /// append_object.
+  Status AppendObject(string append_object) {
+    VLOG(3) << "AppendObject: " << GetGcsPathWithObject(append_object) << " to "
+            << GetGcsPath();
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
+
+    request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket_, "/o/",
+                                    request->EscapeString(object_),
+                                    "/compose"));
+
+    const string request_body =
+        strings::StrCat("{'sourceObjects': [{'name': '", object_,
+                        "'},{'name': '", append_object, "'}]}");
+    request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                         timeouts_->metadata);
+    request->AddHeader("content-type", "application/json");
+    request->SetPostFromBuffer(request_body.c_str(), request_body.size());
+    return RetryingUtils::CallWithRetries(
+        [&request, &append_object, this]() {
+          TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                          " when composing to ", GetGcsPath());
+          TF_RETURN_WITH_CONTEXT_IF_ERROR(
+              filesystem_->DeleteFile(GetGcsPathWithObject(append_object)),
+              " when cleaning up.");
+          return Status::OK();
+        },
+        retry_config_);
+  }
+
   /// \brief Requests status of a previously initiated upload session.
   ///
   /// If the upload has already succeeded, sets 'completed' to true.
@@ -628,7 +709,8 @@ class GcsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  Status UploadToSession(const string& session_uri, uint64 start_offset) {
+  Status UploadToSession(const string& session_uri, uint64 start_offset,
+                         uint64 already_uploaded) {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
@@ -637,13 +719,14 @@ class GcsWritableFile : public WritableFile {
     request->SetUri(session_uri);
     if (file_size > 0) {
       request->AddHeader("Content-Range",
-                         strings::StrCat("bytes ", start_offset, "-",
-                                         file_size - 1, "/", file_size));
+                         strings::StrCat("bytes ", already_uploaded, "-",
+                                         file_size - start_offset - 1, "/",
+                                         file_size - start_offset));
     }
     request->SetTimeouts(timeouts_->connect, timeouts_->idle, timeouts_->write);
 
-    TF_RETURN_IF_ERROR(
-        request->SetPutFromFile(tmp_content_filename_, start_offset));
+    TF_RETURN_IF_ERROR(request->SetPutFromFile(
+        tmp_content_filename_, start_offset + already_uploaded));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when uploading ",
                                     GetGcsPath());
     // Erase the file from the file cache on every successful write.
@@ -651,9 +734,10 @@ class GcsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  string GetGcsPath() const {
-    return strings::StrCat("gs://", bucket_, "/", object_);
+  string GetGcsPathWithObject(string object) const {
+    return strings::StrCat("gs://", bucket_, "/", object);
   }
+  string GetGcsPath() const { return GetGcsPathWithObject(object_); }
 
   string bucket_;
   string object_;
@@ -664,6 +748,8 @@ class GcsWritableFile : public WritableFile {
   std::function<void()> file_cache_erase_;
   bool sync_needed_;  // whether there is buffered data that needs to be synced
   RetryConfig retry_config_;
+  bool compose_append_;
+  uint64 start_offset_;
 };
 
 class GcsReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
@@ -849,6 +935,14 @@ GcsFileSystem::GcsFileSystem(bool make_default_cache) {
 
   GetEnvVar(kAllowedBucketLocations, SplitByCommaToLowercaseSet,
             &allowed_locations_);
+
+  StringPiece append_mode;
+  GetEnvVar(kAppendMode, StringPieceIdentity, &append_mode);
+  if (append_mode == kComposeAppend) {
+    compose_append_ = true;
+  } else {
+    compose_append_ = false;
+  }
 }
 
 GcsFileSystem::GcsFileSystem(
@@ -859,7 +953,8 @@ GcsFileSystem::GcsFileSystem(
     size_t stat_cache_max_entries, uint64 matching_paths_cache_max_age,
     size_t matching_paths_cache_max_entries, RetryConfig retry_config,
     TimeoutConfig timeouts, const std::unordered_set<string>& allowed_locations,
-    std::pair<const string, const string>* additional_header)
+    std::pair<const string, const string>* additional_header,
+    bool compose_append)
     : auth_provider_(std::move(auth_provider)),
       http_request_factory_(std::move(http_request_factory)),
       zone_provider_(std::move(zone_provider)),
@@ -872,6 +967,7 @@ GcsFileSystem::GcsFileSystem(
       bucket_location_cache_(new BucketLocationCache(
           kCacheNeverExpire, kBucketLocationCacheMaxEntries)),
       allowed_locations_(allowed_locations),
+      compose_append_(compose_append),
       timeouts_(timeouts),
       retry_config_(retry_config),
       additional_header_(additional_header) {}
@@ -1056,9 +1152,10 @@ Status GcsFileSystem::NewWritableFile(const string& fname,
                                       std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsWritableFile(bucket, object, this, &timeouts_,
-                                    [this, fname]() { ClearFileCaches(fname); },
-                                    retry_config_));
+  result->reset(new GcsWritableFile(
+      bucket, object, this, &timeouts_,
+      [this, fname]() { ClearFileCaches(fname); }, retry_config_,
+      compose_append_));
   return Status::OK();
 }
 
@@ -1098,7 +1195,8 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
       bucket, object, this, old_content_filename, &timeouts_,
-      [this, fname]() { ClearFileCaches(fname); }, retry_config_));
+      [this, fname]() { ClearFileCaches(fname); }, retry_config_,
+      compose_append_));
   return Status::OK();
 }
 
@@ -1629,6 +1727,7 @@ Status GcsFileSystem::RenameFile(const string& src, const string& target) {
 
 // Uses a GCS API command to copy the object and then deletes the old one.
 Status GcsFileSystem::RenameObject(const string& src, const string& target) {
+  VLOG(3) << "RenameObject: started gs://" << src << " to " << target;
   string src_bucket, src_object, target_bucket, target_object;
   TF_RETURN_IF_ERROR(ParseGcsPath(src, false, &src_bucket, &src_object));
   TF_RETURN_IF_ERROR(
@@ -1664,6 +1763,7 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
         "locations or storage classes is not supported.");
   }
 
+  VLOG(3) << "RenameObject: finished from: gs://" << src << " to " << target;
   // In case the delete API call failed, but the deletion actually happened
   // on the server side, we can't just retry the whole RenameFile operation
   // because the source object is already gone.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index d1d8aed54d4..f066cc31eb4 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -122,7 +122,8 @@ class GcsFileSystem : public FileSystem {
                 size_t matching_paths_cache_max_entries,
                 RetryConfig retry_config, TimeoutConfig timeouts,
                 const std::unordered_set<string>& allowed_locations,
-                std::pair<const string, const string>* additional_header);
+                std::pair<const string, const string>* additional_header,
+                bool compose_append);
 
   Status NewRandomAccessFile(
       const string& fname, std::unique_ptr<RandomAccessFile>* result) override;
@@ -187,6 +188,8 @@ class GcsFileSystem : public FileSystem {
   std::unordered_set<string> allowed_locations() const {
     return allowed_locations_;
   }
+
+  bool compose_append() const { return compose_append_; }
   string additional_header_name() const {
     return additional_header_ ? additional_header_->first : "";
   }
@@ -373,6 +376,7 @@ class GcsFileSystem : public FileSystem {
   using BucketLocationCache = ExpiringLRUCache<string>;
   std::unique_ptr<BucketLocationCache> bucket_location_cache_;
   std::unordered_set<string> allowed_locations_;
+  bool compose_append_;
 
   TimeoutConfig timeouts_;
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 544ddc32043..6892bd7cc26 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/str_util.h"
+#include "tensorflow/core/platform/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 // Undef DeleteFile macro defined in wndows.h.
@@ -73,16 +74,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
            "Range: 6-11\n"
            "Timeouts: 5 1 20\n",
            "6789")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -129,7 +130,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -177,7 +178,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_Errors) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -224,7 +225,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_ReadAtEOF) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -265,7 +266,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_CachedOutOfRange) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -316,7 +317,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_CachedNotSequential) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -357,7 +358,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_Growing) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -404,7 +405,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_Buffered_ReadBackwards) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -437,16 +438,16 @@ TEST(GcsFileSystemTest,
             "location":"US-EAST1"
           })")});
 
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsAuto,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsAuto,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -479,16 +480,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithLocationConstraintCaching) {
             "location":"US-EAST1"
           })")});
 
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsAuto,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsAuto,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
 
@@ -520,16 +521,16 @@ TEST(GcsFileSystemTest,
             "location":"BARFOO"
           })")});
 
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsAuto,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsAuto,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(tensorflow::errors::FailedPrecondition(
@@ -552,16 +553,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_DifferentN) {
            "Range: 3-12\n"
            "Timeouts: 5 1 20\n",
            "3456789")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -621,7 +622,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   char scratch[100];
   StringPiece result;
@@ -710,7 +711,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   char scratch[100];
   StringPiece result;
@@ -750,17 +751,17 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
                            "Range: 8-15\n"
                            "Timeouts: 5 1 20\n",
                            "89abcdef")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   8 /* block size */, 16 /* max bytes */,
-                   3600 /* max staleness */, 3600 /* stat cache max age */,
-                   0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 8 /* block size */,
+      16 /* max bytes */, 3600 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
   char scratch[100];
   StringPiece result;
   // There should only be two HTTP requests issued to GCS even though we iterate
@@ -830,7 +831,7 @@ TEST(GcsFileSystemTest,
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -849,17 +850,17 @@ TEST(GcsFileSystemTest,
 
 TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* read ahead bytes */, 0 /* max bytes */,
-                   0 /* max staleness */, 0 /* stat cache max age */,
-                   0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
+      0 /* read ahead bytes */, 0 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -883,16 +884,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
            "012")});
 
   // Set stat_cache_max_age to 1000s so that StatCache could work.
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   1e3 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 1e3 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Stat the file first so that the file stats are cached.
   FileStatistics stat;
@@ -959,7 +960,7 @@ TEST(GcsFileSystemTest, NewWritableFile) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Read from the file first, to fill the block cache.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -1042,16 +1043,16 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                            "Timeouts: 5 1 30\n"
                            "Put body: t2\n",
                            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -1112,17 +1113,17 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
            "Range: 0-7\n"
            "Timeouts: 5 1 20\n",
            "01234567")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   8 /* block size */, 8 /* max bytes */,
-                   3600 /* max staleness */, 3600 /* stat cache max age */,
-                   0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 8 /* block size */,
+      8 /* max bytes */, 3600 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
   // Pull the file's first block into the cache. This will trigger the first
   // HTTP request to GCS.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -1208,7 +1209,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */,
       RetryConfig(2 /* .init_delay_time_us */), kTestTimeoutConfig,
-      *kAllowedLocationsDefault, nullptr /* gcs additional header */);
+      *kAllowedLocationsDefault, nullptr /* gcs additional header */,
+      false /* compose append */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -1262,16 +1264,16 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
                            "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   {
     std::unique_ptr<WritableFile> file;
@@ -1302,16 +1304,16 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
 
 TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1375,7 +1377,7 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Create an appendable file. This should read the file from GCS, and pull its
   // contents into the block cache.
@@ -1401,16 +1403,16 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
 
 TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1435,16 +1437,16 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                            "Range: 0-",
                            content.size() - 1, "\n", "Timeouts: 5 1 20\n"),
            content)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile(
@@ -1456,16 +1458,16 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
 
 TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1480,16 +1482,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/file1.txt"));
 }
@@ -1510,16 +1512,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/subfolder"));
 }
@@ -1536,16 +1538,16 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket1"));
   TF_EXPECT_OK(fs.FileExists("gs://bucket1/"));
@@ -1566,16 +1568,16 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"items\": []}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(errors::Code::NOT_FOUND,
             fs.FileExists("gs://bucket/path/file1.txt").code());
@@ -1593,16 +1595,16 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.FileExists("gs://bucket2/").code());
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1641,7 +1643,7 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // The stat cache will ensure that repeated lookups don't trigger additional
   // HTTP requests.
@@ -1668,7 +1670,7 @@ TEST(GcsFileSystemTest, FileExists_DirectoryMark) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/dir/"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/dir/"));
@@ -1682,16 +1684,16 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1710,16 +1712,16 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1739,16 +1741,16 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1767,16 +1769,16 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
       "\"prefixes\": [\"path/subpath/\"]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1792,16 +1794,16 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket-a-b-c", &children));
@@ -1817,16 +1819,16 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -1858,16 +1860,16 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "  { \"name\": \"path/file4.txt\" },"
            "  { \"name\": \"path/file5.txt\" }]}")});
 
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1885,16 +1887,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
       "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/subpath/file2.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(
@@ -1913,16 +1915,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/*/*", &result));
@@ -1942,16 +1944,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file2.txt", &result));
@@ -1968,16 +1970,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -1993,16 +1995,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SlashInObjectName) {
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path//foo.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -2018,16 +2020,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SlashInObjectNameEscaped) {
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path//foo.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/\\/*", &result));
@@ -2044,16 +2046,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file3.txt", &result));
@@ -2062,16 +2064,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
 
 TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::vector<string> result;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -2096,16 +2098,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/file2.txt\" },"
            "  { \"name\": \"path/file3.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   3600 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 3600 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Repeated calls to fs.GetMatchingPaths on these patterns should not lead to
   // any additional HTTP requests to GCS.
@@ -2139,16 +2141,16 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache_Flush) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subpath/file2.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   3600 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 3600 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // This loop should trigger the first HTTP request to GCS.
   for (int i = 0; i < 10; i++) {
@@ -2212,7 +2214,7 @@ TEST(GcsFileSystemTest, DeleteFile) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Do an initial read of the file to load its contents into the block cache.
   char scratch[100];
@@ -2231,16 +2233,16 @@ TEST(GcsFileSystemTest, DeleteFile) {
 
 TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.DeleteFile("gs://bucket/").code());
@@ -2283,7 +2285,7 @@ TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Stats the file first so the stat is cached.
   FileStatistics stat_before_deletion;
@@ -2304,16 +2306,16 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -2333,16 +2335,16 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
                            "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -2353,16 +2355,16 @@ TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
       "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket"));
 }
@@ -2375,16 +2377,16 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
       "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.DeleteDir("gs://bucket/path/").code());
@@ -2398,16 +2400,16 @@ TEST(GcsFileSystemTest, GetFileSize) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://bucket/file.txt", &size));
@@ -2416,16 +2418,16 @@ TEST(GcsFileSystemTest, GetFileSize) {
 
 TEST(GcsFileSystemTest, GetFileSize_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   uint64 size;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -2502,16 +2504,16 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.RenameFile("gs://bucket/path1", "gs://bucket/path2/"));
 }
@@ -2603,7 +2605,7 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
   // Do an initial read of the source and destination files to load their
   // contents into the block cache.
   char scratch[100];
@@ -2684,7 +2686,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_FlushTargetStatCache) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
   // Do an initial stat of the destination file to load their contents into the
   // stat cache.
   FileStatistics stat_before_renaming;
@@ -2742,16 +2744,16 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
@@ -2784,16 +2786,16 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Post: yes\n"
            "Timeouts: 5 1 10\n",
            "{\"done\": false}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(
       errors::Code::UNIMPLEMENTED,
@@ -2809,16 +2811,16 @@ TEST(GcsFileSystemTest, Stat_Object) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
@@ -2843,16 +2845,16 @@ TEST(GcsFileSystemTest, Stat_Folder) {
            "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/subfolder", &stat));
@@ -2876,16 +2878,16 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/path", &stat).code());
@@ -2897,16 +2899,16 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/", &stat));
@@ -2921,16 +2923,16 @@ TEST(GcsFileSystemTest, Stat_BucketNotFound) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/", &stat).code());
@@ -2968,7 +2970,7 @@ TEST(GcsFileSystemTest, Stat_Cache) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
 
   // Repeated calls to fs.Stat on these paths should not lead to any additional
   // HTTP requests to GCS.
@@ -3010,7 +3012,7 @@ TEST(GcsFileSystemTest, Stat_Cache_Flush) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      nullptr /* gcs additional header */);
+      nullptr /* gcs additional header */, false /* compose append */);
   // There should be a single HTTP request to GCS for fs.Stat in this loop.
   for (int i = 0; i < 10; i++) {
     FileStatistics stat;
@@ -3038,16 +3040,16 @@ TEST(GcsFileSystemTest, Stat_FilenameEndingWithSlash) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"5\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/dir/", &stat));
@@ -3070,16 +3072,16 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(error::Code::NOT_FOUND,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -3101,16 +3103,16 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
            "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -3132,16 +3134,16 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder/"));
@@ -3159,16 +3161,16 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/"));
@@ -3180,16 +3182,16 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   EXPECT_EQ(error::Code::NOT_FOUND, fs.IsDirectory("gs://bucket/").code());
 }
@@ -3222,16 +3224,16 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
            "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
   EXPECT_EQ(errors::AlreadyExists("gs://bucket/subpath/"),
@@ -3250,16 +3252,16 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket"));
@@ -3322,16 +3324,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
                            "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -3415,16 +3417,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
 
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -3450,16 +3452,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   int64 undeleted_files, undeleted_dirs;
   EXPECT_EQ(error::Code::NOT_FOUND,
@@ -3543,7 +3545,7 @@ TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
       0 /* stat cache max entries */, 0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, kTestRetryConfig,
       kTestTimeoutConfig, *kAllowedLocationsDefault,
-      add_header /* gcs additional header */);
+      add_header /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<HttpRequest> request;
   TF_EXPECT_OK(fs7.CreateHttpRequest(&request));
@@ -3613,16 +3615,16 @@ TEST(GcsFileSystemTest, CreateHttpRequest) {
                            "Auth Token: fake_token\n"
                            "Header Hello: world\n",
                            "{}")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   std::unique_ptr<HttpRequest> request;
   TF_EXPECT_OK(fs.CreateHttpRequest(&request));
@@ -3676,16 +3678,16 @@ TEST(GcsFileSystemTest, Stat_StatsRecording) {
       "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TestGcsStats stats;
   fs.SetStats(&stats);
@@ -3703,16 +3705,16 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
       "Range: 0-5\n"
       "Timeouts: 5 1 20\n",
       "012345")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   std::unique_ptr<ZoneProvider>(new FakeZoneProvider),
-                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */, kTestRetryConfig,
-                   kTestTimeoutConfig, *kAllowedLocationsDefault,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 0 /* block size */,
+      0 /* max bytes */, 0 /* max staleness */, 0 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
 
   TestGcsStats stats;
   fs.SetStats(&stats);
@@ -3732,5 +3734,253 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
   EXPECT_EQ(6, stats.block_retrieved_bytes_transferred_);
 }
 
+TEST(GcsFileSystemTest, NewAppendableFile_MultipleFlushesWithCompose) {
+  std::vector<string> contents(
+      {"content0,", "content1,", "content2,", "content3,"});
+  std::vector<HttpRequest*> requests({
+      // Fetch the file (stats and then content)
+      new FakeHttpRequest(
+          "Uri: "
+          "https://www.googleapis.com/storage/v1/b/bucket/o/"
+          "some%2Fpath%2Fappendable?fields=size%2Cgeneration%2Cupdated\n"
+          "Auth Token: fake_token\n"
+          "Timeouts: 5 1 10\n",
+          strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                          "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+      new FakeHttpRequest(
+          "Uri: "
+          "https://storage.googleapis.com/bucket/some%2Fpath%2Fappendable\n"
+          "Auth Token: fake_token\n"
+          "Range: 0-1048575\n"
+          "Timeouts: 5 1 20\n",
+          contents[0]),
+      // Upload entire file
+      new FakeHttpRequest(
+          "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
+          "uploadType=resumable&name=some%2Fpath%2Fappendable\n"
+          "Auth Token: fake_token\n"
+          "Header X-Upload-Content-Length: 18\n"
+          "Post: yes\n"
+          "Timeouts: 5 1 10\n",
+          "", {{"Location", "https://custom/upload/location"}}),
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-17/18\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[0], contents[1], "\n"),
+          ""),
+      // Upload new part to a temporary object
+      new FakeHttpRequest(
+          "Uri: "
+          "https://www.googleapis.com/upload/storage/v1/b/bucket/"
+          "o?uploadType=resumable&name=some%2Fpath%2F.tmpcompose%2Fappendable."
+          "18\n"
+          "Auth Token: fake_token\n"
+          "Header X-Upload-Content-Length: 9\n"
+          "Post: yes\n"
+          "Timeouts: 5 1 10\n",
+          "",
+          {{"Location",
+            "https://custom/upload/"
+            "location"}}),
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-8/9\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[2], "\n"),
+          ""),
+      // Compose the new part at the end of the original object.
+      new FakeHttpRequest("Uri: "
+                          "https://www.googleapis.com/storage/v1/b/bucket/o/"
+                          "some%2Fpath%2Fappendable/compose\n"
+                          "Auth Token: fake_token\n"
+                          "Timeouts: 5 1 10\n"
+                          "Header content-type: application/json\n"
+                          "Post body: {'sourceObjects': [{'name': "
+                          "'some/path/appendable'},{'name': "
+                          "'some/path/.tmpcompose/appendable.18'}]}\n",
+                          ""),
+      // Delete the temporary object.
+      new FakeHttpRequest("Uri: "
+                          "https://www.googleapis.com/storage/v1/b/bucket/o/"
+                          "some%2Fpath%2F.tmpcompose%2Fappendable.18\n"
+                          "Auth Token: fake_token\n"
+                          "Timeouts: 5 1 10\n"
+                          "Delete: yes\n",
+                          ""),
+      new FakeHttpRequest(
+          "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
+          "uploadType=resumable&name=some%2Fpath%2F.tmpcompose%2Fappendable."
+          "27\n"
+          "Auth Token: fake_token\n"
+          "Header X-Upload-Content-Length: 9\n"
+          "Post: yes\n"
+          "Timeouts: 5 1 10\n",
+          "", {{"Location", "https://custom/upload/location"}}),
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-8/9\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[3], "\n"),
+          ""),
+      new FakeHttpRequest("Uri: "
+                          "https://www.googleapis.com/storage/v1/b/bucket/o/"
+                          "some%2Fpath%2Fappendable/compose\n"
+                          "Auth Token: fake_token\n"
+                          "Timeouts: 5 1 10\n"
+                          "Header content-type: application/json\n"
+                          "Post body: {'sourceObjects': [{'name': "
+                          "'some/path/appendable'},{'name': "
+                          "'some/path/.tmpcompose/appendable.27'}]}\n",
+                          ""),
+      new FakeHttpRequest("Uri: "
+                          "https://www.googleapis.com/storage/v1/b/bucket/o/"
+                          "some%2Fpath%2F.tmpcompose%2Fappendable."
+                          "27\n"
+                          "Auth Token: fake_token\n"
+                          "Timeouts: 5 1 10\n"
+                          "Delete: yes\n",
+                          ""),
+  });
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 32 /* block size */,
+      32 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, true /* compose append */);
+
+  // Create an appendable file. This should read the file from GCS, and pull its
+  // contents into the block cache.
+  std::unique_ptr<WritableFile> wfile;
+  TF_EXPECT_OK(
+      fs.NewAppendableFile("gs://bucket/some/path/appendable", &wfile));
+  TF_EXPECT_OK(wfile->Append(contents[1]));
+  TF_EXPECT_OK(wfile->Flush());
+  TF_EXPECT_OK(wfile->Append(contents[2]));
+  TF_EXPECT_OK(wfile->Flush());
+  TF_EXPECT_OK(wfile->Append(contents[3]));
+  TF_EXPECT_OK(wfile->Close());
+}
+
+TEST(GcsFileSystemTest, NewAppendableFile_MultipleFlushesWithoutCompose) {
+  std::vector<string> contents(
+      {"content0,", "content1,", "content2,", "content3,"});
+  std::vector<HttpRequest*> requests({
+      new FakeHttpRequest(
+          "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+          "path%2Fappendable?fields=size%2Cgeneration%2Cupdated\n"
+          "Auth Token: fake_token\n"
+          "Timeouts: 5 1 10\n",
+          strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                          "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+      new FakeHttpRequest(
+          "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
+          "Auth Token: fake_token\n"
+          "Range: 0-1048575\n"
+          "Timeouts: 5 1 20\n",
+          contents[0]),
+      new FakeHttpRequest(
+          "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
+          "uploadType=resumable&name=path%2Fappendable\n"
+          "Auth Token: fake_token\n"
+          "Header X-Upload-Content-Length: 18\n"
+          "Post: yes\n"
+          "Timeouts: 5 1 10\n",
+          "", {{"Location", "https://custom/upload/location"}}),
+      // Uploads entire file.
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-17/18\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[0], contents[1], "\n"),
+          ""),
+      new FakeHttpRequest("Uri: "
+                          "https://www.googleapis.com/upload/storage/v1/b/"
+                          "bucket/o?"
+                          "uploadType=resumable&name=path%2Fappendable\n"
+                          "Auth Token: fake_token\n"
+                          "Header X-Upload-Content-Length: 27\n"
+                          "Post: yes\n"
+                          "Timeouts: 5 1 10\n",
+                          "",
+                          {{"Location",
+                            "https://custom/upload/"
+                            "location"}}),
+      // Uploads entire file again.
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-26/27\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[0], contents[1], contents[2], "\n"),
+          ""),
+      new FakeHttpRequest(
+          "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
+          "uploadType=resumable&name=path%2Fappendable\n"
+          "Auth Token: fake_token\n"
+          "Header X-Upload-Content-Length: 36\n"
+          "Post: yes\n"
+          "Timeouts: 5 1 10\n",
+          "", {{"Location", "https://custom/upload/location"}}),
+      // Uploads entire file again.
+      new FakeHttpRequest(
+          strings::StrCat("Uri: https://custom/upload/location\n"
+                          "Auth Token: fake_token\n"
+                          "Header Content-Range: bytes 0-35/36\n"
+                          "Timeouts: 5 1 30\n"
+                          "Put body: ",
+                          contents[0], contents[1], contents[2], contents[3],
+                          "\n"),
+          ""),
+  });
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      std::unique_ptr<ZoneProvider>(new FakeZoneProvider), 32 /* block size */,
+      32 /* max bytes */, 0 /* max staleness */, 3600 /* stat cache max age */,
+      0 /* stat cache max entries */, 0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, kTestRetryConfig,
+      kTestTimeoutConfig, *kAllowedLocationsDefault,
+      nullptr /* gcs additional header */, false /* compose append */);
+
+  // Create an appendable file. This should read the file from GCS, and pull its
+  // contents into the block cache.
+  std::unique_ptr<WritableFile> wfile;
+  TF_EXPECT_OK(fs.NewAppendableFile("gs://bucket/path/appendable", &wfile));
+  TF_EXPECT_OK(wfile->Append(contents[1]));
+  TF_EXPECT_OK(wfile->Flush());
+  TF_EXPECT_OK(wfile->Append(contents[2]));
+  TF_EXPECT_OK(wfile->Flush());
+  TF_EXPECT_OK(wfile->Append(contents[3]));
+  TF_EXPECT_OK(wfile->Close());
+}
+
+TEST(GcsFileSystemTest, AppendModeCompose) {
+  unsetenv("GCS_APPEND_MODE");
+  setenv("GCS_APPEND_MODE", "compose", 1);
+  GcsFileSystem fs1;
+  EXPECT_EQ(true, fs1.compose_append());
+}
+
+TEST(GcsFileSystemTest, AppendModeDefault) {
+  unsetenv("GCS_APPEND_MODE");
+  GcsFileSystem fs1;
+  EXPECT_EQ(false, fs1.compose_append());
+}
+
 }  // namespace
 }  // namespace tensorflow

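The two NewAppendableFile tests above exercise the two append strategies: with compose append enabled, each Flush() uploads only the newly written bytes to a temporary .tmpcompose object and then issues a server-side compose followed by a delete, while with it disabled each Flush() re-uploads the whole accumulated file. Below is a minimal sketch of the compose path only; GcsClient and its three methods are hypothetical stand-ins for the HTTP requests mocked above, not the real GcsFileSystem API.

    // A sketch only; assumes hypothetical helper names, not GcsFileSystem internals.
    #include <iostream>
    #include <string>

    struct GcsClient {
      // Stand-ins for the resumable upload, compose, and delete HTTP requests.
      void UploadObject(const std::string& name, const std::string& data) {
        std::cout << "PUT " << name << " (" << data.size() << " bytes)\n";
      }
      void ComposeObjects(const std::string& dst, const std::string& first,
                          const std::string& second) {
        std::cout << "POST " << dst << "/compose [" << first << ", " << second << "]\n";
      }
      void DeleteObject(const std::string& name) {
        std::cout << "DELETE " << name << "\n";
      }
    };

    // Append `chunk` to `object` without re-uploading the bytes already stored.
    void ComposeAppend(GcsClient& gcs, const std::string& object,
                       const std::string& chunk, int part_id) {
      const auto slash = object.find_last_of('/');
      const std::string dir =
          slash == std::string::npos ? "" : object.substr(0, slash + 1);
      const std::string base =
          slash == std::string::npos ? object : object.substr(slash + 1);
      const std::string tmp =
          dir + ".tmpcompose/" + base + "." + std::to_string(part_id);
      gcs.UploadObject(tmp, chunk);             // 1. upload only the new bytes
      gcs.ComposeObjects(object, object, tmp);  // 2. object + part -> object
      gcs.DeleteObject(tmp);                    // 3. drop the temporary part
    }

    int main() {
      GcsClient gcs;
      // Mirrors the first flush in the test: appending "content1," (9 bytes).
      ComposeAppend(gcs, "some/path/appendable", "content1,", 18);
    }

The last two tests show how the mode is selected: setting the GCS_APPEND_MODE environment variable to "compose" turns the behavior on, and it stays off by default.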
From 030e65acd56b55084a5c8a9f7e5ed3db3dc63093 Mon Sep 17 00:00:00 2001
From: Robert David 
Date: Fri, 19 Jun 2020 14:17:36 -0700
Subject: [PATCH 0657/1390] Move GetTensorData calls directly to LstmStep call
 in the int8x8_8 version.

PiperOrigin-RevId: 317378494
Change-Id: I880de60ee5c963468b2eb47d8cca7a3b9765f57c
---
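For context, the hunks below drop a block of locally cached weight, bias, and state pointers and instead call GetTensorData<T>() directly in the LstmStepInteger argument list. A minimal sketch of that pattern follows; Tensor, GetTensorData, and LstmStep here are simplified stand-ins, not the real TfLite declarations.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Tensor { std::vector<int8_t> data; };

    template <typename T>
    const T* GetTensorData(const Tensor* t) { return t->data.data(); }

    void LstmStep(const int8_t* input_to_input, const int8_t* recurrent_to_input) {
      std::printf("step(%p, %p)\n", static_cast<const void*>(input_to_input),
                  static_cast<const void*>(recurrent_to_input));
    }

    // Before: every tensor is unpacked into a named local pointer first.
    void EvalCached(const Tensor* input_to_input, const Tensor* recurrent_to_input) {
      const int8_t* input_to_input_ptr = GetTensorData<int8_t>(input_to_input);
      const int8_t* recurrent_to_input_ptr =
          GetTensorData<int8_t>(recurrent_to_input);
      LstmStep(input_to_input_ptr, recurrent_to_input_ptr);
    }

    // After: the pointer is fetched at the call site and the locals disappear.
    void EvalInline(const Tensor* input_to_input, const Tensor* recurrent_to_input) {
      LstmStep(GetTensorData<int8_t>(input_to_input),
               GetTensorData<int8_t>(recurrent_to_input));
    }

    int main() {
      Tensor a{{1, 2, 3}}, b{{4, 5, 6}};
      EvalCached(&a, &b);
      EvalInline(&a, &b);
    }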
 tensorflow/lite/kernels/lstm_eval.cc | 102 ++++++++-------------------
 1 file changed, 31 insertions(+), 71 deletions(-)

diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index f38fdc95f3e..b2f3d77912b 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -2091,50 +2091,6 @@ TfLiteStatus EvalInteger8x8_8(
   const int n_cell = input_to_output_weights->dims->data[0];
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
-  // Weights and states.
-  const int8_t* input_to_input_weight_ptr =
-      GetTensorData<int8_t>(input_to_input_weights);
-  const int8_t* recurrent_to_input_weight_ptr =
-      GetTensorData<int8_t>(recurrent_to_input_weights);
-  const int8_t* cell_to_input_weight_ptr =
-      GetTensorData<int8_t>(cell_to_input_weights);
-  const int8_t* input_to_forget_weight_ptr =
-      GetTensorData<int8_t>(input_to_forget_weights);
-  const int8_t* recurrent_to_forget_weight_ptr =
-      GetTensorData<int8_t>(recurrent_to_forget_weights);
-  const int8_t* cell_to_forget_weight_ptr =
-      GetTensorData<int8_t>(cell_to_forget_weights);
-  const int8_t* input_to_cell_weight_ptr =
-      GetTensorData<int8_t>(input_to_cell_weights);
-  const int8_t* recurrent_to_cell_weight_ptr =
-      GetTensorData<int8_t>(recurrent_to_cell_weights);
-  const int8_t* input_to_output_weight_ptr =
-      GetTensorData<int8_t>(input_to_output_weights);
-  const int8_t* recurrent_to_output_weight_ptr =
-      GetTensorData<int8_t>(recurrent_to_output_weights);
-  const int8_t* cell_to_output_weight_ptr =
-      GetTensorData<int8_t>(cell_to_output_weights);
-  const int8_t* projection_weight_ptr =
-      GetTensorData<int8_t>(projection_weights);
-  const int16_t* layer_norm_input_weight_ptr =
-      GetTensorData<int16_t>(input_layer_norm_coefficients);
-  const int16_t* layer_norm_forget_weight_ptr =
-      GetTensorData<int16_t>(forget_layer_norm_coefficients);
-  const int16_t* layer_norm_cell_weight_ptr =
-      GetTensorData<int16_t>(cell_layer_norm_coefficients);
-  const int16_t* layer_norm_output_weight_ptr =
-      GetTensorData<int16_t>(output_layer_norm_coefficients);
-  const int32_t* input_gate_bias_ptr = GetTensorData<int32_t>(input_gate_bias);
-  const int32_t* forget_gate_bias_ptr =
-      GetTensorData<int32_t>(forget_gate_bias);
-  const int32_t* cell_gate_bias_ptr = GetTensorData<int32_t>(cell_gate_bias);
-  const int32_t* output_gate_bias_ptr =
-      GetTensorData<int32_t>(output_gate_bias);
-  const int32_t* projection_bias_ptr = GetTensorData<int32_t>(projection_bias);
-  int16_t* cell_ptr = GetTensorData<int16_t>(cell_state);
-  int8_t* output_state_ptr = GetTensorData<int8_t>(output_state);
-  int8_t* output_ptr = nullptr;
-
   const int32_t input_zp = input->params.zero_point;
   const int32_t output_state_zp = output_state->params.zero_point;
 
@@ -2146,89 +2102,93 @@ TfLiteStatus EvalInteger8x8_8(
 
   for (int t = 0; t < max_time; t++) {
     const int t_rel = t;
-    output_ptr = output->data.int8 + t_rel * output_step;
-
+    int8_t* output_ptr = GetTensorData<int8_t>(output) + t_rel * output_step;
     // Input can be int8 asymmetric or int16 symmetric.
-    const int8_t* input_ptr = input->data.int8 + t_rel * input_step;
+    const int8_t* input_ptr = GetTensorData<int8_t>(input) + t_rel * input_step;
     lstm_eval::LstmStepInteger(
         input_ptr, input_zp,
 
-        input_to_input_weight_ptr,
+        GetTensorData<int8_t>(input_to_input_weights),
         integer_lstm_param->effective_input_to_input_scale_a,
         integer_lstm_param->effective_input_to_input_scale_b,
 
-        input_to_forget_weight_ptr,
+        GetTensorData<int8_t>(input_to_forget_weights),
         integer_lstm_param->effective_input_to_forget_scale_a,
         integer_lstm_param->effective_input_to_forget_scale_b,
 
-        input_to_cell_weight_ptr,
+        GetTensorData<int8_t>(input_to_cell_weights),
         integer_lstm_param->effective_input_to_cell_scale_a,
         integer_lstm_param->effective_input_to_cell_scale_b,
 
-        input_to_output_weight_ptr,
+        GetTensorData<int8_t>(input_to_output_weights),
         integer_lstm_param->effective_input_to_output_scale_a,
         integer_lstm_param->effective_input_to_output_scale_b,
 
-        recurrent_to_input_weight_ptr,
+        GetTensorData<int8_t>(recurrent_to_input_weights),
         integer_lstm_param->effective_recurrent_to_input_scale_a,
         integer_lstm_param->effective_recurrent_to_input_scale_b,
 
-        recurrent_to_forget_weight_ptr,
+        GetTensorData<int8_t>(recurrent_to_forget_weights),
         integer_lstm_param->effective_recurrent_to_forget_scale_a,
         integer_lstm_param->effective_recurrent_to_forget_scale_b,
 
-        recurrent_to_cell_weight_ptr,
+        GetTensorData<int8_t>(recurrent_to_cell_weights),
         integer_lstm_param->effective_recurrent_to_cell_scale_a,
         integer_lstm_param->effective_recurrent_to_cell_scale_b,
 
-        recurrent_to_output_weight_ptr,
+        GetTensorData<int8_t>(recurrent_to_output_weights),
         integer_lstm_param->effective_recurrent_to_output_scale_a,
         integer_lstm_param->effective_recurrent_to_output_scale_b,
 
-        cell_to_input_weight_ptr,
+        GetTensorData<int8_t>(cell_to_input_weights),
         integer_lstm_param->effective_cell_to_input_scale_a,
         integer_lstm_param->effective_cell_to_input_scale_b,
 
-        cell_to_forget_weight_ptr,
+        GetTensorData<int8_t>(cell_to_forget_weights),
         integer_lstm_param->effective_cell_to_forget_scale_a,
         integer_lstm_param->effective_cell_to_forget_scale_b,
 
-        cell_to_output_weight_ptr,
+        GetTensorData<int8_t>(cell_to_output_weights),
         integer_lstm_param->effective_cell_to_output_scale_a,
         integer_lstm_param->effective_cell_to_output_scale_b,
 
-        projection_weight_ptr, integer_lstm_param->effective_proj_scale_a,
+        GetTensorData<int8_t>(projection_weights),
+        integer_lstm_param->effective_proj_scale_a,
         integer_lstm_param->effective_proj_scale_b,
 
-        layer_norm_input_weight_ptr,
+        GetTensorData<int16_t>(input_layer_norm_coefficients),
         integer_lstm_param->layer_norm_input_scale_a,
         integer_lstm_param->layer_norm_input_scale_b,
 
-        layer_norm_forget_weight_ptr,
+        GetTensorData<int16_t>(forget_layer_norm_coefficients),
         integer_lstm_param->layer_norm_forget_scale_a,
         integer_lstm_param->layer_norm_forget_scale_b,
 
-        layer_norm_cell_weight_ptr, integer_lstm_param->layer_norm_cell_scale_a,
+        GetTensorData<int16_t>(cell_layer_norm_coefficients),
+        integer_lstm_param->layer_norm_cell_scale_a,
         integer_lstm_param->layer_norm_cell_scale_b,
 
-        layer_norm_output_weight_ptr,
+        GetTensorData<int16_t>(output_layer_norm_coefficients),
         integer_lstm_param->layer_norm_output_scale_a,
         integer_lstm_param->layer_norm_output_scale_b,
 
-        input_gate_bias_ptr, forget_gate_bias_ptr, cell_gate_bias_ptr,
-        output_gate_bias_ptr, projection_bias_ptr,
+        GetTensorData<int32_t>(input_gate_bias),
+        GetTensorData<int32_t>(forget_gate_bias),
+        GetTensorData<int32_t>(cell_gate_bias),
+        GetTensorData<int32_t>(output_gate_bias),
+        GetTensorData<int32_t>(projection_bias),
 
         params, integer_lstm_param->intermediate_scale_a,
         integer_lstm_param->intermediate_scale_b,
         integer_lstm_param->intermediate_zp,
         integer_lstm_param->quantized_cell_clip,
         integer_lstm_param->quantized_proj_clip, n_batch, n_cell, n_input,
-        n_output, output_batch_leading_dim, output_state_ptr, output_state_zp,
-        cell_ptr, output_ptr, GetTensorData(scratch0),
-        GetTensorData(scratch1), GetTensorData(scratch2),
-        GetTensorData(scratch3), GetTensorData(scratch4),
-        GetTensorData(scratch5), GetTensorData(scratch6),
-        GetTensorData(scratch7));
+        n_output, output_batch_leading_dim, GetTensorData<int8_t>(output_state),
+        output_state_zp, GetTensorData<int16_t>(cell_state), output_ptr,
+        GetTensorData(scratch0), GetTensorData(scratch1),
+        GetTensorData(scratch2), GetTensorData(scratch3),
+        GetTensorData(scratch4), GetTensorData(scratch5),
+        GetTensorData(scratch6), GetTensorData(scratch7));
   }
 
   return kTfLiteOk;

From 9e7df609bca2e363aa3217b70c6f95578ba07f10 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 14:21:31 -0700
Subject: [PATCH 0658/1390] Fix tokenization tests and update testing_utils to
 transfer state between layer creation.

PiperOrigin-RevId: 317379253
Change-Id: I786c2eb0506239de0e7f1a5f314a8f1b0bda10d4
---
 tensorflow/python/keras/testing_utils.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 1928588fea1..cceaabe37a5 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -94,7 +94,8 @@ def layer_test(layer_cls,
                expected_output_shape=None,
                validate_training=True,
                adapt_data=None,
-               custom_objects=None):
+               custom_objects=None,
+               test_harness=None):
   """Test routine for a layer with a single input and single output.
 
   Arguments:
@@ -114,6 +115,8 @@ def layer_test(layer_cls,
       be tested for this layer. This is only relevant for PreprocessingLayers.
     custom_objects: Optional dictionary mapping name strings to custom objects
       in the layer class. This is helpful for testing custom layers.
+    test_harness: The Tensorflow test, if any, that this function is being
+      called in.
 
   Returns:
     The output data (Numpy array) returned by the layer, for additional
@@ -143,9 +146,15 @@ def layer_test(layer_cls,
     expected_output_dtype = input_dtype
 
   if dtypes.as_dtype(expected_output_dtype) == dtypes.string:
-    assert_equal = string_test
+    if test_harness:
+      assert_equal = test_harness.assertAllEqual
+    else:
+      assert_equal = string_test
   else:
-    assert_equal = numeric_test
+    if test_harness:
+      assert_equal = test_harness.assertAllClose
+    else:
+      assert_equal = numeric_test
 
   # instantiation
   kwargs = kwargs or {}
@@ -228,6 +237,7 @@ def layer_test(layer_cls,
   # test training mode (e.g. useful for dropout tests)
   # Rebuild the model to avoid the graph being reused between predict() and
   # See b/120160788 for more details. This should be mitigated after 2.0.
+  layer_weights = layer.get_weights()  # Get the layer weights BEFORE training.
   if validate_training:
     model = models.Model(x, layer(x))
     if _thread_local_data.run_eagerly is not None:
@@ -252,6 +262,8 @@ def layer_test(layer_cls,
   model = models.Sequential()
   model.add(layers.Input(shape=input_shape[1:], dtype=input_dtype))
   model.add(layer)
+
+  layer.set_weights(layer_weights)
   actual_output = model.predict(input_data)
   actual_output_shape = actual_output.shape
   for expected_dim, actual_dim in zip(computed_output_shape,

From e51d4027b323bef5d6042ee0b18d617193211ac8 Mon Sep 17 00:00:00 2001
From: rahul-kamat 
Date: Fri, 19 Jun 2020 21:49:53 +0000
Subject: [PATCH 0659/1390] Add annotations to fallback ops

---
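For context, the change below keeps annotation-map entries only for arguments whose type could be resolved and threads the map into AddEagerFallbackCode, so the generated *_eager_fallback wrapper also receives a return annotation. A rough sketch of the lookup-and-append step follows, with simplified types and made-up example values; the real map is keyed by OpDef arg names and built from the op's attrs.

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Append " -> <type>" to an already emitted "def ...:\n" line when the op's
    // single output has a known annotation (same idea as AddReturnTypeAnnotation).
    std::string AddReturnAnnotation(
        std::string def_line, const std::string& output_arg_name,
        const std::unordered_map<std::string, std::string>& type_annotations) {
      auto it = type_annotations.find(output_arg_name);
      if (it == type_annotations.end()) return def_line;
      def_line.erase(def_line.size() - 2);  // drop the trailing ":\n"
      return def_line + " -> " + it->second + ":\n";
    }

    int main() {
      // Example values only; names like TV_Abs_T are illustrative.
      std::unordered_map<std::string, std::string> type_annotations = {
          {"x", "_ops.Tensor[TV_Abs_T]"}, {"y", "_ops.Tensor[TV_Abs_T]"}};
      std::cout << AddReturnAnnotation(
          "def abs_eager_fallback(x: _ops.Tensor[TV_Abs_T], name, ctx):\n", "y",
          type_annotations);
    }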
 tensorflow/python/framework/python_op_gen.cc | 56 ++++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 062a9aa01e4..cdef72a155d 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -168,7 +168,8 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   bool AddEagerFallbackCode(const string& parameters,
                             const std::vector<string>& output_sizes,
                             const string& num_outputs_expr,
-                            const string& eager_not_allowed_error);
+                            const string& eager_not_allowed_error,
+                            std::unordered_map<string, string>& type_annotations);
   void AddEagerFastPathExecute();
 
   void AddEagerInferredAttrs(const string& indentation);
@@ -355,18 +356,19 @@ string GenEagerPythonOp::Code() {
   }
 
   string parameters;
+  // Param can be an input or an attr
   for (const auto& param : params_no_default_) {
     if (!parameters.empty()) strings::StrAppend(¶meters, ", ");
     strings::StrAppend(¶meters, param.GetRenameTo());
 
     // Add type annotations to param
     if (type_annotations.find(param.GetName()) != type_annotations.end()) {
-      if (!type_annotations[param.GetName()].empty()) {
-        strings::StrAppend(¶meters, ": ", type_annotations[param.GetName()]);
-      }
+      strings::StrAppend(¶meters, ": ", type_annotations[param.GetName()]);
     }
   }
 
+  // Append to parameters and parameters_with_defaults because multiple functions
+  // are generated (op and fallback op)
   string parameters_with_defaults = parameters;
   for (const auto& param_and_default : params_with_default_) {
     if (!parameters.empty()) strings::StrAppend(¶meters, ", ");
@@ -375,14 +377,12 @@ string GenEagerPythonOp::Code() {
 
     // Add type annotations to param_and_default
     if (type_annotations.find(param_and_default.first.GetName()) != type_annotations.end()) {
-      if (!type_annotations[param_and_default.first.GetName()].empty()) {
-        strings::StrAppend(¶meters, ": ", type_annotations[param_and_default.first.GetName()]);
-        strings::StrAppend(¶meters_with_defaults,
-                           param_and_default.first.GetRenameTo(), ": ",
-                           type_annotations[param_and_default.first.GetName()], " ",
-                           "= ", param_and_default.second);
-        continue;
-      }
+      const string param_type = type_annotations[param_and_default.first.GetName()];
+      strings::StrAppend(¶meters, param_and_default.first.GetRenameTo(), ": ", param_type);
+      strings::StrAppend(¶meters_with_defaults,
+                         param_and_default.first.GetRenameTo(), ": ",
+                         param_type, " = ", param_and_default.second);
+      continue;
     }
 
     strings::StrAppend(¶meters, param_and_default.first.GetRenameTo());
@@ -425,7 +425,7 @@ string GenEagerPythonOp::Code() {
   }
 
   if (!AddEagerFallbackCode(parameters, output_sizes, num_outputs_expr,
-                            eager_not_allowed_error)) {
+                            eager_not_allowed_error, type_annotations)) {
     return result_;
   }
 
@@ -449,35 +449,29 @@ std::unordered_map<string, string> GenEagerPythonOp::GetTypeAnnotationMap() {
   for (const auto& arg : op_def_.input_arg()) {
     // Do not add type annotations to args that accept a sequence of Tensors
     if (!arg.number_attr().empty()) continue;
-    string type_annotation;
     if (type_annotations.find(arg.type_attr()) != type_annotations.end()) {
       // Get the correct TypeVar if input maps to an attr
-      strings::StrAppend(&type_annotation, "_ops.Tensor[", type_annotations[arg.type_attr()], "]");
+      type_annotations[arg.name()] = "_ops.Tensor[" + type_annotations[arg.type_attr()] + "]";
     } else {
       // Get the dtype of the Tensor
       const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
       if (dtype_type.find(py_dtype) != dtype_type.end()) {
-        strings::StrAppend(&type_annotation, "_ops.Tensor[", dtype_type[py_dtype], "]");
+        type_annotations[arg.name()] = "_ops.Tensor[" + dtype_type[py_dtype] + "]";
       }
     }
-
-    type_annotations[arg.name()] = type_annotation;
   }
 
   // Mapping output Tensor to its type
   if (op_def_.output_arg_size() == 1) {
     const auto& arg = op_def_.output_arg(0);
-    string type_annotation;
     if (type_annotations.find(arg.type_attr()) != type_annotations.end()) {
-      strings::StrAppend(&type_annotation, "_ops.Tensor[", type_annotations[arg.type_attr()], "]");
+      type_annotations[arg.name()] = "_ops.Tensor[" + type_annotations[arg.type_attr()] + "]";
     } else {
       const string py_dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
       if (dtype_type.find(py_dtype) != dtype_type.end()) {
-        strings::StrAppend(&type_annotation, "_ops.Tensor[", dtype_type[py_dtype], "]");
+        type_annotations[arg.name()] = "_ops.Tensor[" + dtype_type[py_dtype] + "]";
       }
     }
-
-    type_annotations[arg.name()] = type_annotation;
   }
 
   return type_annotations;
@@ -521,19 +515,20 @@ void GenEagerPythonOp::GenerateTypeVars(std::unordered_map<string, string>& type
   if (added_typevar) strings::StrAppend(&result_, "\n");
 }
 
+// TODO(rahulkamat): Modify AddDefLine() to add return type annotation
 void GenEagerPythonOp::AddReturnTypeAnnotation(std::unordered_map<string, string>& type_annotations) {
   if (op_def_.output_arg_size() == 1) {
     const auto& arg = op_def_.output_arg(0);
     // Add type annotations to param
     if (type_annotations.find(arg.name()) != type_annotations.end()) {
-      if (!type_annotations[arg.name()].empty()) {
-        result_.erase(result_.length() - 2);
-        strings::StrAppend(&result_, " -> ", type_annotations[arg.name()], ":\n");
-      }
+      result_.erase(result_.length() - 2);
+      strings::StrAppend(&result_, " -> ", type_annotations[arg.name()], ":\n");
     }
   }
 }
 
+
+
 void GenEagerPythonOp::HandleGraphMode(
     const string& function_setup, const std::vector<string>& output_sizes) {
   strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
@@ -903,11 +898,14 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
 
 bool GenEagerPythonOp::AddEagerFallbackCode(
     const string& parameters, const std::vector<string>& output_sizes,
-    const string& num_outputs_expr, const string& eager_not_allowed_error) {
+    const string& num_outputs_expr, const string& eager_not_allowed_error,
+    std::unordered_map<string, string>& type_annotations) {
   AddDefLine(
       strings::StrCat(function_name_, kEagerFallbackSuffix),
       strings::StrCat(parameters, parameters.empty() ? "" : ", ", "ctx"));
-
+  if (type_annotate_ops.find(op_def_.name()) != type_annotate_ops.end()) {
+    AddReturnTypeAnnotation(type_annotations);
+  }
   if (!eager_not_allowed_error.empty()) {
     strings::StrAppend(&result_, "  ", eager_not_allowed_error);
     return true;

From f129485019052480c99442f774fb7e4a59ae5227 Mon Sep 17 00:00:00 2001
From: Bixia Zheng 
Date: Fri, 19 Jun 2020 14:59:47 -0700
Subject: [PATCH 0660/1390] [TF:TRT] Cosmetic fix.

Rewrite two lines of #if into #if GOOGLE_CUDA && GOOGLE_TENSORRT.

PiperOrigin-RevId: 317386436
Change-Id: Icc8ae27a17900b6f0a198d32c6d73345084eab50
---
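For context, an undefined macro evaluates to 0 inside #if, so folding the nested guards into a single conjunction leaves the preprocessed output unchanged; only the guard shape and the #endif comments differ. Illustration (not a file from the tree):

    // Before: two nested conditionals, each needing its own #endif.
    #if GOOGLE_CUDA
    #if GOOGLE_TENSORRT
    void TensorRtOnlyCode();
    #endif  // GOOGLE_TENSORRT
    #endif  // GOOGLE_CUDA

    // After: one conditional and a single #endif carrying the combined comment.
    #if GOOGLE_CUDA && GOOGLE_TENSORRT
    void TensorRtOnlyCode();
    #endif  // GOOGLE_CUDA && GOOGLE_TENSORRT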
 tensorflow/compiler/tf2tensorrt/common/utils.h       |  6 ++----
 .../compiler/tf2tensorrt/convert/convert_graph.cc    |  6 ++----
 .../compiler/tf2tensorrt/convert/convert_graph.h     |  6 ++----
 .../tf2tensorrt/convert/convert_graph_test.cc        |  6 ++----
 .../compiler/tf2tensorrt/convert/convert_nodes.cc    |  6 ++----
 .../compiler/tf2tensorrt/convert/convert_nodes.h     |  6 ++----
 .../tf2tensorrt/convert/convert_nodes_test.cc        |  6 ++----
 .../compiler/tf2tensorrt/convert/logger_registry.cc  |  6 ++----
 .../compiler/tf2tensorrt/convert/logger_registry.h   |  5 +++--
 .../tf2tensorrt/convert/trt_optimization_pass.cc     |  6 ++----
 .../tf2tensorrt/convert/trt_optimization_pass.h      |  6 ++----
 .../tf2tensorrt/kernels/get_calibration_data_op.cc   |  6 ++----
 .../compiler/tf2tensorrt/kernels/trt_engine_op.cc    |  6 ++----
 .../tf2tensorrt/kernels/trt_engine_op_test.cc        |  6 ++----
 .../tf2tensorrt/kernels/trt_engine_resource_ops.cc   |  6 ++----
 .../kernels/trt_engine_resource_ops_test.cc          |  6 ++----
 .../tf2tensorrt/ops/get_calibration_data_op.cc       |  6 ++----
 tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc |  6 ++----
 .../tf2tensorrt/ops/trt_engine_resource_ops.cc       |  6 ++----
 .../compiler/tf2tensorrt/plugin/plugin_cast.cu.cc    |  6 ++----
 tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc |  6 ++----
 tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h  |  6 ++----
 tensorflow/compiler/tf2tensorrt/segment/segment.cc   |  6 ++----
 tensorflow/compiler/tf2tensorrt/segment/segment.h    |  6 ++----
 .../compiler/tf2tensorrt/segment/segment_test.cc     |  6 ++----
 tensorflow/compiler/tf2tensorrt/segment/union_find.h |  6 ++----
 tensorflow/compiler/tf2tensorrt/tensorrt_test.cc     |  6 ++----
 .../compiler/tf2tensorrt/utils/trt_allocator.cc      | 12 ++++--------
 .../compiler/tf2tensorrt/utils/trt_allocator.h       | 12 ++++--------
 .../compiler/tf2tensorrt/utils/trt_engine_utils.cc   |  6 ++----
 .../compiler/tf2tensorrt/utils/trt_engine_utils.h    |  6 ++----
 .../tf2tensorrt/utils/trt_int8_calibrator.cc         |  6 ++----
 .../compiler/tf2tensorrt/utils/trt_int8_calibrator.h |  6 ++----
 tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc  |  6 ++----
 tensorflow/compiler/tf2tensorrt/utils/trt_logger.h   |  6 ++----
 .../compiler/tf2tensorrt/utils/trt_lru_cache.cc      |  6 ++----
 .../compiler/tf2tensorrt/utils/trt_lru_cache.h       |  6 ++----
 .../utils/trt_shape_optimization_profiles.h          |  6 ++----
 .../utils/trt_shape_optimization_profiles_test.cc    |  6 ++----
 39 files changed, 83 insertions(+), 162 deletions(-)

diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h
index 9ab0145e1ec..b428733ecd4 100644
--- a/tensorflow/compiler/tf2tensorrt/common/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/common/utils.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
 #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -29,7 +28,6 @@ namespace tensorrt {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
-#endif
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index 1c51d51f1c9..5429aaf3362 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -53,8 +53,7 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
 namespace tensorflow {
@@ -884,5 +883,4 @@ Status ConvertAfterShapes(const ConversionParams& params) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
index 53ab84a6fa9..d3897e864fa 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -66,7 +65,6 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index a1f523d6bfa..54fb1d56441 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -34,8 +34,7 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
 #include "tensorflow/core/public/session.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -231,5 +230,4 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 96cec556942..2ec616ba621 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -59,8 +59,7 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/strided_slice_op.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 #include "third_party/tensorrt/NvInferPlugin.h"
 
@@ -6258,5 +6257,4 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 7a1276c645c..a621735fad1 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -33,8 +33,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -694,7 +693,6 @@ BinaryOperationMap();
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index c24b169f651..53ec9ee7ada 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include 
 #include 
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include 
 #include 
@@ -6636,5 +6635,4 @@ TEST_F(OpConverterTest, ConvertPad) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
index 82e68cbb28d..07c9c2f1ea0 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 
@@ -58,5 +57,4 @@ LoggerRegistry* GetLoggerRegistry() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
index 45b302742d0..2a265cf7caa 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
@@ -19,7 +19,8 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -53,5 +54,5 @@ class RegisterLogger {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index 72f4fe5ef9b..1cf98d135cb 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stacktrace.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -302,5 +301,4 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
-#endif
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
index f79048bb5f6..e0aaa5500ab 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -83,6 +82,5 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
-#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
index 3143b06817e..76fb40b9520 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/refcount.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -67,5 +66,4 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU),
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index 98d199ca9ab..1094555a622 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -48,8 +48,7 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
 
@@ -1009,5 +1008,4 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
index a06010de1c7..71193dc24cf 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
@@ -50,8 +50,7 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/public/version.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -306,5 +305,4 @@ TYPED_TEST(TRTEngineOpTest, Basic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
index 2c5821df6ac..3b6e7e91d3b 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
@@ -33,8 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -251,5 +250,4 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU),
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
index 4a24160569d..6a073ee24d0 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
@@ -48,8 +48,7 @@ limitations under the License.
 #include "tensorflow/core/platform/tstring.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -246,5 +245,4 @@ TEST_F(TRTEngineResourceOpsTest, Basic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
index 573172b92e6..2af3164c3e2 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -34,5 +33,4 @@ Returns calibration data for the given resource name
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
index bd3c2b299a9..2527fe9b910 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -59,5 +58,4 @@ REGISTER_OP("TRTEngineOp")
     .Attr("static_engine: bool = true");
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
index 01911de66ec..3141092de03 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -46,5 +45,4 @@ REGISTER_OP("SerializeTRTResource")
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
index 4c0d8b0392a..141a7d1f462 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
@@ -17,8 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #define EIGEN_USE_GPU  // For definition of Eigen::GpuDevice.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
@@ -234,5 +233,4 @@ REGISTER_TFTRT_PLUGIN(CastPluginCreator);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
-#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 563ce724f43..83d5f9b5965 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include 
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -30,5 +29,4 @@ const char* kTfTrtPluginNamespace = "TF";
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
-#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
index bdb046e6c71..600ac6683da 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -90,7 +89,6 @@ class TrtPluginRegistrar {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 32e30006f58..d9080b6f69a 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -35,8 +35,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/env_var.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -1062,5 +1061,4 @@ Status SegmentGraph(const Graph* tf_graph,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
index 7295c8f0d9d..3f79983cfd2 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -67,7 +66,6 @@ Status SegmentGraph(const Graph* tf_graph,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index 2437481a9c4..f3bc5bfbee6 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -522,5 +521,4 @@ TEST_F(SegmentTest, IncompatibleBatchSizes) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
index 70e83c12fca..b53615ec019 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/types/optional.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -217,7 +216,6 @@ UnionFind* UnionFind::FindRoot() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
index 510591bfe00..e994d20df33 100644
--- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
@@ -164,5 +163,4 @@ TEST(TensorrtTest, BasicFunctions) {
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
index 617ea7fad5c..d4f3a524577 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
@@ -17,11 +17,9 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -52,8 +50,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -113,5 +110,4 @@ void TRTDeviceAllocator::free(void* memory) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
index 4ab8b52f523..d219a8a14e8 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
@@ -20,11 +20,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -33,8 +31,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -69,6 +66,5 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
index ed997b267b1..8ccfb8b06f0 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/errors.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -257,5 +256,4 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
index a471749877a..1ea4fe28cb4 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -91,7 +90,6 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index 554c127fa37..24271e352a7 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
@@ -147,5 +146,4 @@ TRTInt8Calibrator::~TRTInt8Calibrator() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
-#endif
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 06b39716490..4c670e85f52 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include 
 #include "tensorflow/core/platform/mutex.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
@@ -101,6 +100,5 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
-#endif
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index 193687ebc8c..e34bf5e7397 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 #include "tensorflow/core/platform/logging.h"
@@ -68,5 +67,4 @@ REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger());
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
-#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
index 2ade1b48f47..ce6552e8fe9 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -40,7 +39,6 @@ class Logger : public nvinfer1::ILogger {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
index fbcdaad52c0..ee7e6272372 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/mutex.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -141,5 +140,4 @@ EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
index 8e345254f75..991b9a949e4 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -115,8 +115,7 @@ class LRUCache {
   }
 };
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 struct EngineContext {
   EngineContext() {}  // Creates an empty context.
@@ -223,8 +222,7 @@ class TRTEngineCacheResource : public ResourceBase {
   TrtShapeOptimizationProfile profiles_;
 };
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
index 40c7f5dcf31..fc688b14139 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
@@ -29,8 +29,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include "third_party/tensorrt/NvInfer.h"
 
@@ -173,6 +172,5 @@ class TrtShapeOptimizationProfile {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
index 501810587e0..32c2200fb71 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
 
 #include 
 
@@ -214,5 +213,4 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT

From 32d63d0a3efb5e0b65dc6f590e54248054cf9f14 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Fri, 15 May 2020 11:40:22 -0500
Subject: [PATCH 0661/1390] Removed TENSOR_OP disable env vars.

* TF_DISABLE_CUBLAS_TENSOR_OP_MATH
* TF_DISABLE_CUDNN_TENSOR_OP_MATH
* TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH
---
 tensorflow/stream_executor/cuda/cuda_blas.cc | 21 ++------
 tensorflow/stream_executor/cuda/cuda_dnn.cc  | 55 +++++---------------
 2 files changed, 16 insertions(+), 60 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index c9f0fc462c9..65c07e72154 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -101,18 +101,6 @@ static std::string ToString(cublasStatus_t status) {
   }
 }
 
-// Decide whether to enable TENSOR_OP_MATH
-static bool TensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
 // cuBLAS has interfaces that permit pointers to be passed from either the host
 // memory space or the device memory space; however, you must instruct it as to
 // which address space those pointers are in with cublasSetPointerMode.
@@ -1640,7 +1628,7 @@ bool CUDABlas::DoBlasGemm(
                                                                    &cc_minor);
 
   // GPUs < sm_70 don't support tensor ops.
-  if (cc_major >= 7 && TensorOpMathEnabled()) {
+  if (cc_major >= 7) {
     use_tensor_ops = true;
   }
 #endif
@@ -1921,8 +1909,7 @@ static bool TensorOpsAvailable(int cc_major) {
   // strictly correct.  We can't simply enable it, though, as that would change
   // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
   // to be rounded to fp16.
-  if (cc_major >= 7 && TensorOpMathEnabled() &&
-      std::is_same::value) {
+  if (cc_major >= 7 && std::is_same::value) {
     return true;
   }
 #endif
@@ -2270,7 +2257,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor) &&
       cc_major >= 5) {
-    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
+    bool use_tensor_ops = data_type == CUDA_R_16F;
     cublasGemmAlgo_t algo =
         (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
     cudaDataType_t compute_type =
@@ -2425,7 +2412,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor)) {
     // GPUs < sm_70 don't support tensor ops.
-    if (cc_major >= 7 && TensorOpMathEnabled()) {
+    if (cc_major >= 7) {
       use_tensor_ops = true;
     }
 #if CUDA_VERSION >= 9010
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index be18c989861..e46c271443b 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -601,31 +601,6 @@ class CudnnFilterDescriptor {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
 };
 
-// A helper function to decide whether to enable the TENSOR_OP_MATH math type
-bool TensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled = false;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
-// A helper function to decide whether to enable the TENSOR_OP_MATH math type
-// for RNNs.
-bool RnnTensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled = false;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
 // A helper function to decide whether to use
 // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
 // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -749,9 +724,7 @@ class CudnnConvolutionDescriptor {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
-    if (TensorOpMathEnabled()) {
-      CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
-    }
+    CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
 #endif
   }
 
@@ -1155,21 +1128,19 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
     // in profile mode, which is run with algorithms returned from
     // GetRnnAlgorithms() (which are non-default and explicitly set whether to
     // use tensor ops). CuDNN 7.2.1 fixed this issue
-    if (RnnTensorOpMathEnabled()) {
-      cudnnMathType_t math_type;
-      if (algorithm_config.algorithm().has_value()) {
-        math_type = algorithm_config.algorithm()->tensor_ops_enabled()
-                        ? CUDNN_TENSOR_OP_MATH
-                        : CUDNN_DEFAULT_MATH;
-      } else {
+    cudnnMathType_t math_type;
+    if (algorithm_config.algorithm().has_value()) {
+      math_type = algorithm_config.algorithm()->tensor_ops_enabled()
+                      ? CUDNN_TENSOR_OP_MATH
+                      : CUDNN_DEFAULT_MATH;
+    } else {
 #if CUDNN_VERSION >= 7201
-        math_type = CUDNN_TENSOR_OP_MATH;
+      math_type = CUDNN_TENSOR_OP_MATH;
 #else
-        math_type = CUDNN_DEFAULT_MATH;
+      math_type = CUDNN_DEFAULT_MATH;
 #endif  // CUDNN_VERSION >= 7201
-      }
-      CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
     }
+    CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
 #endif  // CUDNN_VERSION >= 7000
 
     return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
@@ -2686,7 +2657,7 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
 }
 
 static bool TensorOpMathAvailable(int cc_major) {
-  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
+  return cc_major >= 7 && CUDNN_VERSION >= 7000;
 }
 
 port::StatusOr GetCudnnConvolutionForwardAlgorithm(
@@ -3480,9 +3451,7 @@ bool CudnnSupport::GetRnnAlgorithms(
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
 #if CUDNN_VERSION >= 7100
-    if (RnnTensorOpMathEnabled()) {
-      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
-    }
+    out_algorithms->push_back({i, /*use_tensor_ops=*/true});
 #endif
   }
   return true;

From d2afc9ce83b48170f0821b6b3b5debddd857d320 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Fri, 15 May 2020 11:46:41 -0500
Subject: [PATCH 0662/1390] Add global setting to control TF32 execution

---
 tensorflow/core/platform/BUILD         |  7 +++++++
 tensorflow/core/platform/tf32_utils.cc | 27 ++++++++++++++++++++++++++
 tensorflow/core/platform/tf32_utils.h  | 27 ++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 tensorflow/core/platform/tf32_utils.cc
 create mode 100644 tensorflow/core/platform/tf32_utils.h

diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index 70bb8a89417..33a1e7cfe0a 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -938,6 +938,13 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "tf32_utils",
+    srcs = ["tf32_utils.cc"],
+    hdrs = ["tf32_utils.h"],
+    copts = tf_copts(),
+)
+
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc
new file mode 100644
index 00000000000..715b5996dc3
--- /dev/null
+++ b/tensorflow/core/platform/tf32_utils.cc
@@ -0,0 +1,27 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/tf32_utils.h"
+
+namespace tensorflow {
+
+// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing.
+static bool tf32_enabled = false;
+
+void allow_tf32_execution(bool allow) { tf32_enabled = allow; }
+
+bool tf32_execution_allowed() { return tf32_enabled; }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h
new file mode 100644
index 00000000000..a0ce58f9bbd
--- /dev/null
+++ b/tensorflow/core/platform/tf32_utils.h
@@ -0,0 +1,27 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
+#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
+
+namespace tensorflow {
+
+void allow_tf32_execution(bool allow);
+
+bool tf32_execution_allowed();
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
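
For orientation, the two functions above are the entire TF32 switch: a process-wide flag that GPU backends poll before choosing a math mode. A minimal usage sketch (illustrative only, not part of the patch), assuming just the header added here:

```cpp
#include <cassert>

#include "tensorflow/core/platform/tf32_utils.h"

int main() {
  // TF32 execution starts out disabled (see the TODO in tf32_utils.cc).
  assert(!tensorflow::tf32_execution_allowed());

  // Opt the whole process in; kernels that consult the flag may now request
  // TF32 math modes on hardware that supports them.
  tensorflow::allow_tf32_execution(true);
  assert(tensorflow::tf32_execution_allowed());
  return 0;
}
```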

From 16033c0b3484409a965acc0dd3054695145311a8 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Fri, 15 May 2020 13:33:02 -0500
Subject: [PATCH 0663/1390] Python tf.config tf32 interface

---
 tensorflow/python/BUILD               | 11 +++++++++++
 tensorflow/python/framework/config.py | 26 ++++++++++++++++++++++++++
 tensorflow/python/util/tf32.cc        | 22 ++++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 tensorflow/python/util/tf32.cc

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index de9cf9a24c7..5f9e2dfb1ff 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -788,6 +788,16 @@ tf_python_pybind_extension(
     ],
 )
 
+tf_python_pybind_extension(
+    name = "_pywrap_tf32_execution",
+    srcs = ["util/tf32.cc"],
+    module_name = "_pywrap_tf32_execution",
+    deps = [
+        "//tensorflow/core/platform:tf32_utils",
+        "@pybind11",
+    ],
+)
+
 tf_python_pybind_extension(
     name = "_pywrap_util_port",
     srcs = ["util/port_wrapper.cc"],
@@ -5678,6 +5688,7 @@ py_library(
         "//tensorflow:composite_tensor_whitelist",
     ],
     deps = [
+        ":_pywrap_tf32_execution",
         ":tf_decorator",
         ":tf_export",
         ":tf_stack",
diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index 9ff16f2a327..cb95965dfb2 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -18,10 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import _pywrap_tf32_execution
 from tensorflow.python.eager import context
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
+def tensor_float32_execution_allowed():
+  """Get if TensorFloat-32 operations are enabled on supported hardware.
+
+  Returns:
+    True if TensorFloat-32 execution is enabled and False otherwise.
+  """
+  return _pywrap_tf32_execution.is_allowed()
+
+def allow_tensor_float_32_execution(allow):
+  """Allow use of TensorFloat-32 with float32 ops on supported hardware.
+
+  TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
+  TensorFloat-32 kernels take float32 inputs and produce float32 outputs.
+  Internally, the inputs are cast to a custom representation with 10-bit
+  mantissa (similar to float16) and 8-bit exponent (similar to float32) and are
+  executed using TensorCores with float32 accumulation. For more information,
+  see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/.
+
+  TensorFloat-32 execution is disabled by default, but this may change in a
+  future version.
+  
+  Args:
+    allow: whether to allow TensorFloat-32 execution
+  """
+  _pywrap_tf32_execution.allow(allow)
 
 @tf_export('config.threading.get_intra_op_parallelism_threads')
 def get_intra_op_parallelism_threads():
diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc
new file mode 100644
index 00000000000..7dece6ccdae
--- /dev/null
+++ b/tensorflow/python/util/tf32.cc
@@ -0,0 +1,22 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "pybind11/pybind11.h"
+#include "tensorflow/core/platform/tf32_utils.h"
+
+PYBIND11_MODULE(_pywrap_tf32_execution, m) {
+  m.def("allow", &tensorflow::allow_tf32_execution);
+  m.def("is_allowed", &tensorflow::tf32_execution_allowed);
+}

From 376efd71b10eb7d7b900c3f7e7aff99bf15b0196 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Tue, 19 May 2020 14:58:30 -0500
Subject: [PATCH 0664/1390] Convolution TF32 Plumbing

---
 tensorflow/stream_executor/cuda/BUILD       |   1 +
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 200 +++++++++++++-------
 2 files changed, 135 insertions(+), 66 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index c3cf9f5db15..cdc0de7c72a 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -356,6 +356,7 @@ cc_library(
         "@local_config_cuda//cuda:cudnn_header",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/platform:tf32_utils",
         "//tensorflow/stream_executor:dnn",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:plugin_registry",
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index e46c271443b..53ba31d7d0d 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include 
 
 #include "absl/strings/str_cat.h"
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/tf32_utils.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "third_party/eigen3/Eigen/Core"
 // clang-format off
 #include "third_party/gpus/cudnn/cudnn.h"
 #include "absl/strings/string_view.h"
@@ -705,10 +706,6 @@ class CudnnConvolutionDescriptor {
             : CUDNN_CROSS_CORRELATION,
         data_type));
 
-    // NOTE(benbarsdell): This only applies if tensor op math is enabled
-    //                      and algo selection is set to Default.
-    this->set_use_tensor_op_math(true);
-
 #if CUDNN_MAJOR >= 7
     VLOG(2) << "Requesting grouped convolution: "
             << convolution_descriptor.group_count();
@@ -720,10 +717,14 @@ class CudnnConvolutionDescriptor {
 #endif
   }
 
-  void set_use_tensor_op_math(bool use_tensor_op_math) const {
+  void set_use_tensor_op_math(bool use_tensor_op_math) {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
+#if CUDNN_VERSION >= 8000
+        (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH);
+#else
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
+#endif
     CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
 #endif
   }
@@ -736,6 +737,38 @@ class CudnnConvolutionDescriptor {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
 };
 
+// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
+// set
+static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
+  cudnnMathType_t math_type;
+  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
+#if CUDNN_VERSION >= 8000
+  return math_type != CUDNN_FMA_MATH;
+#else
+  return math_type == CUDNN_TENSOR_OP_MATH;
+#endif
+}
+
+static bool TensorOpMathAvailable(int cc_major) {
+  return cc_major >= 7 && CUDNN_VERSION >= 7000;
+}
+
+static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) {
+  int cc_major, cc_minor;
+  std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
+  if (!TensorOpMathAvailable(cc_major)) {
+    return false;
+  }
+  if (input_type == dnn::DataType::kFloat) {
+    if (CUDNN_VERSION < 8000) {
+      return false;
+    } else if (!tensorflow::tf32_execution_allowed()) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
 // within a scope.
 class CudnnPoolingDescriptor {
@@ -2531,10 +2564,11 @@ port::StatusOr> AllocateCudnnConvolutionForwardWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  // TODO(csigg): This has side effects on the convolution descriptor. It is
-  // functionally correct because the convolution is run with the algorithm of
-  // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
+    return port::Status(
+        port::error::INTERNAL,
+        "Mismatch between cudnn conv and algorithm descriptors.");
+  }
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2574,10 +2608,11 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  // TODO(csigg): This has side effects on the convolution descriptor. It is
-  // functionally correct because the convolution is run with the algorithm of
-  // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
+    return port::Status(
+        port::error::INTERNAL,
+        "Mismatch between cudnn conv and algorithm descriptors.");
+  }
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2619,10 +2654,11 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  // TODO(csigg): This has side effects on the convolution descriptor. It is
-  // functionally correct because the convolution is run with the algorithm of
-  // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
+    return port::Status(
+        port::error::INTERNAL,
+        "Mismatch between cudnn conv and algorithm descriptors.");
+  }
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2656,18 +2692,39 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
   return scratch_allocator->AllocateBytes(size_in_bytes);
 }
 
-static bool TensorOpMathAvailable(int cc_major) {
-  return cc_major >= 7 && CUDNN_VERSION >= 7000;
+port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type,
+                                  absl::optional desc) {
+  bool use_tensor_ops;
+  if (desc.has_value()) {
+    use_tensor_ops = desc->tensor_ops_enabled();
+    if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) {
+      return port::Status(port::error::INVALID_ARGUMENT,
+                          "Algo requests disallowed tensor op evaluation.");
+    }
+  } else {
+    use_tensor_ops = IsTensorMathAllowed(stream, type);
+  }
+  return use_tensor_ops;
 }
 
 port::StatusOr GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    const CudnnConvolutionDescriptor& conv,
+    dnn::DataType element_type,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory* scratch) {
   absl::optional algo_desc = algorithm_config.algorithm();
+
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
+  bool use_tensor_ops;
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
+
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2680,10 +2737,7 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm(
                         GetCudnnConvolutionForwardAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    int cc_major, cc_minor;
-    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
-    algo_desc = dnn::AlgorithmDesc(
-        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
+    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
   }
 
   const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
@@ -2707,6 +2761,9 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm(
                      "Returned status: ", scratch_or.status().ToString()));
   }
 
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
                                     output_nd, *algo_desc, scratch_allocator));
@@ -2717,10 +2774,19 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    const CudnnConvolutionDescriptor& conv,
+    dnn::DataType element_type,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory* scratch) {
   absl::optional algo_desc = algorithm_config.algorithm();
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
+  bool use_tensor_ops;
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
+
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2733,10 +2799,7 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm(
                         GetCudnnConvolutionBackwardDataAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    int cc_major, cc_minor;
-    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
-    algo_desc = dnn::AlgorithmDesc(
-        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
+    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
   }
 
   const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
@@ -2759,6 +2822,9 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm(
         "while a secondary algorithm is not provided.");
   }
 
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
                                     output_nd, *algo_desc, scratch_allocator));
@@ -2769,10 +2835,19 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    const CudnnConvolutionDescriptor& conv,
+    dnn::DataType element_type,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory* scratch) {
   absl::optional algo_desc = algorithm_config.algorithm();
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
+  bool use_tensor_ops;
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
+
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2785,10 +2860,7 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm(
                         GetCudnnConvolutionBackwardFilterAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    int cc_major, cc_minor;
-    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
-    algo_desc = dnn::AlgorithmDesc(
-        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
+    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
@@ -2811,6 +2883,9 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm(
         "while a secondary algorithm is not provided.");
   }
 
+  SE_ASSIGN_OR_RETURN(use_tensor_ops,
+                      UseTensorOps(stream, element_type, algo_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
                                     output_nd, *algo_desc, scratch_allocator));
@@ -2975,35 +3050,32 @@ port::Status CudnnSupport::DoPrepareForConvolution(
   CudnnTensorDescriptor output_nd(
       output_descriptor,
       ToCudnnDataType(element_type, output_descriptor.layout()));
-  CudnnConvolutionDescriptor conv(
-      convolution_descriptor,
-      ToCudnnDataType(GetConvAccumulatorType(element_type)));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
   switch (kind) {
     case dnn::ConvolutionKind::FORWARD: {
-      SE_ASSIGN_OR_RETURN(
-          *algorithm_desc,
-          GetCudnnConvolutionForwardAlgorithm(
-              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
-              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(*algorithm_desc,
+                          GetCudnnConvolutionForwardAlgorithm(
+                              stream, cudnn, algorithm_config, input_nd,
+                              filter_nd, element_type, convolution_descriptor,
+                              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     case dnn::ConvolutionKind::BACKWARD_DATA: {
-      SE_ASSIGN_OR_RETURN(
-          *algorithm_desc,
-          GetCudnnConvolutionBackwardDataAlgorithm(
-              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
-              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(*algorithm_desc,
+                          GetCudnnConvolutionBackwardDataAlgorithm(
+                              stream, cudnn, algorithm_config, input_nd,
+                              filter_nd, element_type, convolution_descriptor,
+                              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     case dnn::ConvolutionKind::BACKWARD_FILTER: {
-      SE_ASSIGN_OR_RETURN(
-          *algorithm_desc,
-          GetCudnnConvolutionBackwardFilterAlgorithm(
-              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
-              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(*algorithm_desc,
+                          GetCudnnConvolutionBackwardFilterAlgorithm(
+                              stream, cudnn, algorithm_config, input_nd,
+                              filter_nd, element_type, convolution_descriptor,
+                              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     default:
@@ -3032,8 +3104,9 @@ port::Status CudnnSupport::DoConvolve(
   auto accumulator_type = GetConvAccumulatorType(element_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
                                   ToCudnnDataType(accumulator_type));
-  // Set use_tensor_math param to correct value
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  SE_ASSIGN_OR_RETURN(bool use_tensor_ops,
+                      UseTensorOps(stream, element_type, algorithm_desc));
+  conv.set_use_tensor_op_math(use_tensor_ops);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -3266,14 +3339,6 @@ port::Status CudnnSupport::DoConvolve(
   return port::Status::OK();
 }
 
-// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
-// set
-static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
-  cudnnMathType_t math_type;
-  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
-  return math_type == CUDNN_TENSOR_OP_MATH;
-}
-
 template 
 port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3307,8 +3372,6 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       filter_descriptor,
       GetCudnnDataType(conv_input_descriptor.layout()));
   CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType());
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
@@ -3318,9 +3381,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
   SE_ASSIGN_OR_RETURN(
       dnn::AlgorithmDesc algo_desc,
       GetCudnnConvolutionForwardAlgorithm(
-          stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
+          stream, cudnn, algorithm_config, conv_input_nd, filter,
+          dnn::ToDataType::value, convolution_descriptor,
           output_nd, scratch_allocator, &scratch));
 
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  ToCudnnDataType(accumulator_type));
+  conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled());
+
   std::unique_ptr timer;
   if (is_profiling) {
     timer.reset(new GpuTimer(parent_));  // NOLINT
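
The heart of this patch is the UseTensorOps/IsTensorMathAllowed pair: an explicitly requested algorithm must agree with what the device, the cuDNN version, and the TF32 flag permit, otherwise the request is rejected. A standalone restatement of that rule (simplified types, an exception instead of port::Status; not the actual StreamExecutor signatures):

```cpp
#include <optional>
#include <stdexcept>

enum class DataType { kFloat, kHalf };

struct AlgorithmDesc {
  bool tensor_ops_enabled;
};

// Simplified policy check: tensor cores need sm_70+, and fp32 inputs
// additionally need cuDNN 8 (TF32) plus the process-wide opt-in.
bool IsTensorMathAllowed(int cc_major, int cudnn_major, bool tf32_allowed,
                         DataType input_type) {
  if (cc_major < 7) return false;
  if (input_type == DataType::kFloat) {
    if (cudnn_major < 8) return false;
    if (!tf32_allowed) return false;
  }
  return true;
}

// Mirrors UseTensorOps: honor an explicit algorithm choice but reject it if
// it asks for tensor ops the policy disallows; otherwise follow the policy.
bool UseTensorOps(std::optional<AlgorithmDesc> desc, int cc_major,
                  int cudnn_major, bool tf32_allowed, DataType type) {
  const bool allowed =
      IsTensorMathAllowed(cc_major, cudnn_major, tf32_allowed, type);
  if (desc.has_value()) {
    if (desc->tensor_ops_enabled && !allowed) {
      throw std::invalid_argument(
          "Algo requests disallowed tensor op evaluation.");
    }
    return desc->tensor_ops_enabled;
  }
  return allowed;
}
```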

From 107e6348236d35a9fcdb4a1375ff07bf4975131c Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Tue, 19 May 2020 15:54:10 -0500
Subject: [PATCH 0665/1390] Plumb TF32 for RNN

---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 30 ++++++++++++++-------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 53ba31d7d0d..820d39c7201 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1161,17 +1161,26 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
     // in profile mode, which is run with algorithms returned from
     // GetRnnAlgorithms() (which are non-default and explicitly set whether to
     // use tensor ops). CuDNN 7.2.1 fixed this issue
-    cudnnMathType_t math_type;
+    bool allow_tensor_ops =
+        data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed();
+    bool use_tensor_ops;
     if (algorithm_config.algorithm().has_value()) {
-      math_type = algorithm_config.algorithm()->tensor_ops_enabled()
-                      ? CUDNN_TENSOR_OP_MATH
-                      : CUDNN_DEFAULT_MATH;
+      use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled();
     } else {
-#if CUDNN_VERSION >= 7201
-      math_type = CUDNN_TENSOR_OP_MATH;
-#else
-      math_type = CUDNN_DEFAULT_MATH;
-#endif  // CUDNN_VERSION >= 7201
+      use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops;
+    }
+
+    if (use_tensor_ops && !allow_tensor_ops) {
+      return port::Status(port::error::INVALID_ARGUMENT,
+                          "Algo requests disallowed tensor op evaluation.");
+    }
+
+    cudnnMathType_t math_type;
+    if (use_tensor_ops) {
+      math_type =
+          CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH;
+    } else {
+      math_type = CUDNN_VERSION >= 8000 ? CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH;
     }
     CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
 #endif  // CUDNN_VERSION >= 7000
@@ -2707,6 +2716,9 @@ port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type,
   return use_tensor_ops;
 }
 
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
+dnn::DataType GetConvAccumulatorType(dnn::DataType data_type);
+
 port::StatusOr GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
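
The RNN path applies the same idea with a slightly different rule: fp32 RNNs may use tensor cores only when TF32 is allowed, the implicit default additionally requires cuDNN >= 7.2.1, and an explicit algorithm choice that conflicts with the policy is rejected. A compact sketch of that decision (illustrative only; the real code returns an INVALID_ARGUMENT status rather than a flag):

```cpp
#include <optional>

struct RnnTensorOpDecision {
  bool use_tensor_ops;
  bool valid;  // false corresponds to the INVALID_ARGUMENT status above
};

RnnTensorOpDecision DecideRnnTensorOps(bool input_is_fp32, bool tf32_allowed,
                                       int cudnn_version,
                                       std::optional<bool> requested) {
  const bool allow = !input_is_fp32 || tf32_allowed;
  const bool use =
      requested.has_value() ? *requested : (cudnn_version >= 7201 && allow);
  return {use, !(use && !allow)};
}
```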

From 03d836e2616574c657f7702e24e4fc79c661115b Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Wed, 20 May 2020 10:06:35 -0500
Subject: [PATCH 0666/1390] Plumb TF32 for cublas gemm

---
 tensorflow/stream_executor/cuda/BUILD        |  1 +
 tensorflow/stream_executor/cuda/cuda_blas.cc | 84 +++++++++-----------
 tensorflow/stream_executor/cuda/cuda_blas.h  |  8 +-
 3 files changed, 43 insertions(+), 50 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index cdc0de7c72a..3a14be9ad50 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -251,6 +251,7 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/platform:tf32_utils",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:host_or_device_scalar",
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 65c07e72154..e2cbb0b75df 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -48,7 +48,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/platform/tf32_utils.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -66,6 +66,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
+#include "third_party/eigen3/Eigen/Core"
 
 namespace stream_executor {
 namespace gpu {
@@ -225,6 +226,18 @@ bool CUDABlas::Init() {
     return false;
   }
 
+#if CUDA_VERSION >= 9000
+#if CUBLAS_VER_MAJOR >= 11
+  ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
+#else
+  ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
+#endif
+  if (ret != CUBLAS_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
+    return false;
+  }
+#endif
+
   return true;
 }
 
@@ -387,7 +400,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
 template 
 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
-                                  bool use_tensor_op_math, Args... args) {
+                                  Args... args) {
   absl::MutexLock lock(&mu_);
 
   CHECK(blas_ != nullptr);
@@ -401,10 +414,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                            : CUBLAS_POINTER_MODE_DEVICE)) {
     return false;
   }
-#if CUDA_VERSION >= 9000
+#if CUBLAS_VER_MAJOR >= 11
   ScopedCublasMathMode math_mode{blas_};
-  if (use_tensor_op_math) {
-    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
+  if (!tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
       return false;
     }
   }
@@ -1621,21 +1634,9 @@ bool CUDABlas::DoBlasGemm(
     }
   }
 
-  bool use_tensor_ops = false;
-#if CUDA_VERSION >= 9000
-  int cc_major, cc_minor;
-  stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
-                                                                   &cc_minor);
-
-  // GPUs < sm_70 don't support tensor ops.
-  if (cc_major >= 7) {
-    use_tensor_ops = true;
-  }
-#endif
-
   return DoBlasInternalImpl(
       cublasSgemmEx, stream, true /* = pointer_mode_host */,
-      true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
+      true /* = err_on_failure= */, CUDABlasTranspose(transa),
       CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
       SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
       GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
@@ -2257,7 +2258,8 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor) &&
       cc_major >= 5) {
-    bool use_tensor_ops = data_type == CUDA_R_16F;
+    bool use_tensor_ops =
+        data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed();
     cublasGemmAlgo_t algo =
         (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
     cudaDataType_t compute_type =
@@ -2271,7 +2273,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     bool ok;
     ok = DoBlasInternalImpl(
         AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */,
-        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
+        true /* = err_on_failure */, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
         b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
         batch_count, compute_type, algo);
@@ -2406,33 +2408,25 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     int lda, int64 stride_a, const DeviceMemory &b, int ldb,
     int64 stride_b, float beta, DeviceMemory *c, int ldc,
     int64 stride_c, int batch_count) {
-  bool use_tensor_ops = false;
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9010
   int cc_major, cc_minor;
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
-          &cc_major, &cc_minor)) {
-    // GPUs < sm_70 don't support tensor ops.
-    if (cc_major >= 7) {
-      use_tensor_ops = true;
+          &cc_major, &cc_minor) &&
+      cc_major >= 5) {
+    cublasGemmAlgo_t algo =
+        (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
+    bool ok = DoBlasInternalImpl(
+        AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
+        true /* = pointer_mode_host */, true /* = err_on_failure */,
+        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
+        GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb,
+        stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
+        batch_count, CUDA_R_32F, algo);
+    if (ok) {
+      return true;
     }
-#if CUDA_VERSION >= 9010
-    if (cc_major >= 5) {
-      cublasGemmAlgo_t algo =
-          (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
-      bool ok = DoBlasInternalImpl(
-          AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
-          true /* = pointer_mode_host */, true /* = err_on_failure */,
-          use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
-          m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
-          GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
-          CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
-      if (ok) {
-        return true;
-      }
-      LOG(ERROR) << "failed BLAS call, see log for details";
-      return false;
-    }
-#endif
+    LOG(ERROR) << "failed BLAS call, see log for details";
+    return false;
   }
 #endif
   // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
@@ -2445,7 +2439,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
         reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
     bool ok = DoBlasInternalImpl(
         cublasSgemmEx, stream, true /* = pointer_mode_host */,
-        true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
+        true /* = err_on_failure= */, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
         lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
         SE_CUDA_DATA_HALF, ldc);
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 817bdb72777..556456c83db 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport {
   template 
   bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                           bool pointer_mode_host, bool err_on_failure,
-                          bool use_tensor_op_math, Args... args);
+                          Args... args);
 
   // Convenience functions that call DoBlasInternalImpl with different values
   // for err_on_failure.
@@ -91,8 +91,7 @@ class CUDABlas : public blas::BlasSupport {
   bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host,
                       Args... args) {
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/true, /*use_tensor_ops=*/false,
-                              args...);
+                              /*err_on_failure=*/true, args...);
   }
   template 
   bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,
@@ -100,8 +99,7 @@ class CUDABlas : public blas::BlasSupport {
     // Tensor ops are hard-coded off in this path, but can still be enabled with
     // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/false,
-                              /*use_tensor_ops=*/false, args...);
+                              /*err_on_failure=*/false, args...);
   }
 
   // A helper function to implement DoBlasGemmBatched interfaces for generic
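
Net effect of this patch on fp32 GEMMs: on cuBLAS 11+ the handle is initialized to CUBLAS_TF32_TENSOR_OP_MATH and DoBlasInternalImpl drops back to CUBLAS_DEFAULT_MATH whenever TF32 execution is disallowed; on older cuBLAS there is no TF32 mode, so the flag changes nothing. A tiny sketch of that mapping (illustrative only; the enum stands in for the real cublasMath_t constants):

```cpp
enum class MathMode { kDefault, kTf32TensorOp };

// Math mode governing an fp32 GEMM on cuBLAS 11+ under this change.
MathMode Fp32GemmMathMode(bool tf32_allowed) {
  return tf32_allowed ? MathMode::kTf32TensorOp : MathMode::kDefault;
}
```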

From 7791e36a57b3d67e625e298a7e8a1063a26571fd Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Mon, 8 Jun 2020 11:21:49 -0500
Subject: [PATCH 0667/1390] Address review comments

---
 tensorflow/core/platform/tf32_utils.cc | 10 ++++++----
 tensorflow/core/platform/tf32_utils.h  |  2 +-
 tensorflow/python/framework/config.py  |  7 +++++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc
index 715b5996dc3..4456e768c0a 100644
--- a/tensorflow/core/platform/tf32_utils.cc
+++ b/tensorflow/core/platform/tf32_utils.cc
@@ -14,14 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/tf32_utils.h"
+#include <atomic>
 
 namespace tensorflow {
 
-// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing.
-static bool tf32_enabled = false;
+// Whether TensorFloat-32 should be used where supported.
+// TODO(nluehr): Maybe enable by default after TF32 Ampere testing.
+static std::atomic<bool> tf32_allowed{false};
 
-void allow_tf32_execution(bool allow) { tf32_enabled = allow; }
+void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
 
-bool tf32_execution_allowed() { return tf32_enabled; }
+bool tf32_execution_allowed() { return tf32_allowed; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h
index a0ce58f9bbd..7a158d00ad3 100644
--- a/tensorflow/core/platform/tf32_utils.h
+++ b/tensorflow/core/platform/tf32_utils.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-void allow_tf32_execution(bool allow);
+void allow_tf32_execution(bool allowed);
 
 bool tf32_execution_allowed();
 
diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index cb95965dfb2..e80ad1d72c4 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -23,6 +23,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
+
+# No tf_export until TF is built against CUDA11 which is required for TF32.
 def tensor_float32_execution_allowed():
   """Get if TensorFloat-32 operations are enabled on supported hardware.
 
@@ -31,7 +33,8 @@ def tensor_float32_execution_allowed():
   """
   return _pywrap_tf32_execution.is_allowed()
 
-def allow_tensor_float_32_execution(allow):
+# No tf_export until TF is built against CUDA11 which is required for TF32.
+def allow_tensor_float_32_execution(allowed):
   """Allow use of TensorFloat-32 with float32 ops on supported hardware.
 
   TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
@@ -47,7 +50,7 @@ def allow_tensor_float_32_execution(allow):
   Args:
     allow: whether to allow TensorFloat-32 execution
   """
-  _pywrap_tf32_execution.allow(allow)
+  _pywrap_tf32_execution.allow(allowed)
 
 @tf_export('config.threading.get_intra_op_parallelism_threads')
 def get_intra_op_parallelism_threads():

From 6ff37474dd16d77f65d510a9f3bf0f56d08122b9 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Tue, 9 Jun 2020 15:35:47 -0500
Subject: [PATCH 0668/1390] Use CUDNN_TENSOR_OP_MATH to enable tensor cores.

---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 820d39c7201..166fa0e32d0 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -721,7 +721,7 @@ class CudnnConvolutionDescriptor {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
 #if CUDNN_VERSION >= 8000
-        (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH);
+        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH);
 #else
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
 #endif
@@ -1177,8 +1177,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
 
     cudnnMathType_t math_type;
     if (use_tensor_ops) {
-      math_type =
-          CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH;
+      math_type = CUDNN_TENSOR_OP_MATH;
     } else {
       math_type = CUDNN_VERSION >= 8000 ? CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH;
     }
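
After this fix, the math-type selection for both convolution and RNN descriptors converges on a single rule: tensor ops always request CUDNN_TENSOR_OP_MATH, while the non-tensor-op case must explicitly ask for CUDNN_FMA_MATH on cuDNN 8, where the default mode may otherwise permit TF32. A short sketch of that rule (illustrative only; assumes the cuDNN header is on the include path):

```cpp
#include <cudnn.h>

cudnnMathType_t ChooseMathType(bool use_tensor_ops) {
  if (use_tensor_ops) {
    return CUDNN_TENSOR_OP_MATH;  // tensor cores on every supported cuDNN
  }
#if CUDNN_VERSION >= 8000
  return CUDNN_FMA_MATH;  // cuDNN 8: rule out TF32 explicitly
#else
  return CUDNN_DEFAULT_MATH;
#endif
}
```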

From 8384080492a80b486fd000e3fbd6626cae0fb2cc Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Thu, 11 Jun 2020 15:35:04 -0500
Subject: [PATCH 0669/1390] Make python names consistent

---
 tensorflow/python/framework/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index e80ad1d72c4..c19e514a932 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -34,7 +34,7 @@ def tensor_float32_execution_allowed():
   return _pywrap_tf32_execution.is_allowed()
 
 # No tf_export until TF is built against CUDA11 which is required for TF32.
-def allow_tensor_float_32_execution(allowed):
+def allow_tensor_float32_execution(allowed):
   """Allow use of TensorFloat-32 with float32 ops on supported hardware.
 
   TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.

From 5fe636fb4a74b58bd7da6de9ae1f286ad395c272 Mon Sep 17 00:00:00 2001
From: Reed 
Date: Thu, 11 Jun 2020 17:16:28 -0700
Subject: [PATCH 0670/1390] Use float_32 instead of float32 in function names

---
 tensorflow/python/framework/config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index c19e514a932..767a1a99f4f 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -25,7 +25,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # No tf_export until TF is built against CUDA11 which is required for TF32.
-def tensor_float32_execution_allowed():
+def tensor_float_32_execution_allowed():
   """Get if TensorFloat-32 operations are enabled on supported hardware.
 
   Returns:
@@ -34,7 +34,7 @@ def tensor_float32_execution_allowed():
   return _pywrap_tf32_execution.is_allowed()
 
 # No tf_export until TF is built against CUDA11 which is required for TF32.
-def allow_tensor_float32_execution(allowed):
+def allow_tensor_float_32_execution(allowed):
   """Allow use of TensorFloat-32 with float32 ops on supported hardware.
 
   TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.

From 434f49d54fc8165c2f019bd114272ce2200202da Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Fri, 19 Jun 2020 15:44:04 -0500
Subject: [PATCH 0671/1390] Rework gemm TF32

---
 tensorflow/stream_executor/cuda/cuda_blas.cc | 142 +++++++++++++++----
 1 file changed, 113 insertions(+), 29 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index e2cbb0b75df..e9e3635d8c1 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -226,18 +226,6 @@ bool CUDABlas::Init() {
     return false;
   }
 
-#if CUDA_VERSION >= 9000
-#if CUBLAS_VER_MAJOR >= 11
-  ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
-#else
-  ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
-#endif
-  if (ret != CUBLAS_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
-    return false;
-  }
-#endif
-
   return true;
 }
 
@@ -1634,6 +1622,13 @@ bool CUDABlas::DoBlasGemm(
     }
   }
 
+#if CUDA_VERSION < 11000
+  ScopedCublasMathMode math_mode{blas_};
+  if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
+    return false;
+  }
+#endif
+
   return DoBlasInternalImpl(
       cublasSgemmEx, stream, true /* = pointer_mode_host */,
       true /* = err_on_failure= */, CUDABlasTranspose(transa),
@@ -1681,6 +1676,16 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                       "precondition violation";
     }
   }
+
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
+
   return DoBlasInternal(cublasSgemm, stream, true /* = pointer_mode_host */,
                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                         n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
@@ -1707,6 +1712,16 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                           DeviceMemory> *c, int ldc) {
   auto cb_alpha = GpuComplexValue(alpha);
   auto cb_beta = GpuComplexValue(beta);
+
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
+
   return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
                         n, k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)),
@@ -1903,20 +1918,6 @@ static bool UsesTensorOps(blas::AlgorithmType algo) {
 #endif
 }
 
-template 
-static bool TensorOpsAvailable(int cc_major) {
-#if CUDA_VERSION >= 9000
-  // cublas *does* allow tensor ops on inputs that are not fp16, so this is not
-  // strictly correct.  We can't simply enable it, though, as that would change
-  // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
-  // to be rounded to fp16.
-  if (cc_major >= 7 && std::is_same::value) {
-    return true;
-  }
-#endif
-  return false;
-}
-
 template 
 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
@@ -1935,17 +1936,52 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  if (UsesTensorOps(algorithm) && !TensorOpsAvailable(cc_major)) {
-    if (std::is_same::value) {
+  bool algo_uses_tensor_ops = UsesTensorOps(algorithm);
+  cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
+  if (algo_uses_tensor_ops) {
+    if (cc_major < 7) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
               << algorithm
               << " uses tensor ops, but tensor ops are not available in sm"
               << cc_major << "X devices.";
+      return false;
+    } else if (std::is_same::value) {
+#if CUDA_VERSION < 11000
+      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
+              << algorithm
+              << " uses tensor ops, but tensor ops are not available for fp32"
+              << " inputs.";
+      return false;
+#else
+      if (cc_major < 8) {
+        VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
+                << algorithm
+                << " uses tensor ops, but tensor ops are not available in sm"
+                << cc_major << "X devices for float input types.";
+        return false;
+      } else if (!tensorflow::tf32_execution_allowed()) {
+        VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
+                << algorithm
+                << " uses tensor ops, but tensor ops are disabled for fp32"
+                << " inputs.";
+        return false;
+      }
+      math_type = CUBLAS_TF32_TENSOR_OP_MATH;
+#endif
+    } else if (std::is_same::value) {
+#if CUDA_VERSION < 11000
+      math_type = CUBLAS_TENSOR_OP_MATH;
+#endif
     } else {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
               << algorithm
-              << " uses tensor ops, but the input data type is not fp16.";
+              << " uses tensor ops, which are not supported for InT.";
+      return false;
     }
+  }
+
+  ScopedCublasMathMode math_mode{blas_};
+  if (!math_mode.Init(math_type)) {
     return false;
   }
 
@@ -2325,6 +2361,13 @@ bool CUDABlas::DoBlasGemmBatched(
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   // Note: The func passed here (cublasSgemmBatched) is not actually called,
   // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
+#if CUDA_VERSION < 11000
+  ScopedCublasMathMode math_mode{blas_};
+  if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
+    return false;
+  }
+#endif
+
   port::Status status = DoBlasGemmBatchedInternal(
       cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
@@ -2341,6 +2384,15 @@ bool CUDABlas::DoBlasGemmBatched(
     const port::ArraySlice *> &b_array, int ldb, float beta,
     const port::ArraySlice *> &c_array, int ldc,
     int batch_count, ScratchAllocator *scratch_allocator) {
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
+
   port::Status status = DoBlasGemmBatchedInternal(
       cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
@@ -2375,6 +2427,15 @@ bool CUDABlas::DoBlasGemmBatched(
     int ldb, std::complex beta,
     const port::ArraySlice> *> &c_array,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
+
   port::Status status = DoBlasGemmBatchedInternal(
       cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
       b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
@@ -2408,6 +2469,13 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     int lda, int64 stride_a, const DeviceMemory &b, int ldb,
     int64 stride_b, float beta, DeviceMemory *c, int ldc,
     int64 stride_c, int batch_count) {
+#if CUDA_VERSION < 11000
+  ScopedCublasMathMode math_mode{blas_};
+  if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
+    return false;
+  }
+#endif
+
 #if CUDA_VERSION >= 9010
   int cc_major, cc_minor;
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
@@ -2457,6 +2525,14 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     int64 stride_a, const DeviceMemory &b, int ldb, int64 stride_b,
     float beta, DeviceMemory *c, int ldc, int64 stride_c,
     int batch_count) {
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
   return DoBlasInternal(
       cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
@@ -2484,6 +2560,14 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     const DeviceMemory> &b, int ldb, int64 stride_b,
     std::complex beta, DeviceMemory> *c, int ldc,
     int64 stride_c, int batch_count) {
+#if CUBLAS_VER_MAJOR >= 11
+  ScopedCublasMathMode math_mode{blas_};
+  if (tensorflow::tf32_execution_allowed()) {
+    if (!math_mode.Init(CUBLAS_TF32_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
   auto cb_alpha = GpuComplexValue(alpha);
   auto cb_beta = GpuComplexValue(beta);
   return DoBlasInternal(

From 90443c2b17a321f3589039dd560773b8d47a5cb7 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar 
Date: Fri, 19 Jun 2020 15:15:51 -0700
Subject: [PATCH 0672/1390] Enable test for generating tf_ops

PiperOrigin-RevId: 317389351
Change-Id: I54f6ac53a974cf603ed5fe1a30b6fbb464c80d28
---
 tensorflow/compiler/mlir/tensorflow/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index bcabb13d301..11a455eec72 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -57,6 +57,7 @@ gentbl(
     td_srcs = [
         ":tensorflow_ops_td_files",
     ],
+    test = True,
 )
 
 gentbl(
@@ -88,6 +89,7 @@ gentbl(
     td_srcs = [
         ":tensorflow_ops_td_files",
     ],
+    test = True,
 )
 
 gentbl(
@@ -112,6 +114,7 @@ gentbl(
         "@llvm-project//mlir:include/mlir/IR/OpBase.td",
         "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td",
     ],
+    test = True,
 )
 
 gentbl(
@@ -137,6 +140,7 @@ gentbl(
         "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td",
         "@llvm-project//mlir:include/mlir/IR/OpBase.td",
     ],
+    test = True,
 )
 
 gentbl(
@@ -161,6 +165,7 @@ gentbl(
         "@llvm-project//mlir:include/mlir/IR/OpBase.td",
         "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td",
     ],
+    test = True,
 )
 
 gentbl(

From fb5a3c3c6a4894a15bcc88ba9655c6afff8ff3f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 15:30:49 -0700
Subject: [PATCH 0673/1390] Fix issues caught by
 -Wtautological-constant-compare

This fixes a check that always evaluates to true.  The check expression is a
conditional operator that returns 1 or 2, both of which convert to true.  This
fix puts in the enum value that causes the check to pass.
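
For illustration, a minimal standalone sketch (not part of the patch; `assert`
stands in for the CHECK macro) of why the old expression can never fail:

    #include <cassert>

    // `cond ? 1 : 2` evaluates to 1 or 2; both are nonzero, so the assertion
    // passes no matter what `cond` is.
    void Demo(bool cond) {
      assert(cond ? 1 : 2);   // tautological: always true
      assert(cond);           // meaningful: fails when cond is false
    }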

PiperOrigin-RevId: 317391929
Change-Id: I12f3f9b06f494b4bcf0e4ff194d4dc4edafd52e2
---
 tensorflow/core/kernels/hexagon/graph_transferer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index 7f15f3ab20d..9d6d0563b5f 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -667,7 +667,7 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
       << "Op " << node.type_string() << " not found in map(id = " << op_type_id
       << ")";
   // Safety check of padding id
-  CHECK(padding == Padding::VALID ? 1 : 2);
+  CHECK(padding == Padding::SAME);
   AppendNodeParamsWithIoParams(
       shape_refiner, node, node.name(), id, node.type_string(), op_type_id,
       static_cast(padding), node.num_inputs(), extra_inputs,

From ca7c0657c2e22343810957951cfb27aa5bf4c165 Mon Sep 17 00:00:00 2001
From: Yasir Modak <42785357+ymodak@users.noreply.github.com>
Date: Fri, 19 Jun 2020 15:43:04 -0700
Subject: [PATCH 0674/1390] update arg doc for sigmoid_cross_entropy

fixes #40593
---
 tensorflow/python/ops/losses/losses_impl.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 6a7b4b68420..2887b3d78f9 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -676,8 +676,9 @@ def sigmoid_cross_entropy(
       `{0, 1}`.
     logits: Float `[batch_size, num_classes]` logits outputs of the network.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
-      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
-      be either `1`, or the same as the corresponding `losses` dimension).
+      `multi_class_labels`, and must be broadcastable to `multi_class_labels`
+      (i.e., all dimensions must be either `1`, or the same as the
+      corresponding `losses` dimension).
     label_smoothing: If greater than `0` then smooth the labels.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.

From 18c43b7bd488f0216ce11127158df7084e26cdb4 Mon Sep 17 00:00:00 2001
From: Haoyu Zhang 
Date: Fri, 19 Jun 2020 15:43:22 -0700
Subject: [PATCH 0675/1390] Clear cancel callback when gRPC eager call returns
 with state.

PiperOrigin-RevId: 317393892
Change-Id: Ife800821494dd4cc2992eec9a5470d989596a6d7
---
 .../core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index 924112e0d96..1d65f945f27 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -81,6 +81,7 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
       local_impl_.RunComponentFunction(call_opts.get(), &call->request,
                                        &call->response,
                                        [call, call_opts](const Status& s) {
+                                         call->ClearCancelCallback();
                                          call->SendResponse(ToGrpcStatus(s));
                                        });
     });

From e3809400b295e8a66fd8e2b20a348356c78a0435 Mon Sep 17 00:00:00 2001
From: Kuangyuan Chen 
Date: Fri, 19 Jun 2020 15:51:19 -0700
Subject: [PATCH 0676/1390] Import initialization graph in SignatureDef
 SavedModels as an MLIR function in TF saved model dialect.

- Mark the init function referenced by SessionInitializerOp as an exported
  function with the reserved name "__tf_saved_model_session_initializer".
- Remove variable init logic in the init function if it is already imported as
  a global tensor.
- Add a canonicalizer to SessionInitializerOp to remove empty init functions.
- Return an error on SessionInitializerOp in the IREE compiler pipeline.
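
As a rough sketch (not part of the patch) of how a pipeline might use the new
GetSessionInitializerOp helper to reject modules that still carry an
initializer, assuming the headers touched by this change are included; the
function name below is hypothetical:

    // Hypothetical check mirroring the IREE-pipeline behavior described above.
    mlir::LogicalResult RejectSessionInitializer(mlir::ModuleOp module) {
      auto init_op = mlir::tf_saved_model::GetSessionInitializerOp(module);
      if (init_op)
        return init_op.emitOpError()
               << "session initializer is not supported in this pipeline";
      return mlir::success();
    }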

PiperOrigin-RevId: 317395165
Change-Id: Idb7e54cac08add9fc8f2ccfbaf341135fdf59e3b
---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   4 +-
 .../mlir/tensorflow/ir/tf_saved_model.cc      |  75 +++++-
 .../mlir/tensorflow/ir/tf_saved_model.h       |   4 +
 .../mlir/tensorflow/ir/tf_saved_model_ops.td  |  26 ++
 .../tests/tf_saved_model/common_v1.py         |  12 +-
 .../tests/tf_saved_model/hash_table_v1.py     |  92 +++++++
 .../tf_saved_model/remove_init_variable_v1.py |  74 ++++++
 .../tensorflow/tests/tf_saved_model_ops.mlir  |  16 ++
 .../tests/tf_saved_model_ops_invalid.mlir     |  68 +++++
 .../mlir/tensorflow/translate/import_model.cc | 235 ++++++++++++++----
 10 files changed, 549 insertions(+), 57 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
 create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 11a455eec72..b159815d5eb 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -666,7 +666,9 @@ cc_library(
         ":tensorflow_types",
         ":translate_utils",
         "//tensorflow/cc/saved_model:bundle_v2",
+        "//tensorflow/cc/saved_model:constants",
         "//tensorflow/cc/saved_model:loader_lite",
+        "//tensorflow/cc/saved_model:loader_util",
         "//tensorflow/compiler/jit:shape_inference_helpers",
         "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "//tensorflow/compiler/tf2xla:functionalize_control_flow",
@@ -678,6 +680,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/utils:transitive_fanin",
+        "//tensorflow/core/platform:protobuf_internal",
         "//tensorflow/core/platform:types",
         "//tensorflow/stream_executor/lib",
         "@com_google_absl//absl/algorithm:container",
@@ -687,7 +690,6 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:StandardOps",
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index 7db0eed7713..ef55761686e 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/IR/Identifier.h"  // from @llvm-project
 #include "mlir/IR/Module.h"  // from @llvm-project
 #include "mlir/IR/OpImplementation.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/StandardTypes.h"  // from @llvm-project
 #include "mlir/IR/SymbolTable.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
@@ -76,6 +77,23 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) {
   return success();
 }
 
+static LogicalResult Verify(SessionInitializerOp session_initializer) {
+  mlir::SymbolTable symbol_table(
+      session_initializer.getParentOfType());
+
+  auto init_func_op =
+      symbol_table.lookup(session_initializer.initializer());
+  if (!init_func_op)
+    return session_initializer.emitOpError()
+           << "the initializer function does not exist";
+
+  if (!init_func_op.getType().getResults().empty())
+    return session_initializer.emitOpError()
+           << "the initializer function should have no output";
+
+  return success();
+}
+
 #define GET_OP_CLASSES
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc"
 
@@ -214,12 +232,12 @@ static LogicalResult VerifySavedModelModule(
   for (auto func : module.getOps()) {
     const bool is_exported = IsExported(func);
 
-    if (is_exported && !func.isPublic()) {
+    if (is_exported && func.getVisibility() != FuncOp::Visibility::Public) {
       return func.emitError()
              << "exported function @" << func.getName() << " should be public";
     }
 
-    if (!is_exported && func.isPublic()) {
+    if (!is_exported && func.getVisibility() == FuncOp::Visibility::Public) {
       return func.emitError() << "non-exported function @" << func.getName()
                               << " should be private";
     }
@@ -232,6 +250,19 @@ static LogicalResult VerifySavedModelModule(
       }
     }
   }
+
+  auto session_initializers = module.getOps();
+  if (!session_initializers.empty() &&
+      !llvm::hasSingleElement(session_initializers)) {
+    return (*++session_initializers.begin()).emitError()
+           << "there must be no more than one session_initializer op";
+  }
+
+  auto is_init = [&session_initializers](mlir::FuncOp func) {
+    if (session_initializers.empty()) return false;
+    return (*session_initializers.begin()).initializer() == func.getName();
+  };
+
   SymbolTable symbol_table(module);
   auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion());
   if (!symbol_uses.hasValue()) {
@@ -242,6 +273,12 @@ static LogicalResult VerifySavedModelModule(
     auto func = symbol_table.lookup(
         symbol_use.getSymbolRef().cast().getValue());
     if (func && IsExported(func)) {
+      // If it is an init function, then it can be used by the unique
+      // session_initializer op.
+      if (is_init(func) &&
+          llvm::isa(symbol_use.getUser()))
+        continue;
+
       return symbol_use.getUser()
           ->emitError("exported function cannot be internally referenced")
           .attachNote(func.getLoc())
@@ -361,5 +398,39 @@ GlobalTensorOp LookupBoundInput(FuncOp func, int arg_index,
   return symbol_table.lookup(attr.getValue());
 }
 
+SessionInitializerOp GetSessionInitializerOp(mlir::ModuleOp op) {
+  auto initializers = op.getOps();
+  if (initializers.empty()) return {};
+  return *initializers.begin();
+}
+
+class OptimizeSessionInitializerPattern
+    : public OpRewritePattern {
+ public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(SessionInitializerOp op,
+                                PatternRewriter &rewriter) const override {
+    SymbolTable symbol_table(op.getParentOfType());
+    auto init_func_op = symbol_table.lookup(op.initializer());
+
+    // The init function can only be referenced from the SessionInitializerOp.
+    // And there is at most one SessionInitializerOp in the module. So both ops
+    // have no other uses and can be simply erased.
+    if (init_func_op.front().begin()->isKnownTerminator()) {
+      rewriter.eraseOp(init_func_op);
+      rewriter.eraseOp(op);
+      return success();
+    }
+
+    return failure();
+  }
+};
+
+void SessionInitializerOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert(context);
+}
+
 }  // namespace tf_saved_model
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
index 47ebb1a1be5..b6f8753cc51 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
@@ -61,6 +61,10 @@ GlobalTensorOp LookupBoundInput(FuncOp func, int arg_index,
 // should have.
 Type GetBoundInputArgTypeFor(GlobalTensorOp global_tensor);
 
+// Returns the session initializer of this module if it exists. Returns null
+// otherwise.
+SessionInitializerOp GetSessionInitializerOp(mlir::ModuleOp op);
+
 }  // namespace tf_saved_model
 }  // namespace mlir
 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
index 4431a160edf..dc1210a4d2a 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td
@@ -128,4 +128,30 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> {
   let verifier = [{ return Verify(*this); }];
 }
 
+def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> {
+  let summary = "Initializes TensorFlow session state.";
+  let description = [{
+    The session initializer op marks a function that must be called by an
+    external agent exactly once to initialize TensorFlow session state, and this
+    must happen before any other exported functions are called. There must be no
+    more than one session initializer in a saved model.
+
+    The `initializer` represents the initialization function. The function must
+    have no output and should only be called once.
+
+    This is used, for example, to initialize hash tables stored in resources and
+    accessed by resource name (rather than as resource handles or bound inputs
+    which is how `global_tensor`s are referenced).
+  }];
+
+  let arguments = (ins
+    FlatSymbolRefAttr:$initializer
+  );
+
+
+  let verifier = [{ return Verify(*this); }];
+
+  let hasCanonicalizer = 1;
+}
+
 #endif // SAVED_MODEL_DIALECT
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
index 7171f63bb05..5bfcfa5378a 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py
@@ -46,7 +46,10 @@ def set_tf_options():
 # This function needs to take a "create_module_fn", as opposed to just the
 # module itself, because the creation of the module has to be delayed until
 # after absl and tensorflow have run various initialization steps.
-def do_test(signature_def_map, show_debug_info=False):
+def do_test(signature_def_map,
+            init_op=None,
+            canonicalize=False,
+            show_debug_info=False):
   """Runs test.
 
   1. Performs absl and tf "main"-like initialization that must run before almost
@@ -61,6 +64,9 @@ def do_test(signature_def_map, show_debug_info=False):
   Args:
     signature_def_map: A map from string key to signature_def. The key will be
       used as function name in the resulting MLIR.
+    init_op: The initializer op for the saved model. If set, it will generate an
+      initializer graph in the resulting MLIR.
+    canonicalize: If true, canonicalizer will be run on the resulting MLIR.
     show_debug_info: If true, shows debug locations in the resulting MLIR.
   """
 
@@ -84,6 +90,7 @@ def do_test(signature_def_map, show_debug_info=False):
     builder.add_meta_graph_and_variables(
         sess, [tf.saved_model.tag_constants.SERVING],
         signature_def_map,
+        main_op=init_op,
         strip_default_attrs=True)
     builder.save()
 
@@ -97,6 +104,9 @@ def do_test(signature_def_map, show_debug_info=False):
     mlir = pywrap_mlir.experimental_run_pass_pipeline(mlir,
                                                       'tf-standard-pipeline',
                                                       show_debug_info)
+    if canonicalize:
+      mlir = pywrap_mlir.experimental_run_pass_pipeline(mlir, 'canonicalize',
+                                                        show_debug_info)
     print(mlir)
 
   app.run(app_main)
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
new file mode 100644
index 00000000000..16290455608
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py
@@ -0,0 +1,92 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# RUN: %p/hash_table_v1 | FileCheck %s
+
+# pylint: disable=missing-docstring,line-too-long
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1
+
+# Verify that the tf.versions attribute exists. It is difficult to enforce
+# contents, since the version numbers change over time. The conversion logic
+# itself is verified in the common graphdef converter, so here just assert
+# it is being invoked.
+# CHECK: module
+# CHECK-SAME: tf.versions
+# CHECK-SAME: bad_consumers
+# CHECK-SAME: min_consumer
+# CHECK-SAME: producer
+
+# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> ()
+# CHECK: "tf_saved_model.global_tensor"()
+
+# CHECK:      func [[init]]
+# CHECK-NEXT: [[R5:%.*]] = "tf.Const"()
+# CHECK-NEXT: [[R6:%.*]] = "tf.Const"()
+# CHECK-NEXT: [[R7:%.*]] = "tf.HashTableV2"()
+# CHECK-SAME: shared_name = "[[hash_table:.*]]"
+# CHECK-NEXT: "tf.LookupTableImportV2"([[R7]], [[R5]], [[R6]])
+
+# CHECK:      func {{@[a-zA-Z_0-9]+}}(
+# CHECK-SAME: [[ARG0:%.*]]: tensor
+# CHECK-SAME: [[ARG1:%.*]]: tensor, value = {{.*}} : tensor<1x3xf32>} : () -> ()
+# CHECK-NOT: session_initializer
+
+# CHECK:      func {{@[a-zA-Z_0-9]+}}(
+# CHECK-SAME:   [[ARG0:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]},
+# CHECK-SAME:   [[ARG1:%.*]]: tensor>> {tf_saved_model.bound_input = @[[VAR]]})
+# CHECK-SAME:             -> (tensor<3x3xf32> {tf_saved_model.index_path = ["r"]})
+# CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"]
+
+# CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32>
+# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32>
+# CHECK-NEXT: return [[R1]] : tensor<3x3xf32>
+
+
+def Test():
+
+  x = tf.constant([[1.0], [1.0], [1.0]])
+  y = tf.compat.v1.get_variable(
+      name='y',
+      shape=(1, 3),
+      initializer=tf.random_normal_initializer(),
+      trainable=True)
+  r = tf.matmul(x, y)
+
+  tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x)
+  tensor_info_r = tf.compat.v1.saved_model.utils.build_tensor_info(r)
+
+  return {
+      'key': (tf.compat.v1.saved_model.signature_def_utils.build_signature_def(
+          inputs={'x': tensor_info_x},
+          outputs={'r': tensor_info_r},
+          method_name='some_function'))
+  }
+
+
+if __name__ == '__main__':
+  common_v1.set_tf_options()
+  common_v1.do_test(
+      Test(), tf.initializers.global_variables(), canonicalize=True)
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
index e2dc5785cf4..26cdf025a10 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
@@ -2,6 +2,11 @@
 
 module attributes {tf_saved_model.semantics} {
 
+  // CHECK: tf_saved_model.session_initializer
+  "tf_saved_model.session_initializer"() {
+    initializer = @init
+  } : () -> ()
+
   // Representation for constants: (immutable) global tensor.
   // CHECK: tf_saved_model.global_tensor
   "tf_saved_model.global_tensor"() {
@@ -39,6 +44,17 @@ module attributes {tf_saved_model.semantics} {
     return
   }
 
+  // Representation for init functions
+  // CHECK: func @init
+  // CHECK-SAME: exported_names = ["__tf_saved_model_session_initializer"]
+  func @init(
+    %arg1: tensor>> {tf_saved_model.bound_input = @some_constant}
+  ) attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"]}
+  {
+    "tf.some_call"(%arg1) : (tensor>>) -> ()
+    return
+  }
+
 }
 
 // -----
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
index 7287fcf66c8..260174b184f 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
@@ -261,6 +261,39 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
+module attributes {tf_saved_model.semantics} {
+
+  // expected-error@+1 {{the initializer function does not exist}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+}
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
+  // expected-error@+1 {{the initializer function should have no output}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} {
+    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
+    return %0 : tensor<1xf32>
+  }
+}
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  // expected-error@+1 {{there must be no more than one session_initializer op}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} {
+    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
+    return %0 : tensor<1xf32>
+  }
+}
+
+// -----
+
 module attributes {tf_saved_model.semantics} {
 
   // expected-error@+1 {{exported function @f should be public}}
@@ -284,3 +317,38 @@ module attributes {tf_saved_model.semantics} {
   }
 
 }
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
+  // expected-error@+1 {{the initializer function does not exist}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+}
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
+  // expected-error@+1 {{the initializer function should have no output}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  func @init() -> (tensor<1xf32> {tf_saved_model.index_path = ["output"]})
+    attributes { tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"] } {
+    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
+    return %0 : tensor<1xf32>
+  }
+}
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  // expected-error@+1 {{there must be no more than one session_initializer op}}
+  "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+  func @init() -> (tensor<1xf32> {tf_saved_model.index_path = ["output"]})
+    attributes { tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"] } {
+    %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32>
+    return %0 : tensor<1xf32>
+  }
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
index 820d0ce31fb..fea809c0798 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc
@@ -60,6 +60,8 @@ limitations under the License.
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Verifier.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "tensorflow/cc/saved_model/constants.h"
+#include "tensorflow/cc/saved_model/loader_util.h"
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"
@@ -116,6 +118,7 @@ using mlir::NamedAttrList;
 using mlir::TensorType;
 using mlir::TF::VarHandleOp;
 using mlir::tf_saved_model::GlobalTensorOp;
+using mlir::tf_saved_model::SessionInitializerOp;
 using stream_executor::port::StatusOr;
 
 namespace {
@@ -2955,6 +2958,13 @@ void SortSavedModelModule(mlir::ModuleOp module) {
     named_global_tensor.global_tensor.getOperation()->moveBefore(
         &module.getBody()->front());
   }
+
+  auto initializers = module.getOps();
+  if (!initializers.empty()) {
+    (*initializers.begin())
+        .getOperation()
+        ->moveBefore(&module.getBody()->front());
+  }
 }
 
 Status CreateSavedModelIR(
@@ -3241,17 +3251,32 @@ class SavedModelSignatureDefImporter {
                                  absl::Span exported_names,
                                  mlir::MLIRContext* context)
       : bundle_(bundle),
+        flib_def_(OpRegistry::Global(), graph_def().library()),
+        debug_info_(),
         exported_names_(exported_names),
-        module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {}
+        module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {
+    // debug_info might not be loaded with loader_lite.
+    if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info;
+  }
 
   // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function
   // for each signature.
   StatusOr ConvertSignatures();
-  Status ConvertSignature(const GraphDef& graphdef,
-                          const std::string& sig_def_key,
-                          const SignatureDef& signature_def,
-                          const GraphDebugInfo& debug_info,
-                          const FunctionLibraryDefinition& flib_def);
+  Status ConvertSignature(const std::string& sig_def_key,
+                          const SignatureDef& signature_def);
+
+  // Converts the initialization graph in the SavedModel to an MLIR function.
+  Status ConvertInitializer();
+
+  // Converts a graph with feeds and fetches to an MLIR function.
+  StatusOr ConvertGraph(
+      const std::string& name,
+      const std::vector>& inputs,
+      const std::vector>& outputs,
+      const std::vector control_outputs);
+
+  // Coarsens the islands in `module_`.
+  Status CoarsenIslands();
 
   // Creates GlobalTensorOp for each variable and moves each VarHandle op to
   // the enclosing function's arguments.
@@ -3262,6 +3287,10 @@ class SavedModelSignatureDefImporter {
   // tensor's shape is used to provide the most accurate nested shape.
   void LiftVariable(VarHandleOp op, GlobalTensorOp global_tensor);
 
+  // Removes the variable and related ops in the init function if it is already
+  // imported as a global tensor.
+  void RemoveVariable(VarHandleOp op);
+
   using VarGlobalMap = llvm::MapVector<
       llvm::StringRef,
       std::pair>>;
@@ -3273,18 +3302,68 @@ class SavedModelSignatureDefImporter {
   GraphImportConfig::InputArrays ParseInputArrays(
       const std::vector>& inputs);
 
+  const GraphDef& graph_def() const {
+    return bundle_.meta_graph_def.graph_def();
+  }
+  const FunctionLibraryDefinition& flib_def() const { return flib_def_; }
+  const GraphDebugInfo& debug_info() const { return debug_info_; }
+
   const SavedModelBundle& bundle_;
+  FunctionLibraryDefinition flib_def_;
+  GraphDebugInfo debug_info_;
   absl::Span exported_names_;
   mlir::OwningModuleRef module_;
 };
 
+Status SavedModelSignatureDefImporter::ConvertInitializer() {
+  std::vector asset_file_defs;
+  TF_RETURN_IF_ERROR(
+      internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs));
+
+  if (!asset_file_defs.empty())
+    return errors::Unimplemented(
+        absl::StrCat("Assets are not supported in signaturedef importer"));
+
+  std::string init_node_name;
+  TF_RETURN_IF_ERROR(
+      internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name));
+
+  if (init_node_name.empty()) return Status::OK();
+
+  TF_ASSIGN_OR_RETURN(auto sub_module,
+                      ConvertGraph(init_node_name, {}, {}, {init_node_name}));
+
+  mlir::SymbolTable symbol_table(*sub_module);
+
+  auto init_func_op = symbol_table.lookup(init_node_name);
+
+  init_func_op.removeAttr("tf.entry_function");
+
+  mlir::OpBuilder builder(module_->getBodyRegion());
+
+  // Set the exported name of the init function to a reserved name for
+  // tf_saved_model.
+  init_func_op.setAttr(
+      "tf_saved_model.exported_names",
+      builder.getStrArrayAttr({"__tf_saved_model_session_initializer"}));
+
+  builder.create(
+      module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName()));
+
+  // Move the converted functions to top level MLIR module.
+  auto* block = module_->getBody();
+  auto* sub_block = sub_module->getBody();
+  block->getOperations().splice(
+      mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(),
+      sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator()));
+
+  return Status::OK();
+}
+
 StatusOr
 SavedModelSignatureDefImporter::ConvertSignatures() {
   const auto& signatures = bundle_.GetSignatures();
-  const auto& graphdef = bundle_.meta_graph_def.graph_def();
-  PopulateTfVersions(module_.get(), graphdef.versions());
-
-  FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library());
+  PopulateTfVersions(module_.get(), graph_def().versions());
 
   // debug_info might not be loaded with loader_lite.
   GraphDebugInfo debug_info;
@@ -3307,23 +3386,49 @@ SavedModelSignatureDefImporter::ConvertSignatures() {
       continue;
     }
 
-    TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def,
-                                        debug_info, flib_def));
+    TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def));
   }
-  TF_RETURN_IF_ERROR(LiftVariables());
+
+  TF_RETURN_IF_ERROR(ConvertInitializer());
 
   mlir::OpBuilder builder(module_->getBodyRegion());
   module_->setAttr("tf_saved_model.semantics", builder.getUnitAttr());
+
+  TF_RETURN_IF_ERROR(CoarsenIslands());
+  TF_RETURN_IF_ERROR(LiftVariables());
+
   SortSavedModelModule(*module_);
   MarkSavedModelFunctionVisibility(*module_);
 
   return std::move(module_);
 }
 
+StatusOr SavedModelSignatureDefImporter::ConvertGraph(
+    const std::string& name,
+    const std::vector>& inputs,
+    const std::vector>& outputs,
+    const std::vector control_outputs) {
+  GraphImportConfig specs;
+  specs.prune_unused_nodes = true;
+  specs.inputs = ParseInputArrays(inputs);
+  for (auto& output : outputs) specs.outputs.push_back(output.second.name());
+  specs.control_outputs = control_outputs;
+
+  // Convert sub-graphdef to sub-graph.
+  GraphConstructorOptions options;
+  options.allow_internal_ops = true;
+  options.add_default_attributes = true;
+  Graph graph(OpRegistry::Global());
+
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph));
+
+  // Convert sub-graph to MLIR module.
+  return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(),
+                                   flib_def(), specs, name);
+}
+
 Status SavedModelSignatureDefImporter::ConvertSignature(
-    const GraphDef& graphdef, const std::string& sig_def_key,
-    const SignatureDef& signature_def, const GraphDebugInfo& debug_info,
-    const FunctionLibraryDefinition& flib_def) {
+    const std::string& sig_def_key, const SignatureDef& signature_def) {
   // Create local vectors for the input and output and sort them to be
   // deterministic. We don't want anyone to really depend on the order, client
   // should lookup argument/result mapping by attribute name.
@@ -3339,34 +3444,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature(
     return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first;
   });
 
-  GraphImportConfig specs;
-  specs.prune_unused_nodes = true;
-  specs.inputs = ParseInputArrays(inputs);
-  for (auto& output : outputs) specs.outputs.push_back(output.second.name());
-
-  // Remove unused nodes and create sub-graphdef.
-  GraphDef sub_graph_def;
-  TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph(
-      graphdef, &sub_graph_def,
-      /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()}));
-
-  // Set the function library definitions in the pruned graphdef.
-  *sub_graph_def.mutable_library() = flib_def.ToProto();
-
-  // Convert sub-graphdef to sub-graph.
-  GraphConstructorOptions options;
-  options.allow_internal_ops = true;
-  options.add_default_attributes = true;
-  Graph sub_graph(OpRegistry::Global());
-
-  TF_RETURN_IF_ERROR(
-      ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph));
-
   // Convert sub-graph to MLIR module.
-  TF_ASSIGN_OR_RETURN(
-      auto sub_module,
-      GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info,
-                                flib_def, specs, sig_def_key));
+  TF_ASSIGN_OR_RETURN(auto sub_module,
+                      ConvertGraph(sig_def_key, inputs, outputs, {}));
   mlir::OpBuilder builder(sub_module->getBodyRegion());
 
   // Find the FuncOp which corresponds to current SignatureDef.
@@ -3399,16 +3479,28 @@ Status SavedModelSignatureDefImporter::ConvertSignature(
       sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator()));
 
   return Status::OK();
-}
+}  // namespace
 
 Status SavedModelSignatureDefImporter::LiftVariables() {
   VarGlobalMap var_globals;
+  llvm::SmallVector init_vars;
 
-  auto walker = [&var_globals](mlir::Operation* op) {
-    if (auto var_handle_op = llvm::dyn_cast(op))
-      var_globals[var_handle_op.shared_name()].second.push_back(var_handle_op);
-    else if (op->getName().getStringRef() == "tf.VariableV2")
+  auto session_initializer =
+      mlir::tf_saved_model::GetSessionInitializerOp(*module_);
+
+  auto walker = [&var_globals, &init_vars,
+                 &session_initializer](mlir::Operation* op) {
+    if (auto var_handle_op = llvm::dyn_cast(op)) {
+      if (session_initializer &&
+          session_initializer.initializer() ==
+              var_handle_op.getParentOfType().getName())
+        init_vars.push_back(var_handle_op);
+      else
+        var_globals[var_handle_op.shared_name()].second.push_back(
+            var_handle_op);
+    } else if (op->getName().getStringRef() == "tf.VariableV2") {
       return mlir::WalkResult::interrupt();
+    }
     return mlir::WalkResult::advance();
   };
   bool contains_ref_variable = module_->walk(walker).wasInterrupted();
@@ -3425,9 +3517,51 @@ Status SavedModelSignatureDefImporter::LiftVariables() {
     for (VarHandleOp var_handle : it.second.second)
       LiftVariable(var_handle, it.second.first);
 
+  for (auto op : init_vars) RemoveVariable(op);
+
   return Status::OK();
 }
 
+Status SavedModelSignatureDefImporter::CoarsenIslands() {
+  mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext());
+
+  mlir::PassManager pm(module_->getContext());
+  pm.addNestedPass(
+      mlir::tf_executor::CreateTFExecutorIslandCoarseningPass());
+  if (mlir::failed(pm.run(*module_)))
+    return diag_handler.Combine(
+        errors::Internal("failed to coarsening islands."));
+
+  return Status::OK();
+}
+
+void SavedModelSignatureDefImporter::RemoveVariable(VarHandleOp op) {
+  llvm::SmallVector work_list;
+  work_list.push_back(op);
+  while (!work_list.empty()) {
+    auto* op = work_list.back();
+    work_list.pop_back();
+
+    for (mlir::Value res : op->getResults()) {
+      for (mlir::Operation* user : res.getUsers()) {
+        work_list.push_back(user);
+      }
+    }
+
+    for (auto& use : op->getOpOperands()) {
+      if (mlir::Value value = use.get()) {
+        mlir::Operation* def = value.getDefiningOp();
+        work_list.push_back(def);
+      }
+    }
+
+    op->dropAllReferences();
+    op->dropAllDefinedValueUses();
+
+    op->erase();
+  }
+}
+
 void SavedModelSignatureDefImporter::LiftVariable(
     VarHandleOp op, GlobalTensorOp global_tensor) {
   mlir::OpBuilder builder(&module_->getBodyRegion());
@@ -3460,12 +3594,7 @@ void SavedModelSignatureDefImporter::LiftVariable(
   // Add the newly added function param to entry block's arguments.
   auto new_value = func_op.front().addArgument(resource_type);
 
-  // Remove the VarHandleOp also updating the containing island's return type.
-  DCHECK(llvm::isa(op.getParentOp()));
-  DCHECK(llvm::cast(op.getParentOp())
-             .WrapsSingleOp());
   op.getOperation()->replaceAllUsesWith(llvm::ArrayRef(new_value));
-  op.getParentOp()->getResult(0).setType(resource_type);
   op.getOperation()->erase();
 }
 

From 1823f877359bb138c57a005c30aba8832dfa79fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 15:55:58 -0700
Subject: [PATCH 0677/1390] Fix issues with `TypeIndex` on MacOS, i.e. hash on
 the type name where available since this otherwise causes problems when
 loading different shared libraries with `RTLD_LOCAL`.

PiperOrigin-RevId: 317395983
Change-Id: I14b3add5fa19725b2150b68813364d16b8320130
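
A minimal standalone sketch of the idea (std::hash stands in for TensorFlow's
Hash64): hashing the mangled type name yields the same id in every shared
library, whereas the address of a function-local static differs per library
when loaded with RTLD_LOCAL.

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <typeinfo>

    // Name-based id: stable across dlopen(RTLD_LOCAL) boundaries because the
    // mangled name is identical in every image that instantiates T.
    template <typename T>
    uint64_t NameBasedTypeId() {
      return std::hash<std::string>{}(typeid(T).name());
    }

    // Address-based id: each shared library holds its own copy of `tag`, so
    // the same T can map to different ids in different libraries.
    template <typename T>
    uint64_t AddressBasedTypeId() {
      static char tag;
      return reinterpret_cast<uint64_t>(&tag);
    }
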
---
 tensorflow/core/framework/type_index.h | 38 ++++++++++++++++++++------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h
index fd27d8bcb35..fcf68677a12 100644
--- a/tensorflow/core/framework/type_index.h
+++ b/tensorflow/core/framework/type_index.h
@@ -24,6 +24,10 @@ limitations under the License.
 
 #include "tensorflow/core/platform/types.h"
 
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+#include "tensorflow/core/platform/hash.h"
+#endif  // defined(MACOS) || defined(TARGET_OS_MAC)
+
 namespace tensorflow {
 
 // On some platforms, we would like to avoid using RTTI in order to have smaller
@@ -53,10 +57,33 @@ class TypeIndex {
 
   // Returns a TypeIndex object that corresponds to a typename.
   template 
-  static TypeIndex Make(const char* name) {
+  static TypeIndex Make() {
     static bool hash_bit[1];
+
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
+
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+    // Use a hash based on the type name to avoid issues due to RTLD_LOCAL on
+    // MacOS (b/156979412).
+    return TypeIndex(Hash64(typeid(T).name()), typeid(T).name());
+#else
+    // Use the real type name if we have RTTI.
     return TypeIndex(static_cast(reinterpret_cast(hash_bit)),
-                     name);
+                     typeid(T).name());
+#endif  // defined(MACOS) || defined(TARGET_OS_MAC)
+
+#else
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+    // Warn MacOS users that not using RTTI can cause problems (b/156979412).
+#warning \
+    "Compiling with RTTI disabled on MacOS can cause problems when comparing " \
+    "types across shared libraries."
+#endif  // defined(MACOS) || defined(TARGET_OS_MAC)
+
+    // No type names available.
+    return TypeIndex(static_cast(reinterpret_cast(hash_bit)),
+                     "[RTTI disabled]");
+#endif  // __GXX_RTTI
   }
 
  private:
@@ -70,12 +97,7 @@ class TypeIndex {
 
 template 
 inline TypeIndex MakeTypeIndex() {
-#if defined(__GXX_RTTI) || defined(_CPPRTTI)
-  // Use the real type name if we have RTTI.
-  return TypeIndex::Make(typeid(T).name());
-#else
-  return TypeIndex::Make("[RTTI disabled]");
-#endif  // __GXX_RTTI
+  return TypeIndex::Make();
 }
 
 }  // namespace tensorflow

From 54a7bd1d738e0f2b8b9b729f48f8e90f2f39b3c6 Mon Sep 17 00:00:00 2001
From: Nathan Luehr 
Date: Fri, 19 Jun 2020 18:02:40 -0500
Subject: [PATCH 0678/1390] Remove unnecessary tf32_execution_allowed check

---
 tensorflow/stream_executor/cuda/cuda_blas.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index e9e3635d8c1..e387690da26 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -402,14 +402,6 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                            : CUBLAS_POINTER_MODE_DEVICE)) {
     return false;
   }
-#if CUBLAS_VER_MAJOR >= 11
-  ScopedCublasMathMode math_mode{blas_};
-  if (!tensorflow::tf32_execution_allowed()) {
-    if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
-      return false;
-    }
-  }
-#endif
   cublasStatus_t ret = cublas_func(blas_, args...);
   if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to run cuBLAS routine: " << ToString(ret);

From 866eb4828af9aa63e75b51fa05e84cc6b3176e5b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 15:58:05 -0700
Subject: [PATCH 0679/1390] tf.numpy avoid using typing module given Kokoro
 breakage.

PiperOrigin-RevId: 317396321
Change-Id: Ifa05dcbbdd8998708e52ba52cdb03e33e4952f21
---
 tensorflow/python/ops/numpy_ops/np_array_ops.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py
index 47236d45561..74f1bc8cbef 100644
--- a/tensorflow/python/ops/numpy_ops/np_array_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import math
 import numbers
-from typing import Sequence
 import numpy as np
 import six
 
@@ -1684,10 +1683,10 @@ def _slice_helper(tensor, slice_spec):
 def _as_spec_tuple(slice_spec):
   """Convert slice_spec to tuple."""
   if isinstance(slice_spec,
-                Sequence) and not isinstance(slice_spec, np.ndarray):
+                (list, tuple)) and not isinstance(slice_spec, np.ndarray):
     is_index = True
     for s in slice_spec:
-      if s is None or s is Ellipsis or isinstance(s, (Sequence, slice)):
+      if s is None or s is Ellipsis or isinstance(s, (list, tuple, slice)):
         is_index = False
         break
       elif isinstance(s, (np_arrays.ndarray, np.ndarray)) and s.ndim != 0:

From fac99746cb7382af83fc9922c345a1bd68caa516 Mon Sep 17 00:00:00 2001
From: Chenkai Kuang 
Date: Fri, 19 Jun 2020 16:08:23 -0700
Subject: [PATCH 0680/1390] Override "map_resources" in AggregatingVariable.

PiperOrigin-RevId: 317398115
Change-Id: Ic57325aacf0fb45a66a469428f544fb94f3cc031
---
 tensorflow/python/distribute/ps_values.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/python/distribute/ps_values.py b/tensorflow/python/distribute/ps_values.py
index 37cd6e12d90..5fb2d42b626 100644
--- a/tensorflow/python/distribute/ps_values.py
+++ b/tensorflow/python/distribute/ps_values.py
@@ -166,6 +166,14 @@ class AggregatingVariable(variables_lib.Variable, core.Tensor):
   def _gather_saveables_for_checkpoint(self):
     return {trackable.VARIABLE_VALUE_KEY: self._v}
 
+  def _map_resources(self):
+    """For implementing `Trackable`."""
+    # By delegating this method to the wrapped variable, SavedModel with
+    # AggregatingVariable are identical to SavedModel with normal variables.
+    obj_map, resource_map = self._v._map_resources()  # pylint:disable=protected-access
+    obj_map[self] = obj_map[self._v]
+    return obj_map, resource_map
+
   # pylint: disable=multiple-statements
   def __add__(self, o):
     return self._v + o

From a70ad66828f198f4f495c3f29445b396045b6c3f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 16:23:33 -0700
Subject: [PATCH 0681/1390] GitHub Issue #40462

PiperOrigin-RevId: 317400375
Change-Id: I13891b5c2f41ac97674ddfda679c4273d53b25ef
---
 tensorflow/python/keras/preprocessing/image_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/preprocessing/image_dataset.py b/tensorflow/python/keras/preprocessing/image_dataset.py
index d287c4ef372..6e78ac3c03a 100644
--- a/tensorflow/python/keras/preprocessing/image_dataset.py
+++ b/tensorflow/python/keras/preprocessing/image_dataset.py
@@ -147,7 +147,7 @@ def image_dataset_from_directory(directory,
           'directory. If you wish to infer the labels from the subdirectory '
           'names in the target directory, pass `labels="inferred"`. '
           'If you wish to get a dataset that only contains images '
-          '(no labels), pass `labels=None`.')
+          '(no labels), pass `labels_mode=None`.')
     if class_names:
       raise ValueError('You can only pass `class_names` if the labels are '
                        'inferred from the subdirectory names in the target '

From 62683d061cf31d05588a94cc333b53542cea9568 Mon Sep 17 00:00:00 2001
From: George Karpenkov 
Date: Fri, 19 Jun 2020 16:25:08 -0700
Subject: [PATCH 0682/1390] [XLA] Rollback of rollback of "Implement
 LocalClient::Run which supports buffer donation"

PiperOrigin-RevId: 317400695
Change-Id: I56f1f8df347d5a3b2bad9526c7315c63ad6ddadb
---
 .../compiler/xla/client/local_client.cc       | 26 ++++++++++++-------
 tensorflow/compiler/xla/client/local_client.h | 21 +++++++++++++++
 .../tests/multiple_devices_on_host_test.cc    |  3 ++-
 tensorflow/compiler/xla/tests/while_test.cc   |  6 +++--
 4 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index afe115deda8..aa252067e19 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -176,15 +176,23 @@ StatusOr LocalExecutable::Run(
   for (const ShapedBuffer* const arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
   }
-  TF_ASSIGN_OR_RETURN(auto options_and_stream,
-                      RunHelper(argument_shapes, run_options));
-  ExecutableRunOptions options = options_and_stream.first.run_options();
-  options.set_device_ordinal(-1);
-  auto result = RunAsync(arguments, options);
-  Status block_status = options.stream()->BlockHostUntilDone();
-  TF_RETURN_IF_ERROR(result.status());
-  TF_RETURN_IF_ERROR(block_status);
-  return result;
+  return AsyncCallAndBlockHostUntilDone(
+      argument_shapes, run_options, [&](const ExecutableRunOptions& options) {
+        return RunAsync(arguments, options);
+      });
+}
+
+StatusOr LocalExecutable::Run(
+    std::vector arguments, ExecutableRunOptions run_options) {
+  std::vector argument_shapes;
+  argument_shapes.reserve(arguments.size());
+  for (const ExecutionInput& arg : arguments) {
+    argument_shapes.push_back(&arg.shape());
+  }
+  return AsyncCallAndBlockHostUntilDone(
+      argument_shapes, run_options, [&](const ExecutableRunOptions& options) {
+        return RunAsync(argument_shapes, std::move(arguments), options);
+      });
 }
 
 static std::shared_ptr DumpArguments(
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 7cdeb9dcbf6..3241ac73d54 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -51,6 +51,11 @@ class LocalExecutable {
       const absl::Span arguments,
       ExecutableRunOptions run_options);
 
+  // Similar to Run(), but allows for donating argument buffers to the
+  // executable.
+  StatusOr Run(std::vector arguments,
+                                ExecutableRunOptions run_options);
+
   // Similar to Run(), but need not block the host waiting for the computation
   // to complete before returning.
   StatusOr RunAsync(
@@ -90,6 +95,22 @@ class LocalExecutable {
   // Backend::devices_equivalent).
   int build_device_ordinal() const { return build_options_.device_ordinal(); }
 
+  template <typename T>
+  StatusOr<T> AsyncCallAndBlockHostUntilDone(
+      absl::Span<const Shape* const> argument_shapes,
+      const ExecutableRunOptions& run_options,
+      std::function<StatusOr<T>(const ExecutableRunOptions&)> async_callback) {
+    TF_ASSIGN_OR_RETURN(auto options_and_stream,
+                        RunHelper(argument_shapes, run_options));
+    ExecutableRunOptions options = options_and_stream.first.run_options();
+    options.set_device_ordinal(-1);
+    StatusOr<T> result = async_callback(options);
+    Status block_status = options.stream()->BlockHostUntilDone();
+    TF_RETURN_IF_ERROR(result.status());
+    TF_RETURN_IF_ERROR(block_status);
+    return result;
+  }
+
   // Compiled computation.
   std::unique_ptr<Executable> executable_;
 
diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc
index 2b19aaded9c..2231fc6feab 100644
--- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc
+++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc
@@ -45,7 +45,8 @@ void CompileAndExecute(
       xla::ClientLibrary::GetXlaService(client->platform())
           ->backend()
           .memory_allocator());
-  StatusOr<ScopedShapedBuffer> result = executable->Run({}, execute_options);
+  StatusOr<ScopedShapedBuffer> result =
+      executable->Run(absl::Span<const ShapedBuffer* const>(), execute_options);
   {
     absl::MutexLock lock(results_mutex);
     results->emplace_back(device_ordinal, std::move(result));
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index d575bbb1f3e..8e8c3605cc7 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -1324,14 +1324,16 @@ void BM_WhileLoop(int num_iters) {
   options.set_allocator(&allocator);
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({}, options);
+    auto result =
+        executable->Run(absl::Span<const ShapedBuffer* const>(), options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({}, options);
+    auto result =
+        executable->Run(absl::Span<const ShapedBuffer* const>(), options);
     ASSERT_TRUE(result.ok());
   }
 }

From 3c9a2f200e7c439ce5e3345bdd9055b88b9777ca Mon Sep 17 00:00:00 2001
From: Peng Wang 
Date: Fri, 19 Jun 2020 16:34:05 -0700
Subject: [PATCH 0683/1390] [TF-numpy] Changed all callsites of
 np_doc/np_doc_only to use string names, to avoid blocking imports when some
 numpy symbols are missing (e.g. because of an older version of numpy).

Also moved `np_fun_name` to be the first positional argument of np_doc/np_doc_only.
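
As a minimal, self-contained sketch (not the actual `np_utils` implementation), the
point of passing a string name is that the numpy symbol gets looked up lazily and
tolerantly, so decorating still succeeds when the installed numpy lacks the symbol:

```python
import numpy as np


def np_doc_sketch(np_fun_name):
  """Attaches the docstring of `numpy.<np_fun_name>` when that symbol exists."""
  # getattr with a default avoids an AttributeError at import time when the
  # installed numpy is too old to provide the symbol.
  np_fun = getattr(np, np_fun_name, None)

  def decorator(f):
    doc = 'TensorFlow variant of `numpy.%s`.\n\n' % np_fun_name
    if np_fun is not None and np_fun.__doc__:
      doc += np_fun.__doc__
    f.__doc__ = doc
    return f

  return decorator


@np_doc_sketch('take_along_axis')  # decoration succeeds even without the symbol
def take_along_axis(arr, indices, axis):
  pass
```

Passing the function object instead (e.g. `np_doc(np.take_along_axis)`) would evaluate
the attribute at import time and fail outright on such numpy versions.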

PiperOrigin-RevId: 317402093
Change-Id: I8b3ba54909e9507c7ab062f5bf5d7f13ad549317
---
 .../python/ops/numpy_ops/np_array_ops.py      | 156 ++++++------
 .../python/ops/numpy_ops/np_math_ops.py       | 228 +++++++++---------
 tensorflow/python/ops/numpy_ops/np_utils.py   |  91 ++++---
 .../python/ops/numpy_ops/np_utils_test.py     |  10 +-
 4 files changed, 254 insertions(+), 231 deletions(-)

diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py
index 74f1bc8cbef..a87c72ed763 100644
--- a/tensorflow/python/ops/numpy_ops/np_array_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py
@@ -41,17 +41,17 @@ from tensorflow.python.ops.numpy_ops import np_utils
 from tensorflow.python.util import nest
 
 
-@np_utils.np_doc(np.empty)
+@np_utils.np_doc('empty')
 def empty(shape, dtype=float):  # pylint: disable=redefined-outer-name
   return zeros(shape, dtype)
 
 
-@np_utils.np_doc(np.empty_like)
+@np_utils.np_doc('empty_like')
 def empty_like(a, dtype=None):
   return zeros_like(a, dtype)
 
 
-@np_utils.np_doc(np.zeros)
+@np_utils.np_doc('zeros')
 def zeros(shape, dtype=float):  # pylint: disable=redefined-outer-name
   dtype = (
       np_utils.result_type(dtype) if dtype else np_dtypes.default_float_type())
@@ -60,7 +60,7 @@ def zeros(shape, dtype=float):  # pylint: disable=redefined-outer-name
   return np_arrays.tensor_to_ndarray(array_ops.zeros(shape, dtype=dtype))
 
 
-@np_utils.np_doc(np.zeros_like)
+@np_utils.np_doc('zeros_like')
 def zeros_like(a, dtype=None):  # pylint: disable=missing-docstring
   if isinstance(a, np_arrays.ndarray):
     a = a.data
@@ -75,7 +75,7 @@ def zeros_like(a, dtype=None):  # pylint: disable=missing-docstring
   return np_arrays.tensor_to_ndarray(array_ops.zeros_like(a, dtype))
 
 
-@np_utils.np_doc(np.ones)
+@np_utils.np_doc('ones')
 def ones(shape, dtype=float):  # pylint: disable=redefined-outer-name
   if dtype:
     dtype = np_utils.result_type(dtype)
@@ -84,7 +84,7 @@ def ones(shape, dtype=float):  # pylint: disable=redefined-outer-name
   return np_arrays.tensor_to_ndarray(array_ops.ones(shape, dtype=dtype))
 
 
-@np_utils.np_doc(np.ones_like)
+@np_utils.np_doc('ones_like')
 def ones_like(a, dtype=None):
   if isinstance(a, np_arrays.ndarray):
     a = a.data
@@ -95,7 +95,7 @@ def ones_like(a, dtype=None):
   return np_arrays.tensor_to_ndarray(array_ops.ones_like(a, dtype))
 
 
-@np_utils.np_doc(np.eye)
+@np_utils.np_doc('eye')
 def eye(N, M=None, k=0, dtype=float):  # pylint: disable=invalid-name,missing-docstring
   if dtype:
     dtype = np_utils.result_type(dtype)
@@ -127,12 +127,12 @@ def eye(N, M=None, k=0, dtype=float):  # pylint: disable=invalid-name,missing-do
       array_ops.matrix_diag(diagonal=diagonal_, num_rows=N, num_cols=M, k=k))
 
 
-@np_utils.np_doc(np.identity)
+@np_utils.np_doc('identity')
 def identity(n, dtype=float):
   return eye(N=n, M=n, dtype=dtype)
 
 
-@np_utils.np_doc(np.full)
+@np_utils.np_doc('full')
 def full(shape, fill_value, dtype=None):  # pylint: disable=redefined-outer-name
   if not isinstance(shape, np_arrays.ndarray):
     shape = asarray(np_arrays.convert_to_tensor(shape, dtype_hint=np.int32))
@@ -144,7 +144,7 @@ def full(shape, fill_value, dtype=None):  # pylint: disable=redefined-outer-name
 
 # Using doc only here since np full_like signature doesn't seem to have the
 # shape argument (even though it exists in the documentation online).
-@np_utils.np_doc_only(np.full_like)
+@np_utils.np_doc_only('full_like')
 def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):  # pylint: disable=missing-docstring,redefined-outer-name
   """order, subok and shape arguments mustn't be changed."""
   if order != 'K':
@@ -163,7 +163,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):  #
 
 # TODO(wangpeng): investigate whether we can make `copy` default to False.
 # pylint: disable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args
-@np_utils.np_doc_only(np.array)
+@np_utils.np_doc_only('array')
 def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-outer-name
   """Since Tensors are immutable, a copy is made only if val is placed on a
 
@@ -224,7 +224,7 @@ def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-out
 # pylint: enable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args
 
 
-@np_utils.np_doc(np.asarray)
+@np_utils.np_doc('asarray')
 def asarray(a, dtype=None):
   if dtype:
     dtype = np_utils.result_type(dtype)
@@ -233,18 +233,18 @@ def asarray(a, dtype=None):
   return array(a, dtype, copy=False)
 
 
-@np_utils.np_doc(np.asanyarray)
+@np_utils.np_doc('asanyarray')
 def asanyarray(a, dtype=None):
   return asarray(a, dtype)
 
 
-@np_utils.np_doc(np.ascontiguousarray)
+@np_utils.np_doc('ascontiguousarray')
 def ascontiguousarray(a, dtype=None):
   return array(a, dtype, ndmin=1)
 
 
 # Numerical ranges.
-@np_utils.np_doc(np.arange)
+@np_utils.np_doc('arange')
 def arange(start, stop=None, step=1, dtype=None):
   """Returns `step`-separated values in the range [start, stop).
 
@@ -286,7 +286,7 @@ def arange(start, stop=None, step=1, dtype=None):
 
 
 # Building matrices.
-@np_utils.np_doc(np.diag)
+@np_utils.np_doc('diag')
 def diag(v, k=0):  # pylint: disable=missing-docstring
   """Raises an error if input is not 1- or 2-d."""
   v = asarray(v).data
@@ -321,7 +321,7 @@ def diag(v, k=0):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(result)
 
 
-@np_utils.np_doc(np.diagonal)
+@np_utils.np_doc('diagonal')
 def diagonal(a, offset=0, axis1=0, axis2=1):  # pylint: disable=missing-docstring
   a = asarray(a).data
 
@@ -352,7 +352,7 @@ def diagonal(a, offset=0, axis1=0, axis2=1):  # pylint: disable=missing-docstrin
   return a
 
 
-@np_utils.np_doc(np.diagflat)
+@np_utils.np_doc('diagflat')
 def diagflat(v, k=0):
   v = asarray(v)
   return diag(array_ops.reshape(v.data, [-1]), k)
@@ -363,21 +363,21 @@ def _promote_dtype(*arrays):
   return [asarray(a, dtype=dtype) for a in arrays]
 
 
-@np_utils.np_doc(np.all)
+@np_utils.np_doc('all')
 def all(a, axis=None, keepdims=None):  # pylint: disable=redefined-builtin
   a = asarray(a, dtype=bool)
   return np_utils.tensor_to_ndarray(
       math_ops.reduce_all(input_tensor=a.data, axis=axis, keepdims=keepdims))
 
 
-@np_utils.np_doc(np.any)
+@np_utils.np_doc('any')
 def any(a, axis=None, keepdims=None):  # pylint: disable=redefined-builtin
   a = asarray(a, dtype=bool)
   return np_utils.tensor_to_ndarray(
       math_ops.reduce_any(input_tensor=a.data, axis=axis, keepdims=keepdims))
 
 
-@np_utils.np_doc(np.compress)
+@np_utils.np_doc('compress')
 def compress(condition, a, axis=None):  # pylint: disable=redefined-outer-name,missing-function-docstring
   condition = asarray(condition, dtype=bool)
   a = asarray(a)
@@ -408,7 +408,7 @@ def compress(condition, a, axis=None):  # pylint: disable=redefined-outer-name,m
       array_ops.boolean_mask(tensor=a_t, mask=condition_t, axis=axis))
 
 
-@np_utils.np_doc(np.copy)
+@np_utils.np_doc('copy')
 def copy(a):
   return array(a, copy=True)
 
@@ -424,7 +424,7 @@ def _maybe_promote_to_int(a):
   return a
 
 
-@np_utils.np_doc(np.cumprod)
+@np_utils.np_doc('cumprod')
 def cumprod(a, axis=None, dtype=None):  # pylint: disable=missing-docstring
   a = asarray(a, dtype=dtype)
 
@@ -440,7 +440,7 @@ def cumprod(a, axis=None, dtype=None):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(math_ops.cumprod(a.data, axis))
 
 
-@np_utils.np_doc(np.cumsum)
+@np_utils.np_doc('cumsum')
 def cumsum(a, axis=None, dtype=None):  # pylint: disable=missing-docstring
   a = asarray(a, dtype=dtype)
 
@@ -456,7 +456,7 @@ def cumsum(a, axis=None, dtype=None):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(math_ops.cumsum(a.data, axis))
 
 
-@np_utils.np_doc(np.imag)
+@np_utils.np_doc('imag')
 def imag(a):
   a = asarray(a)
   # TODO(srbs): np.imag returns a scalar if a is a scalar, whereas we always
@@ -536,7 +536,7 @@ def _reduce(tf_fn,
       tf_fn(input_tensor=a.data, axis=axis, keepdims=keepdims))
 
 
-@np_utils.np_doc(np.sum)
+@np_utils.np_doc('sum')
 def sum(a, axis=None, dtype=None, keepdims=None):  # pylint: disable=redefined-builtin
   return _reduce(
       math_ops.reduce_sum,
@@ -547,7 +547,7 @@ def sum(a, axis=None, dtype=None, keepdims=None):  # pylint: disable=redefined-b
       tf_bool_fn=math_ops.reduce_any)
 
 
-@np_utils.np_doc(np.prod)
+@np_utils.np_doc('prod')
 def prod(a, axis=None, dtype=None, keepdims=None):
   return _reduce(
       math_ops.reduce_prod,
@@ -558,7 +558,7 @@ def prod(a, axis=None, dtype=None, keepdims=None):
       tf_bool_fn=math_ops.reduce_all)
 
 
-@np_utils.np_doc(np.mean)
+@np_utils.np_doc('mean')
 def mean(a, axis=None, dtype=None, keepdims=None):
   return _reduce(
       math_ops.reduce_mean,
@@ -569,7 +569,7 @@ def mean(a, axis=None, dtype=None, keepdims=None):
       promote_int=_TO_FLOAT)
 
 
-@np_utils.np_doc(np.amax)
+@np_utils.np_doc('amax')
 def amax(a, axis=None, keepdims=None):
   return _reduce(
       math_ops.reduce_max,
@@ -582,7 +582,7 @@ def amax(a, axis=None, keepdims=None):
       preserve_bool=True)
 
 
-@np_utils.np_doc(np.amin)
+@np_utils.np_doc('amin')
 def amin(a, axis=None, keepdims=None):
   return _reduce(
       math_ops.reduce_min,
@@ -595,7 +595,7 @@ def amin(a, axis=None, keepdims=None):
       preserve_bool=True)
 
 
-@np_utils.np_doc(np.var)
+@np_utils.np_doc('var')
 def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None):  # pylint: disable=missing-docstring
   if dtype:
     working_dtype = np_utils.result_type(a, dtype)
@@ -642,7 +642,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None):  # pylint: d
   return np_utils.tensor_to_ndarray(result)
 
 
-@np_utils.np_doc(np.std)
+@np_utils.np_doc('std')
 def std(a, axis=None, keepdims=None):  # pylint: disable=missing-function-docstring
   return _reduce(
       math_ops.reduce_std,
@@ -653,7 +653,7 @@ def std(a, axis=None, keepdims=None):  # pylint: disable=missing-function-docstr
       promote_int=_TO_FLOAT)
 
 
-@np_utils.np_doc(np.ravel)
+@np_utils.np_doc('ravel')
 def ravel(a):  # pylint: disable=missing-docstring
   a = asarray(a)
   out = np_utils.cond(
@@ -665,7 +665,7 @@ def ravel(a):  # pylint: disable=missing-docstring
 setattr(np_arrays.ndarray, 'ravel', ravel)
 
 
-@np_utils.np_doc(np.real)
+@np_utils.np_doc('real')
 def real(val):
   val = asarray(val)
   # TODO(srbs): np.real returns a scalar if val is a scalar, whereas we always
@@ -673,7 +673,7 @@ def real(val):
   return np_utils.tensor_to_ndarray(math_ops.real(val.data))
 
 
-@np_utils.np_doc(np.repeat)
+@np_utils.np_doc('repeat')
 def repeat(a, repeats, axis=None):  # pylint: disable=missing-docstring
   a = asarray(a).data
   original_shape = a._shape_as_list()  # pylint: disable=protected-access
@@ -704,7 +704,7 @@ def repeat(a, repeats, axis=None):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(result)
 
 
-@np_utils.np_doc(np.around)
+@np_utils.np_doc('around')
 def around(a, decimals=0):  # pylint: disable=missing-docstring
   a = asarray(a)
   dtype = a.dtype
@@ -726,7 +726,7 @@ def around(a, decimals=0):  # pylint: disable=missing-docstring
 setattr(np_arrays.ndarray, '__round__', around)
 
 
-@np_utils.np_doc(np.reshape)
+@np_utils.np_doc('reshape')
 def reshape(a, newshape, order='C'):
   """order argument can only b 'C' or 'F'."""
   if order not in {'C', 'F'}:
@@ -758,19 +758,19 @@ def _reshape_method_wrapper(a, *newshape, **kwargs):
   return reshape(a, newshape, order=order)
 
 
-@np_utils.np_doc(np.expand_dims)
+@np_utils.np_doc('expand_dims')
 def expand_dims(a, axis):
   a = asarray(a)
   return np_utils.tensor_to_ndarray(array_ops.expand_dims(a.data, axis=axis))
 
 
-@np_utils.np_doc(np.squeeze)
+@np_utils.np_doc('squeeze')
 def squeeze(a, axis=None):
   a = asarray(a)
   return np_utils.tensor_to_ndarray(array_ops.squeeze(a, axis))
 
 
-@np_utils.np_doc(np.transpose)
+@np_utils.np_doc('transpose')
 def transpose(a, axes=None):
   a = asarray(a)
   if axes is not None:
@@ -778,7 +778,7 @@ def transpose(a, axes=None):
   return np_utils.tensor_to_ndarray(array_ops.transpose(a=a.data, perm=axes))
 
 
-@np_utils.np_doc(np.swapaxes)
+@np_utils.np_doc('swapaxes')
 def swapaxes(a, axis1, axis2):  # pylint: disable=missing-docstring
   a = asarray(a).data
 
@@ -794,7 +794,7 @@ def swapaxes(a, axis1, axis2):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(a)
 
 
-@np_utils.np_doc(np.moveaxis)
+@np_utils.np_doc('moveaxis')
 def moveaxis(a, source, destination):  # pylint: disable=missing-docstring
   """Raises ValueError if source, destination not in (-ndim(a), ndim(a))."""
   if not source and not destination:
@@ -914,7 +914,7 @@ setattr(np_arrays.ndarray, 'reshape', _reshape_method_wrapper)
 setattr(np_arrays.ndarray, '__setitem__', _setitem)
 
 
-@np_utils.np_doc(np.pad)
+@np_utils.np_doc('pad')
 def pad(ary, pad_width, mode, constant_values=0):
   """Only supports modes 'constant', 'reflect' and 'symmetric' currently."""
   if not (mode == 'constant' or mode == 'reflect' or mode == 'symmetric'):
@@ -930,7 +930,7 @@ def pad(ary, pad_width, mode, constant_values=0):
           constant_values=constant_values))
 
 
-@np_utils.np_doc(np.take)
+@np_utils.np_doc('take')
 def take(a, indices, axis=None, out=None, mode='clip'):
   """out argument is not supported, and default mode is clip."""
   if out is not None:
@@ -957,7 +957,7 @@ def take(a, indices, axis=None, out=None, mode='clip'):
   return np_utils.tensor_to_ndarray(array_ops.gather(a, indices, axis=axis))
 
 
-@np_utils.np_doc_only(np.where)
+@np_utils.np_doc_only('where')
 def where(condition, x=None, y=None):
   """Raises ValueError if exactly one of x or y is not None."""
   condition = asarray(condition, dtype=np.bool_)
@@ -970,7 +970,7 @@ def where(condition, x=None, y=None):
   raise ValueError('Both x and y must be ndarrays, or both must be None.')
 
 
-@np_utils.np_doc(np.select)
+@np_utils.np_doc('select')
 def select(condlist, choicelist, default=0):  # pylint: disable=missing-docstring
   if len(condlist) != len(choicelist):
     msg = 'condlist must have length equal to choicelist ({} vs {})'
@@ -987,19 +987,19 @@ def select(condlist, choicelist, default=0):  # pylint: disable=missing-docstrin
   return output
 
 
-@np_utils.np_doc(np.shape)
+@np_utils.np_doc('shape')
 def shape(a):
   a = asarray(a)
   return a.shape
 
 
-@np_utils.np_doc(np.ndim)
+@np_utils.np_doc('ndim')
 def ndim(a):
   a = asarray(a)
   return a.ndim
 
 
-@np_utils.np_doc(np.isscalar)
+@np_utils.np_doc('isscalar')
 def isscalar(a):
   return ndim(a) == 0
 
@@ -1034,7 +1034,7 @@ def _boundaries_to_sizes(a, boundaries, axis):
   return sizes
 
 
-@np_utils.np_doc(np.split)
+@np_utils.np_doc('split')
 def split(ary, indices_or_sections, axis=0):
   ary = asarray(ary)
   if not isinstance(indices_or_sections, six.integer_types):
@@ -1043,26 +1043,26 @@ def split(ary, indices_or_sections, axis=0):
   return [np_utils.tensor_to_ndarray(a) for a in result]
 
 
-def _split_on_axis(np_fun, axis):
+def _split_on_axis(np_fun_name, axis):
 
-  @np_utils.np_doc(np_fun)
+  @np_utils.np_doc(np_fun_name)
   def f(ary, indices_or_sections):
     return split(ary, indices_or_sections, axis=axis)
 
   return f
 
 
-vsplit = _split_on_axis(np.vsplit, axis=0)
-hsplit = _split_on_axis(np.hsplit, axis=1)
-dsplit = _split_on_axis(np.dsplit, axis=2)
+vsplit = _split_on_axis('vsplit', axis=0)
+hsplit = _split_on_axis('hsplit', axis=1)
+dsplit = _split_on_axis('dsplit', axis=2)
 
 
-@np_utils.np_doc(np.broadcast_to)
+@np_utils.np_doc('broadcast_to')
 def broadcast_to(array, shape):  # pylint: disable=redefined-outer-name
   return full(shape, array)
 
 
-@np_utils.np_doc(np.stack)
+@np_utils.np_doc('stack')
 def stack(arrays, axis=0):  # pylint: disable=missing-function-docstring
   if isinstance(arrays, (np_arrays.ndarray, ops.Tensor)):
     arrays = asarray(arrays)
@@ -1077,7 +1077,7 @@ def stack(arrays, axis=0):  # pylint: disable=missing-function-docstring
   return asarray(array_ops.stack(unwrapped_arrays, axis))
 
 
-@np_utils.np_doc(np.hstack)
+@np_utils.np_doc('hstack')
 def hstack(tup):
   arrays = [atleast_1d(a) for a in tup]
   arrays = _promote_dtype(*arrays)  # pylint: disable=protected-access
@@ -1091,7 +1091,7 @@ def hstack(tup):
       lambda: array_ops.concat(unwrapped_arrays, axis=1))
 
 
-@np_utils.np_doc(np.vstack)
+@np_utils.np_doc('vstack')
 def vstack(tup):
   arrays = [atleast_2d(a) for a in tup]
   arrays = _promote_dtype(*arrays)  # pylint: disable=protected-access
@@ -1101,7 +1101,7 @@ def vstack(tup):
   return array_ops.concat(unwrapped_arrays, axis=0)
 
 
-@np_utils.np_doc(np.dstack)
+@np_utils.np_doc('dstack')
 def dstack(tup):
   arrays = [atleast_3d(a) for a in tup]
   arrays = _promote_dtype(*arrays)  # pylint: disable=protected-access
@@ -1148,17 +1148,17 @@ def _atleast_nd(n, new_shape, *arys):
     return arys
 
 
-@np_utils.np_doc(np.atleast_1d)
+@np_utils.np_doc('atleast_1d')
 def atleast_1d(*arys):
   return _atleast_nd(1, _pad_left_to, *arys)
 
 
-@np_utils.np_doc(np.atleast_2d)
+@np_utils.np_doc('atleast_2d')
 def atleast_2d(*arys):
   return _atleast_nd(2, _pad_left_to, *arys)
 
 
-@np_utils.np_doc(np.atleast_3d)
+@np_utils.np_doc('atleast_3d')
 def atleast_3d(*arys):  # pylint: disable=missing-docstring
 
   def new_shape(_, old_shape):
@@ -1175,7 +1175,7 @@ def atleast_3d(*arys):  # pylint: disable=missing-docstring
   return _atleast_nd(3, new_shape, *arys)
 
 
-@np_utils.np_doc(np.nonzero)
+@np_utils.np_doc('nonzero')
 def nonzero(a):
   a = atleast_1d(a).data
   if a.shape.rank is None:
@@ -1189,7 +1189,7 @@ def nonzero(a):
           axis=1))
 
 
-@np_utils.np_doc(np.diag_indices)
+@np_utils.np_doc('diag_indices')
 def diag_indices(n, ndim=2):  # pylint: disable=missing-docstring,redefined-outer-name
   if n < 0:
     raise ValueError(
@@ -1202,7 +1202,7 @@ def diag_indices(n, ndim=2):  # pylint: disable=missing-docstring,redefined-oute
   return (math_ops.range(n),) * ndim
 
 
-@np_utils.np_doc(np.tri)
+@np_utils.np_doc('tri')
 def tri(N, M=None, k=0, dtype=None):  # pylint: disable=invalid-name,missing-docstring
   M = M if M is not None else N
   if dtype is not None:
@@ -1229,7 +1229,7 @@ def tri(N, M=None, k=0, dtype=None):  # pylint: disable=invalid-name,missing-doc
   return np_utils.tensor_to_ndarray(r)
 
 
-@np_utils.np_doc(np.tril)
+@np_utils.np_doc('tril')
 def tril(m, k=0):  # pylint: disable=missing-docstring
   m = asarray(m).data
   if m.shape.ndims is None:
@@ -1251,7 +1251,7 @@ def tril(m, k=0):  # pylint: disable=missing-docstring
           array_ops.broadcast_to(mask, array_ops.shape(m)), m, z))
 
 
-@np_utils.np_doc(np.triu)
+@np_utils.np_doc('triu')
 def triu(m, k=0):  # pylint: disable=missing-docstring
   m = asarray(m).data
   if m.shape.ndims is None:
@@ -1273,7 +1273,7 @@ def triu(m, k=0):  # pylint: disable=missing-docstring
           array_ops.broadcast_to(mask, array_ops.shape(m)), z, m))
 
 
-@np_utils.np_doc(np.flip)
+@np_utils.np_doc('flip')
 def flip(m, axis=None):  # pylint: disable=missing-docstring
   m = asarray(m).data
 
@@ -1286,17 +1286,17 @@ def flip(m, axis=None):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(array_ops.reverse(m, [axis]))
 
 
-@np_utils.np_doc(np.flipud)
+@np_utils.np_doc('flipud')
 def flipud(m):  # pylint: disable=missing-docstring
   return flip(m, 0)
 
 
-@np_utils.np_doc(np.fliplr)
+@np_utils.np_doc('fliplr')
 def fliplr(m):  # pylint: disable=missing-docstring
   return flip(m, 1)
 
 
-@np_utils.np_doc(np.roll)
+@np_utils.np_doc('roll')
 def roll(a, shift, axis=None):  # pylint: disable=missing-docstring
   a = asarray(a).data
 
@@ -1309,7 +1309,7 @@ def roll(a, shift, axis=None):  # pylint: disable=missing-docstring
   return np_utils.tensor_to_ndarray(array_ops.reshape(a, original_shape))
 
 
-@np_utils.np_doc(np.rot90)
+@np_utils.np_doc('rot90')
 def rot90(m, k=1, axes=(0, 1)):  # pylint: disable=missing-docstring
   m_rank = array_ops.rank(m)
   ax1, ax2 = np_utils._canonicalize_axes(axes, m_rank)  # pylint: disable=protected-access
@@ -1329,7 +1329,7 @@ def rot90(m, k=1, axes=(0, 1)):  # pylint: disable=missing-docstring
       return flip(transpose(m, perm), ax2)
 
 
-@np_utils.np_doc(np.vander)
+@np_utils.np_doc('vander')
 def vander(x, N=None, increasing=False):  # pylint: disable=missing-docstring,invalid-name
   x = asarray(x).data
 
@@ -1368,7 +1368,7 @@ def vander(x, N=None, increasing=False):  # pylint: disable=missing-docstring,in
           x, math_ops.cast(math_ops.range(start, limit, delta), dtype=x.dtype)))
 
 
-@np_utils.np_doc(np.ix_)
+@np_utils.np_doc('ix_')
 def ix_(*args):  # pylint: disable=missing-docstring
   n = len(args)
   output = []
@@ -1400,7 +1400,7 @@ def ix_(*args):  # pylint: disable=missing-docstring
   return output
 
 
-@np_utils.np_doc(np.broadcast_arrays)
+@np_utils.np_doc('broadcast_arrays')
 def broadcast_arrays(*args, **kwargs):  # pylint: disable=missing-docstring
   subok = kwargs.pop('subok', False)
   if subok:
@@ -1413,7 +1413,7 @@ def broadcast_arrays(*args, **kwargs):  # pylint: disable=missing-docstring
   return [np_utils.tensor_to_ndarray(arg) for arg in args]
 
 
-@np_utils.np_doc_only(np.sign)
+@np_utils.np_doc_only('sign')
 def sign(x, out=None, where=None, **kwargs):  # pylint: disable=missing-docstring,redefined-outer-name
   if out:
     raise ValueError('tf.numpy doesnt support setting out.')
@@ -1434,7 +1434,7 @@ def sign(x, out=None, where=None, **kwargs):  # pylint: disable=missing-docstrin
 
 # Note that np.take_along_axis may not be present in some supported versions of
 # numpy.
-@np_utils.np_doc(None, np_fun_name='take_along_axis')
+@np_utils.np_doc('take_along_axis')
 def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
   arr = asarray(arr)
   indices = asarray(indices)
diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py
index 361bfb50dec..3cf26095dd8 100644
--- a/tensorflow/python/ops/numpy_ops/np_math_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py
@@ -40,7 +40,7 @@ from tensorflow.python.ops.numpy_ops import np_dtypes
 from tensorflow.python.ops.numpy_ops import np_utils
 
 
-@np_utils.np_doc_only(np.dot)
+@np_utils.np_doc_only('dot')
 def dot(a, b):  # pylint: disable=missing-docstring
 
   def f(a, b):  # pylint: disable=missing-docstring
@@ -67,7 +67,7 @@ def _bin_op(tf_fun, a, b, promote=True):
   return np_utils.tensor_to_ndarray(tf_fun(a.data, b.data))
 
 
-@np_utils.np_doc(np.add)
+@np_utils.np_doc('add')
 def add(x1, x2):
 
   def add_or_or(x1, x2):
@@ -79,12 +79,12 @@ def add(x1, x2):
   return _bin_op(add_or_or, x1, x2)
 
 
-@np_utils.np_doc(np.subtract)
+@np_utils.np_doc('subtract')
 def subtract(x1, x2):
   return _bin_op(math_ops.subtract, x1, x2)
 
 
-@np_utils.np_doc(np.multiply)
+@np_utils.np_doc('multiply')
 def multiply(x1, x2):
 
   def mul_or_and(x1, x2):
@@ -96,7 +96,7 @@ def multiply(x1, x2):
   return _bin_op(mul_or_and, x1, x2)
 
 
-@np_utils.np_doc(np.true_divide)
+@np_utils.np_doc('true_divide')
 def true_divide(x1, x2):  # pylint: disable=missing-function-docstring
 
   def _avoid_float64(x1, x2):
@@ -123,7 +123,7 @@ def true_divide(x1, x2):  # pylint: disable=missing-function-docstring
 divide = true_divide
 
 
-@np_utils.np_doc(np.floor_divide)
+@np_utils.np_doc('floor_divide')
 def floor_divide(x1, x2):  # pylint: disable=missing-function-docstring
 
   def f(x1, x2):
@@ -136,7 +136,7 @@ def floor_divide(x1, x2):  # pylint: disable=missing-function-docstring
   return _bin_op(f, x1, x2)
 
 
-@np_utils.np_doc(np.mod)
+@np_utils.np_doc('mod')
 def mod(x1, x2):  # pylint: disable=missing-function-docstring
 
   def f(x1, x2):
@@ -152,12 +152,12 @@ def mod(x1, x2):  # pylint: disable=missing-function-docstring
 remainder = mod
 
 
-@np_utils.np_doc(np.divmod)
+@np_utils.np_doc('divmod')
 def divmod(x1, x2):  # pylint: disable=redefined-builtin
   return floor_divide(x1, x2), mod(x1, x2)
 
 
-@np_utils.np_doc(np.maximum)
+@np_utils.np_doc('maximum')
 def maximum(x1, x2):
 
   def max_or_or(x1, x2):
@@ -169,7 +169,7 @@ def maximum(x1, x2):
   return _bin_op(max_or_or, x1, x2)
 
 
-@np_utils.np_doc(np.minimum)
+@np_utils.np_doc('minimum')
 def minimum(x1, x2):
 
   def min_or_and(x1, x2):
@@ -181,7 +181,7 @@ def minimum(x1, x2):
   return _bin_op(min_or_and, x1, x2)
 
 
-@np_utils.np_doc(np.clip)
+@np_utils.np_doc('clip')
 def clip(a, a_min, a_max):  # pylint: disable=missing-docstring
   if a_min is None and a_max is None:
     raise ValueError('Not more than one of `a_min` and `a_max` may be `None`.')
@@ -196,7 +196,7 @@ def clip(a, a_min, a_max):  # pylint: disable=missing-docstring
             *np_utils.tf_broadcast(a.data, a_min.data, a_max.data)))
 
 
-@np_utils.np_doc(np.matmul)
+@np_utils.np_doc('matmul')
 def matmul(x1, x2):  # pylint: disable=missing-docstring
 
   def f(x1, x2):
@@ -215,12 +215,12 @@ def matmul(x1, x2):  # pylint: disable=missing-docstring
   return _bin_op(f, x1, x2)
 
 
-@np_utils.np_doc(np.tensordot)
+@np_utils.np_doc('tensordot')
 def tensordot(a, b, axes=2):
   return _bin_op(lambda a, b: math_ops.tensordot(a, b, axes=axes), a, b)
 
 
-@np_utils.np_doc_only(np.inner)
+@np_utils.np_doc_only('inner')
 def inner(a, b):  # pylint: disable=missing-function-docstring
 
   def f(a, b):
@@ -233,7 +233,7 @@ def inner(a, b):  # pylint: disable=missing-function-docstring
   return _bin_op(f, a, b)
 
 
-@np_utils.np_doc(np.cross)
+@np_utils.np_doc('cross')
 def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None):  # pylint: disable=missing-docstring
 
   def f(a, b):  # pylint: disable=missing-docstring
@@ -309,7 +309,7 @@ def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None):  # pylint: disable=mis
   return _bin_op(f, a, b)
 
 
-@np_utils.np_doc_only(np.vdot)
+@np_utils.np_doc_only('vdot')
 def vdot(a, b):  # pylint: disable=missing-docstring
   a, b = np_array_ops._promote_dtype(a, b)
   a = np_array_ops.reshape(a, [-1])
@@ -319,27 +319,27 @@ def vdot(a, b):  # pylint: disable=missing-docstring
   return dot(a, b)
 
 
-@np_utils.np_doc(np.power)
+@np_utils.np_doc('power')
 def power(x1, x2):
   return _bin_op(math_ops.pow, x1, x2)
 
 
-@np_utils.np_doc(np.float_power)
+@np_utils.np_doc('float_power')
 def float_power(x1, x2):
   return power(x1, x2)
 
 
-@np_utils.np_doc(np.arctan2)
+@np_utils.np_doc('arctan2')
 def arctan2(x1, x2):
   return _bin_op(math_ops.atan2, x1, x2)
 
 
-@np_utils.np_doc(np.nextafter)
+@np_utils.np_doc('nextafter')
 def nextafter(x1, x2):
   return _bin_op(math_ops.nextafter, x1, x2)
 
 
-@np_utils.np_doc(np.heaviside)
+@np_utils.np_doc('heaviside')
 def heaviside(x1, x2):  # pylint: disable=missing-function-docstring
 
   def f(x1, x2):
@@ -353,12 +353,12 @@ def heaviside(x1, x2):  # pylint: disable=missing-function-docstring
   return y
 
 
-@np_utils.np_doc(np.hypot)
+@np_utils.np_doc('hypot')
 def hypot(x1, x2):
   return sqrt(square(x1) + square(x2))
 
 
-@np_utils.np_doc(np.kron)
+@np_utils.np_doc('kron')
 def kron(a, b):  # pylint: disable=missing-function-docstring
   # pylint: disable=protected-access,g-complex-comprehension
   a, b = np_array_ops._promote_dtype(a, b)
@@ -389,7 +389,7 @@ def kron(a, b):  # pylint: disable=missing-function-docstring
   return np_array_ops.reshape(a_reshaped * b_reshaped, out_shape)
 
 
-@np_utils.np_doc(np.outer)
+@np_utils.np_doc('outer')
 def outer(a, b):
 
   def f(a, b):
@@ -399,7 +399,7 @@ def outer(a, b):
 
 
 # This can also be implemented via tf.reduce_logsumexp
-@np_utils.np_doc(np.logaddexp)
+@np_utils.np_doc('logaddexp')
 def logaddexp(x1, x2):
   amax = maximum(x1, x2)
   delta = x1 - x2
@@ -409,7 +409,7 @@ def logaddexp(x1, x2):
       amax + log1p(exp(-abs(delta))))
 
 
-@np_utils.np_doc(np.logaddexp2)
+@np_utils.np_doc('logaddexp2')
 def logaddexp2(x1, x2):
   amax = maximum(x1, x2)
   delta = x1 - x2
@@ -419,7 +419,7 @@ def logaddexp2(x1, x2):
       amax + log1p(exp2(-abs(delta))) / np.log(2))
 
 
-@np_utils.np_doc(np.polyval)
+@np_utils.np_doc('polyval')
 def polyval(p, x):  # pylint: disable=missing-function-docstring
 
   def f(p, x):
@@ -437,7 +437,7 @@ def polyval(p, x):  # pylint: disable=missing-function-docstring
   return _bin_op(f, p, x)
 
 
-@np_utils.np_doc(np.isclose)
+@np_utils.np_doc('isclose')
 def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):  # pylint: disable=missing-docstring
 
   def f(a, b):  # pylint: disable=missing-docstring
@@ -455,7 +455,7 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):  # pylint: disable=m
   return _bin_op(f, a, b)
 
 
-@np_utils.np_doc(np.allclose)
+@np_utils.np_doc('allclose')
 def allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
   return np_array_ops.all(
       isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan))
@@ -490,13 +490,13 @@ def _tf_gcd(x1, x2):  # pylint: disable=missing-function-docstring
 
 
 # Note that np.gcd may not be present in some supported versions of numpy.
-@np_utils.np_doc(None, np_fun_name='gcd')
+@np_utils.np_doc('gcd')
 def gcd(x1, x2):
   return _bin_op(_tf_gcd, x1, x2)
 
 
 # Note that np.lcm may not be present in some supported versions of numpy.
-@np_utils.np_doc(None, np_fun_name='lcm')
+@np_utils.np_doc('lcm')
 def lcm(x1, x2):  # pylint: disable=missing-function-docstring
 
   def f(x1, x2):
@@ -527,22 +527,22 @@ def _bitwise_binary_op(tf_fn, x1, x2):  # pylint: disable=missing-function-docst
   return _bin_op(f, x1, x2)
 
 
-@np_utils.np_doc(np.bitwise_and)
+@np_utils.np_doc('bitwise_and')
 def bitwise_and(x1, x2):
   return _bitwise_binary_op(bitwise_ops.bitwise_and, x1, x2)
 
 
-@np_utils.np_doc(np.bitwise_or)
+@np_utils.np_doc('bitwise_or')
 def bitwise_or(x1, x2):
   return _bitwise_binary_op(bitwise_ops.bitwise_or, x1, x2)
 
 
-@np_utils.np_doc(np.bitwise_xor)
+@np_utils.np_doc('bitwise_xor')
 def bitwise_xor(x1, x2):
   return _bitwise_binary_op(bitwise_ops.bitwise_xor, x1, x2)
 
 
-@np_utils.np_doc(np.bitwise_not)
+@np_utils.np_doc('bitwise_not')
 def bitwise_not(x):
 
   def f(x):
@@ -574,62 +574,62 @@ def _scalar(tf_fn, x, promote_to_float=False):
   return np_utils.tensor_to_ndarray(tf_fn(x.data))
 
 
-@np_utils.np_doc(np.log)
+@np_utils.np_doc('log')
 def log(x):
   return _scalar(math_ops.log, x, True)
 
 
-@np_utils.np_doc(np.exp)
+@np_utils.np_doc('exp')
 def exp(x):
   return _scalar(math_ops.exp, x, True)
 
 
-@np_utils.np_doc(np.sqrt)
+@np_utils.np_doc('sqrt')
 def sqrt(x):
   return _scalar(math_ops.sqrt, x, True)
 
 
-@np_utils.np_doc(np.abs)
+@np_utils.np_doc('abs')
 def abs(x):  # pylint: disable=redefined-builtin
   return _scalar(math_ops.abs, x)
 
 
-@np_utils.np_doc(np.absolute)
+@np_utils.np_doc('absolute')
 def absolute(x):
   return abs(x)
 
 
-@np_utils.np_doc(np.fabs)
+@np_utils.np_doc('fabs')
 def fabs(x):
   return abs(x)
 
 
-@np_utils.np_doc(np.ceil)
+@np_utils.np_doc('ceil')
 def ceil(x):
   return _scalar(math_ops.ceil, x, True)
 
 
-@np_utils.np_doc(np.floor)
+@np_utils.np_doc('floor')
 def floor(x):
   return _scalar(math_ops.floor, x, True)
 
 
-@np_utils.np_doc(np.conj)
+@np_utils.np_doc('conj')
 def conj(x):
   return _scalar(math_ops.conj, x)
 
 
-@np_utils.np_doc(np.negative)
+@np_utils.np_doc('negative')
 def negative(x):
   return _scalar(math_ops.negative, x)
 
 
-@np_utils.np_doc(np.reciprocal)
+@np_utils.np_doc('reciprocal')
 def reciprocal(x):
   return _scalar(math_ops.reciprocal, x)
 
 
-@np_utils.np_doc(np.signbit)
+@np_utils.np_doc('signbit')
 def signbit(x):
 
   def f(x):
@@ -640,67 +640,67 @@ def signbit(x):
   return _scalar(f, x)
 
 
-@np_utils.np_doc(np.sin)
+@np_utils.np_doc('sin')
 def sin(x):
   return _scalar(math_ops.sin, x, True)
 
 
-@np_utils.np_doc(np.cos)
+@np_utils.np_doc('cos')
 def cos(x):
   return _scalar(math_ops.cos, x, True)
 
 
-@np_utils.np_doc(np.tan)
+@np_utils.np_doc('tan')
 def tan(x):
   return _scalar(math_ops.tan, x, True)
 
 
-@np_utils.np_doc(np.sinh)
+@np_utils.np_doc('sinh')
 def sinh(x):
   return _scalar(math_ops.sinh, x, True)
 
 
-@np_utils.np_doc(np.cosh)
+@np_utils.np_doc('cosh')
 def cosh(x):
   return _scalar(math_ops.cosh, x, True)
 
 
-@np_utils.np_doc(np.tanh)
+@np_utils.np_doc('tanh')
 def tanh(x):
   return _scalar(math_ops.tanh, x, True)
 
 
-@np_utils.np_doc(np.arcsin)
+@np_utils.np_doc('arcsin')
 def arcsin(x):
   return _scalar(math_ops.asin, x, True)
 
 
-@np_utils.np_doc(np.arccos)
+@np_utils.np_doc('arccos')
 def arccos(x):
   return _scalar(math_ops.acos, x, True)
 
 
-@np_utils.np_doc(np.arctan)
+@np_utils.np_doc('arctan')
 def arctan(x):
   return _scalar(math_ops.atan, x, True)
 
 
-@np_utils.np_doc(np.arcsinh)
+@np_utils.np_doc('arcsinh')
 def arcsinh(x):
   return _scalar(math_ops.asinh, x, True)
 
 
-@np_utils.np_doc(np.arccosh)
+@np_utils.np_doc('arccosh')
 def arccosh(x):
   return _scalar(math_ops.acosh, x, True)
 
 
-@np_utils.np_doc(np.arctanh)
+@np_utils.np_doc('arctanh')
 def arctanh(x):
   return _scalar(math_ops.atanh, x, True)
 
 
-@np_utils.np_doc(np.deg2rad)
+@np_utils.np_doc('deg2rad')
 def deg2rad(x):
 
   def f(x):
@@ -709,7 +709,7 @@ def deg2rad(x):
   return _scalar(f, x, True)
 
 
-@np_utils.np_doc(np.rad2deg)
+@np_utils.np_doc('rad2deg')
 def rad2deg(x):
   return x * (180.0 / np.pi)
 
@@ -719,7 +719,7 @@ _tf_float_types = [
 ]
 
 
-@np_utils.np_doc(np.angle)
+@np_utils.np_doc('angle')
 def angle(z, deg=False):  # pylint: disable=missing-function-docstring
 
   def f(x):
@@ -735,7 +735,7 @@ def angle(z, deg=False):  # pylint: disable=missing-function-docstring
   return y
 
 
-@np_utils.np_doc(np.cbrt)
+@np_utils.np_doc('cbrt')
 def cbrt(x):
 
   def f(x):
@@ -746,12 +746,12 @@ def cbrt(x):
   return _scalar(f, x, True)
 
 
-@np_utils.np_doc(np.conjugate)
+@np_utils.np_doc('conjugate')
 def conjugate(x):
   return _scalar(math_ops.conj, x)
 
 
-@np_utils.np_doc(np.exp2)
+@np_utils.np_doc('exp2')
 def exp2(x):
 
   def f(x):
@@ -760,12 +760,12 @@ def exp2(x):
   return _scalar(f, x, True)
 
 
-@np_utils.np_doc(np.expm1)
+@np_utils.np_doc('expm1')
 def expm1(x):
   return _scalar(math_ops.expm1, x, True)
 
 
-@np_utils.np_doc(np.fix)
+@np_utils.np_doc('fix')
 def fix(x):
 
   def f(x):
@@ -774,36 +774,36 @@ def fix(x):
   return _scalar(f, x, True)
 
 
-@np_utils.np_doc(np.iscomplex)
+@np_utils.np_doc('iscomplex')
 def iscomplex(x):
   return np_array_ops.imag(x) != 0
 
 
-@np_utils.np_doc(np.isreal)
+@np_utils.np_doc('isreal')
 def isreal(x):
   return np_array_ops.imag(x) == 0
 
 
-@np_utils.np_doc(np.iscomplexobj)
+@np_utils.np_doc('iscomplexobj')
 def iscomplexobj(x):
   x = np_array_ops.array(x)
   return np.issubdtype(x.dtype, np.complexfloating)
 
 
-@np_utils.np_doc(np.isrealobj)
+@np_utils.np_doc('isrealobj')
 def isrealobj(x):
   return not iscomplexobj(x)
 
 
-@np_utils.np_doc(np.isnan)
+@np_utils.np_doc('isnan')
 def isnan(x):
   return _scalar(math_ops.is_nan, x, True)
 
 
-def _make_nan_reduction(onp_reduction, reduction, init_val):
+def _make_nan_reduction(np_fun_name, reduction, init_val):
   """Helper to generate nan* functions."""
 
-  @np_utils.np_doc(onp_reduction)
+  @np_utils.np_doc(np_fun_name)
   def nan_reduction(a, axis=None, dtype=None, keepdims=False):
     a = np_array_ops.array(a)
     v = np_array_ops.array(init_val, dtype=a.dtype)
@@ -816,11 +816,11 @@ def _make_nan_reduction(onp_reduction, reduction, init_val):
   return nan_reduction
 
 
-nansum = _make_nan_reduction(np.nansum, np_array_ops.sum, 0)
-nanprod = _make_nan_reduction(np.nanprod, np_array_ops.prod, 1)
+nansum = _make_nan_reduction('nansum', np_array_ops.sum, 0)
+nanprod = _make_nan_reduction('nanprod', np_array_ops.prod, 1)
 
 
-@np_utils.np_doc(np.nanmean)
+@np_utils.np_doc('nanmean')
 def nanmean(a, axis=None, dtype=None, keepdims=None):  # pylint: disable=missing-docstring
   a = np_array_ops.array(a)
   if np.issubdtype(a.dtype, np.bool_) or np.issubdtype(a.dtype, np.integer):
@@ -833,47 +833,47 @@ def nanmean(a, axis=None, dtype=None, keepdims=None):  # pylint: disable=missing
   return nansum(a, axis=axis, dtype=dtype, keepdims=keepdims) / normalizer
 
 
-@np_utils.np_doc(np.isfinite)
+@np_utils.np_doc('isfinite')
 def isfinite(x):
   return _scalar(math_ops.is_finite, x, True)
 
 
-@np_utils.np_doc(np.isinf)
+@np_utils.np_doc('isinf')
 def isinf(x):
   return _scalar(math_ops.is_inf, x, True)
 
 
-@np_utils.np_doc(np.isneginf)
+@np_utils.np_doc('isneginf')
 def isneginf(x):
   return x == np_array_ops.full_like(x, -np.inf)
 
 
-@np_utils.np_doc(np.isposinf)
+@np_utils.np_doc('isposinf')
 def isposinf(x):
   return x == np_array_ops.full_like(x, np.inf)
 
 
-@np_utils.np_doc(np.log2)
+@np_utils.np_doc('log2')
 def log2(x):
   return log(x) / np.log(2)
 
 
-@np_utils.np_doc(np.log10)
+@np_utils.np_doc('log10')
 def log10(x):
   return log(x) / np.log(10)
 
 
-@np_utils.np_doc(np.log1p)
+@np_utils.np_doc('log1p')
 def log1p(x):
   return _scalar(math_ops.log1p, x, True)
 
 
-@np_utils.np_doc(np.positive)
+@np_utils.np_doc('positive')
 def positive(x):
   return _scalar(lambda x: x, x)
 
 
-@np_utils.np_doc(np.sinc)
+@np_utils.np_doc('sinc')
 def sinc(x):
 
   def f(x):
@@ -884,12 +884,12 @@ def sinc(x):
   return _scalar(f, x, True)
 
 
-@np_utils.np_doc(np.square)
+@np_utils.np_doc('square')
 def square(x):
   return _scalar(math_ops.square, x)
 
 
-@np_utils.np_doc(np.diff)
+@np_utils.np_doc('diff')
 def diff(a, n=1, axis=-1):  # pylint: disable=missing-function-docstring
 
   def f(a):
@@ -964,37 +964,37 @@ def _comparison(tf_fun, x1, x2, cast_bool_to_int=False):
   return np_utils.tensor_to_ndarray(tf_fun(x1, x2))
 
 
-@np_utils.np_doc(np.equal)
+@np_utils.np_doc('equal')
 def equal(x1, x2):
   return _comparison(math_ops.equal, x1, x2)
 
 
-@np_utils.np_doc(np.not_equal)
+@np_utils.np_doc('not_equal')
 def not_equal(x1, x2):
   return _comparison(math_ops.not_equal, x1, x2)
 
 
-@np_utils.np_doc(np.greater)
+@np_utils.np_doc('greater')
 def greater(x1, x2):
   return _comparison(math_ops.greater, x1, x2, True)
 
 
-@np_utils.np_doc(np.greater_equal)
+@np_utils.np_doc('greater_equal')
 def greater_equal(x1, x2):
   return _comparison(math_ops.greater_equal, x1, x2, True)
 
 
-@np_utils.np_doc(np.less)
+@np_utils.np_doc('less')
 def less(x1, x2):
   return _comparison(math_ops.less, x1, x2, True)
 
 
-@np_utils.np_doc(np.less_equal)
+@np_utils.np_doc('less_equal')
 def less_equal(x1, x2):
   return _comparison(math_ops.less_equal, x1, x2, True)
 
 
-@np_utils.np_doc(np.array_equal)
+@np_utils.np_doc('array_equal')
 def array_equal(a1, a2):  # pylint: disable=missing-function-docstring
 
   def f(x1, x2):
@@ -1017,22 +1017,22 @@ def _logical_binary_op(tf_fun, x1, x2):
   return np_utils.tensor_to_ndarray(tf_fun(x1.data, x2.data))
 
 
-@np_utils.np_doc(np.logical_and)
+@np_utils.np_doc('logical_and')
 def logical_and(x1, x2):
   return _logical_binary_op(math_ops.logical_and, x1, x2)
 
 
-@np_utils.np_doc(np.logical_or)
+@np_utils.np_doc('logical_or')
 def logical_or(x1, x2):
   return _logical_binary_op(math_ops.logical_or, x1, x2)
 
 
-@np_utils.np_doc(np.logical_xor)
+@np_utils.np_doc('logical_xor')
 def logical_xor(x1, x2):
   return _logical_binary_op(math_ops.logical_xor, x1, x2)
 
 
-@np_utils.np_doc(np.logical_not)
+@np_utils.np_doc('logical_not')
 def logical_not(x):
   x = np_array_ops.array(x, dtype=np.bool_)
   return np_utils.tensor_to_ndarray(math_ops.logical_not(x.data))
@@ -1047,7 +1047,7 @@ setattr(np_arrays.ndarray, '__eq__', _wrap(equal))
 setattr(np_arrays.ndarray, '__ne__', _wrap(not_equal))
 
 
-@np_utils.np_doc(np.linspace)
+@np_utils.np_doc('linspace')
 def linspace(  # pylint: disable=missing-docstring
     start,
     stop,
@@ -1086,7 +1086,7 @@ def linspace(  # pylint: disable=missing-docstring
     return np_arrays.tensor_to_ndarray(result)
 
 
-@np_utils.np_doc(np.logspace)
+@np_utils.np_doc('logspace')
 def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0):
   dtype = np_utils.result_type(start, stop, dtype)
   result = linspace(
@@ -1097,7 +1097,7 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0):
   return np_arrays.tensor_to_ndarray(result)
 
 
-@np_utils.np_doc(np.geomspace)
+@np_utils.np_doc('geomspace')
 def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):  # pylint: disable=missing-docstring
   dtype = dtype or np_utils.result_type(start, stop, float(num),
                                         np_array_ops.zeros((), dtype))
@@ -1121,13 +1121,13 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):  # pylint
   return np_utils.tensor_to_ndarray(math_ops.cast(res, dtype))
 
 
-@np_utils.np_doc(np.ptp)
+@np_utils.np_doc('ptp')
 def ptp(a, axis=None, keepdims=None):
   return (np_array_ops.amax(a, axis=axis, keepdims=keepdims) -
           np_array_ops.amin(a, axis=axis, keepdims=keepdims))
 
 
-@np_utils.np_doc_only(np.concatenate)
+@np_utils.np_doc_only('concatenate')
 def concatenate(arys, axis=0):
   if not isinstance(arys, (list, tuple)):
     arys = [arys]
@@ -1138,7 +1138,7 @@ def concatenate(arys, axis=0):
   return np_arrays.tensor_to_ndarray(array_ops.concat(arys, axis))
 
 
-@np_utils.np_doc_only(np.tile)
+@np_utils.np_doc_only('tile')
 def tile(a, reps):  # pylint: disable=missing-function-docstring
   a = np_array_ops.array(a).data
   reps = np_array_ops.array(reps, dtype=dtypes.int32).reshape([-1]).data
@@ -1155,13 +1155,13 @@ def tile(a, reps):  # pylint: disable=missing-function-docstring
   return np_arrays.tensor_to_ndarray(array_ops.tile(a, reps))
 
 
-@np_utils.np_doc(np.count_nonzero)
+@np_utils.np_doc('count_nonzero')
 def count_nonzero(a, axis=None):
   return np_arrays.tensor_to_ndarray(
       math_ops.count_nonzero(np_array_ops.array(a).data, axis))
 
 
-@np_utils.np_doc(np.argsort)
+@np_utils.np_doc('argsort')
 def argsort(a, axis=-1, kind='quicksort', order=None):  # pylint: disable=missing-docstring
   # TODO(nareshmodi): make string tensors also work.
   if kind not in ('quicksort', 'stable'):
@@ -1186,7 +1186,7 @@ def argsort(a, axis=-1, kind='quicksort', order=None):  # pylint: disable=missin
   return np_array_ops.array(tf_ans, dtype=np.intp)
 
 
-@np_utils.np_doc(np.sort)
+@np_utils.np_doc('sort')
 def sort(a, axis=-1, kind='quicksort', order=None):  # pylint: disable=missing-docstring
   if kind != 'quicksort':
     raise ValueError("Only 'quicksort' is supported.")
@@ -1212,17 +1212,17 @@ def _argminmax(fn, a, axis=None):
   return np_utils.tensor_to_ndarray(fn(input=a_t, axis=axis))
 
 
-@np_utils.np_doc(np.argmax)
+@np_utils.np_doc('argmax')
 def argmax(a, axis=None):
   return _argminmax(math_ops.argmax, a, axis)
 
 
-@np_utils.np_doc(np.argmin)
+@np_utils.np_doc('argmin')
 def argmin(a, axis=None):
   return _argminmax(math_ops.argmin, a, axis)
 
 
-@np_utils.np_doc(np.append)
+@np_utils.np_doc('append')
 def append(arr, values, axis=None):
   if axis is None:
     return concatenate([np_array_ops.ravel(arr), np_array_ops.ravel(values)], 0)
@@ -1230,7 +1230,7 @@ def append(arr, values, axis=None):
     return concatenate([arr, values], axis=axis)
 
 
-@np_utils.np_doc(np.average)
+@np_utils.np_doc('average')
 def average(a, axis=None, weights=None, returned=False):  # pylint: disable=missing-docstring
   if axis is not None and not isinstance(axis, six.integer_types):
     # TODO(wangpeng): Support tuple of ints as `axis`
@@ -1293,7 +1293,7 @@ def average(a, axis=None, weights=None, returned=False):  # pylint: disable=miss
   return avg
 
 
-@np_utils.np_doc(np.trace)
+@np_utils.np_doc('trace')
 def trace(a, offset=0, axis1=0, axis2=1, dtype=None):  # pylint: disable=missing-docstring
   if dtype:
     dtype = np_utils.result_type(dtype)
@@ -1311,7 +1311,7 @@ def trace(a, offset=0, axis1=0, axis2=1, dtype=None):  # pylint: disable=missing
   return np_array_ops.sum(a, -1, dtype)
 
 
-@np_utils.np_doc(np.meshgrid)
+@np_utils.np_doc('meshgrid')
 def meshgrid(*xi, **kwargs):
   """This currently requires copy=True and sparse=False."""
   sparse = kwargs.get('sparse', False)
diff --git a/tensorflow/python/ops/numpy_ops/np_utils.py b/tensorflow/python/ops/numpy_ops/np_utils.py
index 186e56816fe..04ec38d611c 100644
--- a/tensorflow/python/ops/numpy_ops/np_utils.py
+++ b/tensorflow/python/ops/numpy_ops/np_utils.py
@@ -233,28 +233,71 @@ def _is_compatible_param_kind(a, b):
   return relax(a) == relax(b)
 
 
-def np_doc(np_fun, np_fun_name=None):
-  """Attachs numpy docstring to a function.
+def _prepare_np_fun_name_and_fun(np_fun_name, np_fun):
+  """Mutually propagates information between `np_fun_name` and `np_fun`.
+
+  If one is None and the other is not, we'll try to make the former not None
+  on a best-effort basis.
 
   Args:
-    np_fun: the numpy function whose docstring will be used.
-    np_fun_name: optional name for the np_fun symbol. At least one of np_fun or
+    np_fun_name: name for the np_fun symbol. At least one of np_fun or
       np_fun_name shoud be set.
+    np_fun: the numpy function whose docstring will be used.
 
   Returns:
-    A function decorator that attaches the docstring from `np_fun` to the
-    decorated function.
+    Processed `np_fun_name` and `np_fun`.
   """
+  if np_fun_name is not None:
+    assert isinstance(np_fun_name, str)
+  if np_fun is not None:
+    assert not isinstance(np_fun, str)
   if np_fun is None:
     assert np_fun_name is not None
     try:
       np_fun = getattr(np, str(np_fun_name))
     except AttributeError:
       np_fun = None
-  np_sig = _np_signature(np_fun)
   if np_fun_name is None:
     assert np_fun is not None
     np_fun_name = np_fun.__name__
+  return np_fun_name, np_fun
+
+
+def _np_doc_helper(f, np_f, np_fun_name=None, unsupported_params=None):
+  """Helper to get docs."""
+  assert np_f or np_fun_name
+  if not np_fun_name:
+    np_fun_name = np_f.__name__
+  doc = 'TensorFlow variant of `numpy.%s`.\n\n' % np_fun_name
+  if unsupported_params:
+    doc += 'Unsupported arguments: ' + ', '.join(
+        '`' + name + '`' for name in unsupported_params) + '.\n\n'
+  if _has_docstring(f):
+    doc += f.__doc__
+    doc = _add_blank_line(doc)
+  if _has_docstring(np_f):
+    doc += 'Documentation for `numpy.%s`:\n\n' % np_f.__name__
+    # TODO(wangpeng): It looks like code snippets in numpy doc don't work
+    # correctly with doctest. Fix that and remove the reformatting of the np_f
+    # comment.
+    doc += np_f.__doc__.replace('>>>', '>')
+  return doc
+
+
+def np_doc(np_fun_name, np_fun=None):
+  """Attachs numpy docstring to a function.
+
+  Args:
+    np_fun_name: name for the np_fun symbol. At least one of np_fun or
+      np_fun_name should be set.
+    np_fun: (optional) the numpy function whose docstring will be used.
+
+  Returns:
+    A function decorator that attaches the docstring from `np_fun` to the
+    decorated function.
+  """
+  np_fun_name, np_fun = _prepare_np_fun_name_and_fun(np_fun_name, np_fun)
+  np_sig = _np_signature(np_fun)
 
   def decorator(f):
     """The decorator."""
@@ -294,44 +337,24 @@ def np_doc(np_fun, np_fun_name=None):
   return decorator
 
 
-def _np_doc_helper(f, np_f, np_fun_name=None, unsupported_params=None):
-  """Helper to get docs."""
-  if not unsupported_params and not _has_docstring(f) and _has_docstring(np_f):
-    # TODO(wangpeng): It looks like code snippets in numpy doc don't work
-    # correctly with doctest. Fix that and remove the reformatting of the np_f
-    # comment, here and below.
-    return np_f.__doc__.replace('>>>', '>')
-  assert np_f or np_fun_name
-  if not np_fun_name:
-    np_fun_name = np_f.__name__
-  doc = 'TensorFlow variant of `numpy.%s`.\n\n' % np_fun_name
-  if unsupported_params:
-    doc += 'Unsupported arguments: ' + ', '.join(
-        '`' + name + '`' for name in unsupported_params) + '.\n\n'
-  if _has_docstring(f):
-    doc += f.__doc__
-    doc = _add_blank_line(doc)
-  if _has_docstring(np_f):
-    doc += 'Documentation for `numpy.%s`:\n\n' % np_f.__name__
-    doc += np_f.__doc__.replace('>>>', '>')
-  return doc
-
-
-def np_doc_only(np_f):
+def np_doc_only(np_fun_name, np_fun=None):
   """Attachs numpy docstring to a function.
 
   This differs from np_doc in that it doesn't check for a match in signature.
 
   Args:
-    np_f: the numpy function whose docstring will be used.
+    np_fun_name: name for the np_fun symbol. At least one of np_fun or
+      np_fun_name should be set.
+    np_fun: (optional) the numpy function whose docstring will be used.
 
   Returns:
-    A function decorator that attaches the docstring from `np_f` to the
+    A function decorator that attaches the docstring from `np_fun` to the
     decorated function.
   """
+  np_fun_name, np_fun = _prepare_np_fun_name_and_fun(np_fun_name, np_fun)
 
   def decorator(f):
-    f.__doc__ = _np_doc_helper(f, np_f)
+    f.__doc__ = _np_doc_helper(f, np_fun, np_fun_name=np_fun_name)
     return f
 
   return decorator
diff --git a/tensorflow/python/ops/numpy_ops/np_utils_test.py b/tensorflow/python/ops/numpy_ops/np_utils_test.py
index 6d0dfa51185..38b51f05e6e 100644
--- a/tensorflow/python/ops/numpy_ops/np_utils_test.py
+++ b/tensorflow/python/ops/numpy_ops/np_utils_test.py
@@ -31,7 +31,7 @@ class UtilsTest(test.TestCase):
       """np_fun docstring."""
       return
 
-    @np_utils.np_doc(np_fun)
+    @np_utils.np_doc(None, np_fun=np_fun)
     def f():
       """f docstring."""
       return
@@ -47,7 +47,7 @@ np_fun docstring."""
 
   def testNpDocName(self):
 
-    @np_utils.np_doc(None, np_fun_name='foo')
+    @np_utils.np_doc('foo')
     def f():
       """f docstring."""
       return
@@ -70,20 +70,20 @@ f docstring.
     # pylint: disable=unused-variable
     with self.assertRaisesRegexp(TypeError, 'Cannot find parameter'):
 
-      @np_utils.np_doc(np_fun)
+      @np_utils.np_doc(None, np_fun=np_fun)
       def f1(a):
         return
 
     with self.assertRaisesRegexp(TypeError, 'is of kind'):
 
-      @np_utils.np_doc(np_fun)
+      @np_utils.np_doc(None, np_fun=np_fun)
       def f2(x, kwargs):
         return
 
     with self.assertRaisesRegexp(TypeError,
                                  'Parameter "y" should have a default value'):
 
-      @np_utils.np_doc(np_fun)
+      @np_utils.np_doc(None, np_fun=np_fun)
       def f3(x, y):
         return
 

From f5a0fdaa0aeff548623811b887c6da34303ab25f Mon Sep 17 00:00:00 2001
From: Lukas Geiger 
Date: Sat, 20 Jun 2020 01:41:44 +0200
Subject: [PATCH 0684/1390] Use executing_eagerly_outside_functions

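A hypothetical snippet (not part of this patch) showing the distinction the new check
relies on: while a `tf.function` body is being traced, `tf.executing_eagerly()` reports
False, whereas `executing_eagerly_outside_functions()` stays True as long as the
enclosing program runs eagerly (and is False only in a legacy v1 graph program):

```python
import tensorflow as tf


@tf.function
def probe():
  # While this body is being traced, eager execution is "off" ...
  print('executing_eagerly:', tf.executing_eagerly())  # False
  # ... but the enclosing program is still an eager TF2 program.
  print('eagerly outside functions:',
        tf.compat.v1.executing_eagerly_outside_functions())  # True
  return tf.constant(0)


probe()
```
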
---
 .../keras/mixed_precision/experimental/autocast_variable.py   | 4 ++--
 .../mixed_precision/experimental/autocast_variable_test.py    | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
index b60100c7b48..a717fbb41e2 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -190,7 +190,7 @@ class AutoCastVariable(variables.Variable, core.Tensor):
 
   def _apply_assign_update(
       self, update_fn, value, use_locking=None, name=None, read_value=True):
-    if context.executing_eagerly() or ops.inside_function():
+    if ops.executing_eagerly_outside_functions():
       assign_op = update_fn(value, use_locking, name, False)
       return self if read_value else assign_op
 
@@ -202,7 +202,7 @@ class AutoCastVariable(variables.Variable, core.Tensor):
 
   def _apply_update(self, update_fn, *args, **kwargs):
     update_var = update_fn(*args, **kwargs)
-    if context.executing_eagerly() or ops.inside_function():
+    if ops.executing_eagerly_outside_functions():
       return self
 
     # Fallback to wrapping the returned variable in graph mode if possible
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
index 9036109af96..cb5a5d7cb3f 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
@@ -345,6 +345,9 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(maybe_distribute)
   def test_assign_tf_function(self, distribution):
+    if not context.executing_eagerly():
+      self.skipTest("Test is not compatible with graph mode")
+
     with distribution.scope():
       x = get_var(0., dtypes.float32)
       x = autocast_variable.create_autocast_variable(x)

From 2c5e5a643cc6ba68da46e4e2058f178434848dd8 Mon Sep 17 00:00:00 2001
From: Peng Wang 
Date: Fri, 19 Jun 2020 16:37:27 -0700
Subject: [PATCH 0685/1390] [TF-numpy] Adds an accessor class for numpy_ops so
 that it can be exported via `tf_export`.

PiperOrigin-RevId: 317402589
Change-Id: I6bb5f4f9d3b42cf8c2653d60d80c20b37d0bd59f
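
For context, the forwarding pattern the accessor uses can be sketched on its own (a toy
example using `math` as a stand-in for the `numpy_ops` module; names here are
illustrative only):

```python
import math  # stand-in for tensorflow.python.ops.numpy_ops


class ModuleAccessor(object):
  """Forwards attribute accesses to a wrapped module."""

  def __init__(self, module):
    self._module = module

  def __getattr__(self, attr):
    # __getattr__ is consulted only when normal lookup fails, so attributes
    # defined on the accessor itself are not forwarded.
    return getattr(self._module, attr)


accessor = ModuleAccessor(math)
print(accessor.sqrt(4.0))  # 2.0, resolved on the wrapped module
```

An instance of such a class can then be given a public export name while the wrapped
module itself stays private.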
---
 tensorflow/python/ops/numpy_ops/BUILD         |  1 +
 .../python/ops/numpy_ops/np_accessor.py       | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 tensorflow/python/ops/numpy_ops/np_accessor.py

diff --git a/tensorflow/python/ops/numpy_ops/BUILD b/tensorflow/python/ops/numpy_ops/BUILD
index 3f18a7b3e01..3479a622bc0 100644
--- a/tensorflow/python/ops/numpy_ops/BUILD
+++ b/tensorflow/python/ops/numpy_ops/BUILD
@@ -11,6 +11,7 @@ py_library(
     name = "numpy",
     srcs = [
         "__init__.py",
+        "np_accessor.py",
         "np_array_ops.py",
         "np_arrays.py",
         "np_dtypes.py",
diff --git a/tensorflow/python/ops/numpy_ops/np_accessor.py b/tensorflow/python/ops/numpy_ops/np_accessor.py
new file mode 100644
index 00000000000..64786d2c50a
--- /dev/null
+++ b/tensorflow/python/ops/numpy_ops/np_accessor.py
@@ -0,0 +1,32 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An accessor class for numpy_ops contents."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import numpy_ops
+
+
+class Numpy:
+  """An accessor class that forwards attribute accesses to module `numpy_ops`.
+  """
+
+  def __getattr__(self, attr):
+    return getattr(numpy_ops, attr)
+
+
+numpy = Numpy()

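As a reading aid, a minimal usage sketch of the accessor added above (not part of the patch): attribute lookups on the `numpy` instance are forwarded verbatim to the `numpy_ops` module, which is what lets the whole module surface be exposed later under a single `tf_export` symbol. The calls below assume `numpy_ops` exposes `asarray` and `sum`, as elsewhere in the package.

# Hypothetical usage; import path taken from the new file in the diff.
from tensorflow.python.ops.numpy_ops import np_accessor

a = np_accessor.numpy.asarray([1., 2., 3.])  # forwarded to numpy_ops.asarray
s = np_accessor.numpy.sum(a)                 # forwarded to numpy_ops.sum
print(s)
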
From f51b649394b8b0c2cbf8179ebd4b64b5a915110b Mon Sep 17 00:00:00 2001
From: Austin Anderson 
Date: Fri, 19 Jun 2020 16:39:28 -0700
Subject: [PATCH 0686/1390] Experimental internal CI changes

PiperOrigin-RevId: 317402932
Change-Id: Ibaef72f01e06f4518b85b21f82000599eed7e4bd
---
 .../per_release/scripts/nonpip_gpu.sh         | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 tensorflow/tools/ci_build/per_release/scripts/nonpip_gpu.sh

diff --git a/tensorflow/tools/ci_build/per_release/scripts/nonpip_gpu.sh b/tensorflow/tools/ci_build/per_release/scripts/nonpip_gpu.sh
new file mode 100644
index 00000000000..6fd7c3d5854
--- /dev/null
+++ b/tensorflow/tools/ci_build/per_release/scripts/nonpip_gpu.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+set -e
+set -x
+
+if [[ -n "${KOKORO_ARTIFACTS_DIR}" ]]; then
+  cd "${KOKORO_ARTIFACTS_DIR}"
+  ls
+  source "$(find "${KOKORO_ARTIFACTS_DIR}" -name "common_google.sh")"
+  cd git/gob-tensorflow
+
+fi
+
+if [[ -z "${TF_KOKORO_PY_VERSION}" ]]; then
+  echo "You must set TF_KOKORO_PY_VERSION, e.g. '3.7', indicating the "
+  echo "Python version to be used for this build."
+  exit 2
+fi
+
+source tensorflow/tools/ci_build/release/common.sh
+
+install_ubuntu_16_pip_deps "pip${TF_KOKORO_PY_VERSION}"
+# Update bazel
+install_bazelisk
+
+# Run configure.
+export TF_NEED_GCP=1
+export TF_NEED_HDFS=1
+export TF_NEED_S3=1
+export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=10
+export TF_CUDNN_VERSION=7
+export TF_NEED_TENSORRT=1
+export TENSORRT_INSTALL_PATH=/usr/local/tensorrt
+export CC_OPT_FLAGS='-mavx'
+export PYTHON_BIN_PATH=$(which "python${TF_KOKORO_PY_VERSION}")
+export TF2_BEHAVIOR=1
+export PROJECT_NAME="tensorflow_gpu"
+export LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$TENSORRT_INSTALL_PATH/lib"
+export TF_CUDA_COMPUTE_CAPABILITIES=sm_35,sm_37,sm_52,sm_60,sm_61,compute_70
+
+yes "" | "$PYTHON_BIN_PATH" configure.py
+
+# Get the default test targets for bazel.
+source tensorflow/tools/ci_build/build_scripts/PRESUBMIT_BUILD_TARGETS.sh
+
+# Exclude -no_oss_py36, for example
+tag_filters="gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_oss_py${TF_KOKORO_PY_VERSION//.}"
+
+set +e
+bazel test --config=cuda --config=opt \
+  --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain \
+  --linkopt=-lrt \
+  --action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \
+  --test_lang_filters=py \
+  --build_tag_filters=${tag_filters} \
+  --test_tag_filters=${tag_filters} \
+  --test_timeout="300,450,1200,3600" --local_test_jobs=4 \
+  --test_output=errors --verbose_failures=true --keep_going \
+  --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
+  -- ${DEFAULT_BAZEL_TARGETS} -//tensorflow/lite/...
+test_xml_summary_exit

From 6116b7f9114f28dcffd685222285a8c5f7db3daa Mon Sep 17 00:00:00 2001
From: George Karpenkov 
Date: Fri, 19 Jun 2020 17:06:02 -0700
Subject: [PATCH 0687/1390] [XLA] [client] Implement a RunAsync overload which
 does not need a vector of shapes

PiperOrigin-RevId: 317406952
Change-Id: I69d8cc8a68ffdfbf70e2969f5df5e6adba7d2e1d
---
 tensorflow/compiler/xla/client/local_client.cc | 10 ++++++++++
 tensorflow/compiler/xla/client/local_client.h  |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index aa252067e19..5fc9909fa2a 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -320,6 +320,16 @@ StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
   return std::move(outputs);
 }
 
+StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
+    std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options) {
+  std::vector<const Shape*> argument_shapes;
+  argument_shapes.reserve(arguments.size());
+  for (const ExecutionInput& arg : arguments) {
+    argument_shapes.push_back(&arg.shape());
+  }
+  return RunAsync(argument_shapes, std::move(arguments), run_options);
+}
+
 se::Platform* LocalClient::platform() const {
   return local_service_->backend().platform();
 }
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 3241ac73d54..8b91f4a1739 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -68,6 +68,9 @@ class LocalExecutable {
       absl::Span<Shape const* const> argument_host_shapes,
       std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options);
 
+  StatusOr<ExecutionOutput> RunAsync(std::vector<ExecutionInput> arguments,
+                                     ExecutableRunOptions run_options);
+
   // Return the options used to build the executable.
   const ExecutableBuildOptions& build_options() const { return build_options_; }
 

From f840a6226841eadee32ada80898279b5d9a7ca3b Mon Sep 17 00:00:00 2001
From: Sanjoy Das 
Date: Fri, 19 Jun 2020 17:08:17 -0700
Subject: [PATCH 0688/1390] Rollback "[TF:TRT] Cosmetic fix."

PiperOrigin-RevId: 317407274
Change-Id: I73cd486acf9091e6678e553ab9b0545288f73324
---
 tensorflow/compiler/tf2tensorrt/common/utils.h       |  6 ++++--
 .../compiler/tf2tensorrt/convert/convert_graph.cc    |  6 ++++--
 .../compiler/tf2tensorrt/convert/convert_graph.h     |  6 ++++--
 .../tf2tensorrt/convert/convert_graph_test.cc        |  6 ++++--
 .../compiler/tf2tensorrt/convert/convert_nodes.cc    |  6 ++++--
 .../compiler/tf2tensorrt/convert/convert_nodes.h     |  6 ++++--
 .../tf2tensorrt/convert/convert_nodes_test.cc        |  6 ++++--
 .../compiler/tf2tensorrt/convert/logger_registry.cc  |  6 ++++--
 .../compiler/tf2tensorrt/convert/logger_registry.h   |  5 ++---
 .../tf2tensorrt/convert/trt_optimization_pass.cc     |  6 ++++--
 .../tf2tensorrt/convert/trt_optimization_pass.h      |  6 ++++--
 .../tf2tensorrt/kernels/get_calibration_data_op.cc   |  6 ++++--
 .../compiler/tf2tensorrt/kernels/trt_engine_op.cc    |  6 ++++--
 .../tf2tensorrt/kernels/trt_engine_op_test.cc        |  6 ++++--
 .../tf2tensorrt/kernels/trt_engine_resource_ops.cc   |  6 ++++--
 .../kernels/trt_engine_resource_ops_test.cc          |  6 ++++--
 .../tf2tensorrt/ops/get_calibration_data_op.cc       |  6 ++++--
 tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc |  6 ++++--
 .../tf2tensorrt/ops/trt_engine_resource_ops.cc       |  6 ++++--
 .../compiler/tf2tensorrt/plugin/plugin_cast.cu.cc    |  6 ++++--
 tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc |  6 ++++--
 tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h  |  6 ++++--
 tensorflow/compiler/tf2tensorrt/segment/segment.cc   |  6 ++++--
 tensorflow/compiler/tf2tensorrt/segment/segment.h    |  6 ++++--
 .../compiler/tf2tensorrt/segment/segment_test.cc     |  6 ++++--
 tensorflow/compiler/tf2tensorrt/segment/union_find.h |  6 ++++--
 tensorflow/compiler/tf2tensorrt/tensorrt_test.cc     |  6 ++++--
 .../compiler/tf2tensorrt/utils/trt_allocator.cc      | 12 ++++++++----
 .../compiler/tf2tensorrt/utils/trt_allocator.h       | 12 ++++++++----
 .../compiler/tf2tensorrt/utils/trt_engine_utils.cc   |  6 ++++--
 .../compiler/tf2tensorrt/utils/trt_engine_utils.h    |  6 ++++--
 .../tf2tensorrt/utils/trt_int8_calibrator.cc         |  6 ++++--
 .../compiler/tf2tensorrt/utils/trt_int8_calibrator.h |  6 ++++--
 tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc  |  6 ++++--
 tensorflow/compiler/tf2tensorrt/utils/trt_logger.h   |  6 ++++--
 .../compiler/tf2tensorrt/utils/trt_lru_cache.cc      |  6 ++++--
 .../compiler/tf2tensorrt/utils/trt_lru_cache.h       |  6 ++++--
 .../utils/trt_shape_optimization_profiles.h          |  6 ++++--
 .../utils/trt_shape_optimization_profiles_test.cc    |  6 ++++--
 39 files changed, 162 insertions(+), 83 deletions(-)

diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h
index b428733ecd4..9ab0145e1ec 100644
--- a/tensorflow/compiler/tf2tensorrt/common/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/common/utils.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
 #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -28,6 +29,7 @@ namespace tensorrt {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif
+#endif
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index 5429aaf3362..1c51d51f1c9 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -53,7 +53,8 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
 namespace tensorflow {
@@ -883,4 +884,5 @@ Status ConvertAfterShapes(const ConversionParams& params) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
index d3897e864fa..53ab84a6fa9 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -24,7 +24,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -65,6 +66,7 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index 54fb1d56441..a1f523d6bfa 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -34,7 +34,8 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
 #include "tensorflow/core/public/session.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -230,4 +231,5 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 2ec616ba621..96cec556942 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -59,7 +59,8 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/strided_slice_op.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 #include "third_party/tensorrt/NvInferPlugin.h"
 
@@ -6257,4 +6258,5 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index a621735fad1..7a1276c645c 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -33,7 +33,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -693,6 +694,7 @@ BinaryOperationMap();
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index 53ec9ee7ada..c24b169f651 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include 
 #include 
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include 
 #include 
@@ -6635,4 +6636,5 @@ TEST_F(OpConverterTest, ConvertPad) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
index 07c9c2f1ea0..82e68cbb28d 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 
@@ -57,4 +58,5 @@ LoggerRegistry* GetLoggerRegistry() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
index 2a265cf7caa..45b302742d0 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-
+#if GOOGLE_CUDA
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -54,5 +53,5 @@ class RegisterLogger {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index 1cf98d135cb..72f4fe5ef9b 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -28,7 +28,8 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stacktrace.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -301,4 +302,5 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif
+#endif
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
index e0aaa5500ab..f79048bb5f6 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
@@ -23,7 +23,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -82,5 +83,6 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
index 76fb40b9520..3143b06817e 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc
@@ -22,7 +22,8 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/refcount.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -66,4 +67,5 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU),
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index 1094555a622..98d199ca9ab 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -48,7 +48,8 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
 
@@ -1008,4 +1009,5 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
index 71193dc24cf..a06010de1c7 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc
@@ -50,7 +50,8 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/public/version.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -305,4 +306,5 @@ TYPED_TEST(TRTEngineOpTest, Basic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
index 3b6e7e91d3b..2c5821df6ac 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc
@@ -33,7 +33,8 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -250,4 +251,5 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU),
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
index 6a073ee24d0..4a24160569d 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc
@@ -48,7 +48,8 @@ limitations under the License.
 #include "tensorflow/core/platform/tstring.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -245,4 +246,5 @@ TEST_F(TRTEngineResourceOpsTest, Basic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
index 2af3164c3e2..573172b92e6 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -33,4 +34,5 @@ Returns calibration data for the given resource name
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
index 2527fe9b910..bd3c2b299a9 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -58,4 +59,5 @@ REGISTER_OP("TRTEngineOp")
     .Attr("static_engine: bool = true");
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
index 3141092de03..01911de66ec 100644
--- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -45,4 +46,5 @@ REGISTER_OP("SerializeTRTResource")
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
index 141a7d1f462..4c0d8b0392a 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc
@@ -17,7 +17,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #define EIGEN_USE_GPU  // For definition of Eigen::GpuDevice.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
@@ -233,4 +234,5 @@ REGISTER_TFTRT_PLUGIN(CastPluginCreator);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 83d5f9b5965..563ce724f43 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -17,7 +17,8 @@ limitations under the License.
 
 #include 
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -29,4 +30,5 @@ const char* kTfTrtPluginNamespace = "TF";
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
index 600ac6683da..bdb046e6c71 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -89,6 +90,7 @@ class TrtPluginRegistrar {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index d9080b6f69a..32e30006f58 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -35,7 +35,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/env_var.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -1061,4 +1062,5 @@ Status SegmentGraph(const Graph* tf_graph,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
index 3f79983cfd2..7295c8f0d9d 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -25,7 +25,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -66,6 +67,7 @@ Status SegmentGraph(const Graph* tf_graph,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index f3bc5bfbee6..2437481a9c4 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -26,7 +26,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -521,4 +522,5 @@ TEST_F(SegmentTest, IncompatibleBatchSizes) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
index b53615ec019..70e83c12fca 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
@@ -19,7 +19,8 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/types/optional.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -216,6 +217,7 @@ UnionFind* UnionFind::FindRoot() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
index e994d20df33..510591bfe00 100644
--- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
@@ -18,7 +18,8 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
@@ -163,4 +164,5 @@ TEST(TensorrtTest, BasicFunctions) {
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
index d4f3a524577..617ea7fad5c 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
@@ -17,9 +17,11 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 namespace tensorrt {
@@ -50,7 +52,8 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -110,4 +113,5 @@ void TRTDeviceAllocator::free(void* memory) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
index d219a8a14e8..4ab8b52f523 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
@@ -20,9 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 namespace tensorrt {
@@ -31,7 +33,8 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 namespace tensorflow {
 namespace tensorrt {
@@ -66,5 +69,6 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
index 8ccfb8b06f0..ed997b267b1 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
@@ -25,7 +25,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/errors.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -256,4 +257,5 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
index 1ea4fe28cb4..a471749877a 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h
@@ -24,7 +24,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -90,6 +91,7 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index 24271e352a7..554c127fa37 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -20,7 +20,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
@@ -146,4 +147,5 @@ TRTInt8Calibrator::~TRTInt8Calibrator() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif
+#endif
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 4c670e85f52..06b39716490 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -22,7 +22,8 @@ limitations under the License.
 #include 
 #include "tensorflow/core/platform/mutex.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/tensorrt/NvInfer.h"
@@ -100,5 +101,6 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif
+#endif
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index e34bf5e7397..193687ebc8c 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -15,7 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "tensorflow/compiler/tf2tensorrt/common/utils.h"
 #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h"
 #include "tensorflow/core/platform/logging.h"
@@ -67,4 +68,5 @@ REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger());
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
index ce6552e8fe9..2ade1b48f47 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
@@ -18,7 +18,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/types.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -39,6 +40,7 @@ class Logger : public nvinfer1::ILogger {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
index ee7e6272372..fbcdaad52c0 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc
@@ -23,7 +23,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/mutex.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "third_party/tensorrt/NvInfer.h"
 
 namespace tensorflow {
@@ -140,4 +141,5 @@ EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
index 991b9a949e4..8e345254f75 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -115,7 +115,8 @@ class LRUCache {
   }
 };
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 struct EngineContext {
   EngineContext() {}  // Creates an empty context.
@@ -222,7 +223,8 @@ class TRTEngineCacheResource : public ResourceBase {
   TrtShapeOptimizationProfile profiles_;
 };
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
index fc688b14139..40c7f5dcf31 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h
@@ -29,7 +29,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "third_party/tensorrt/NvInfer.h"
 
@@ -172,5 +173,6 @@ class TrtShapeOptimizationProfile {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
index 32c2200fb71..501810587e0 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include 
 
@@ -213,4 +214,5 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA

From 715b02167d188d15f0205273b93ea6bbd606f4c5 Mon Sep 17 00:00:00 2001
From: Wenhao Jia 
Date: Fri, 19 Jun 2020 17:27:50 -0700
Subject: [PATCH 0689/1390] Restore TpuPlatform auto registration code.

PiperOrigin-RevId: 317409587
Change-Id: If44d7a39a45c4c7026f70a4d79d965a54c4db295
---
 tensorflow/stream_executor/tpu/tpu_platform.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/stream_executor/tpu/tpu_platform.cc b/tensorflow/stream_executor/tpu/tpu_platform.cc
index 13a845829c1..4bccd822e91 100644
--- a/tensorflow/stream_executor/tpu/tpu_platform.cc
+++ b/tensorflow/stream_executor/tpu/tpu_platform.cc
@@ -134,4 +134,12 @@ void RegisterTpuPlatform() {
   }
 }
 
+REGISTER_MODULE_INITIALIZER(tpu_platform, RegisterTpuPlatform());
+
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(tpu_platform, multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     tpu_platform);
+
 }  // namespace tensorflow

From 3427843d707cf166cbd07755bd11a1c7dea76730 Mon Sep 17 00:00:00 2001
From: Akshay Modi 
Date: Fri, 19 Jun 2020 17:33:29 -0700
Subject: [PATCH 0690/1390] Dist strat interop

PiperOrigin-RevId: 317410192
Change-Id: Ibfd1e3ac143422ccffa5f240075f5ae93a90ad07
---
 .../python/ops/numpy_ops/np_interop_test.py   | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tensorflow/python/ops/numpy_ops/np_interop_test.py b/tensorflow/python/ops/numpy_ops/np_interop_test.py
index f52d3dae78b..9580b787202 100644
--- a/tensorflow/python/ops/numpy_ops/np_interop_test.py
+++ b/tensorflow/python/ops/numpy_ops/np_interop_test.py
@@ -22,8 +22,13 @@ from __future__ import print_function
 import numpy as onp
 
 
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
+from tensorflow.python.framework import config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -36,6 +41,17 @@ from tensorflow.python.platform import test
 
 class InteropTest(test.TestCase):
 
+  def setUp(self):
+    super(InteropTest, self).setUp()
+    physical_devices = config.list_physical_devices('CPU')
+    configs = config.get_logical_device_configuration(physical_devices[0])
+    if configs is None:
+      logical_devices = [
+          context.LogicalDeviceConfiguration() for _ in range(3)
+      ]
+      config.set_logical_device_configuration(physical_devices[0],
+                                              logical_devices)
+
   def testGradientTapeInterop(self):
     with backprop.GradientTape() as t:
       x = np_array_ops.asarray(3.0)
@@ -139,6 +155,39 @@ class InteropTest(test.TestCase):
 
 #     self.assertEqual(t.numpy(), [1., 2., 3.])
 
+  def testDistStratInterop(self):
+    strategy = mirrored_strategy.MirroredStrategy(
+        devices=['CPU:0', 'CPU:1', 'CPU:2'])
+
+    multiplier = np_array_ops.asarray(5.)
+
+    with strategy.scope():
+      @def_function.function
+      def run():
+        ctx = distribution_strategy_context.get_replica_context()
+        val = np_array_ops.asarray(ctx.replica_id_in_sync_group)
+        return val * multiplier
+
+      distributed_values = strategy.run(run)
+      reduced = strategy.reduce(reduce_util.ReduceOp.SUM,
+                                distributed_values, axis=None)
+
+    values = distributed_values.values
+
+    # Note that this should match the number of virtual CPUs.
+    self.assertLen(values, 3)
+    self.assertIsInstance(values[0], np_arrays.ndarray)
+    self.assertIsInstance(values[1], np_arrays.ndarray)
+    self.assertIsInstance(values[2], np_arrays.ndarray)
+    self.assertAllClose(values[0], 0)
+    self.assertAllClose(values[1], 5)
+    self.assertAllClose(values[2], 10)
+
+    # "strategy.reduce" doesn't rewrap in ndarray.
+    # self.assertIsInstance(reduced, np_arrays.ndarray)
+    self.assertAllClose(reduced, 15)
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()

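For orientation (not part of the patch), the interop pattern exercised by `testDistStratInterop` can also be written against public TF 2.x APIs, assuming `tf.experimental.numpy` mirrors the internal `np_array_ops` used in the test and that three virtual CPU devices have been configured as in `setUp`:

# Hedged sketch with public APIs only; names below are assumptions, not
# taken from the patch.
import tensorflow as tf
import tensorflow.experimental.numpy as tnp

strategy = tf.distribute.MirroredStrategy(['CPU:0', 'CPU:1', 'CPU:2'])
multiplier = tnp.asarray(5.)

with strategy.scope():
  @tf.function
  def step():
    ctx = tf.distribute.get_replica_context()
    return tnp.asarray(ctx.replica_id_in_sync_group) * multiplier

  per_replica = strategy.run(step)
  total = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)
  # With three replicas the per-replica values are 0, 5, 10; total is 15.
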
From d737ef92f9c9e251ee3e9ad21090c489281a9dc9 Mon Sep 17 00:00:00 2001
From: XingyuLong 
Date: Fri, 19 Jun 2020 20:45:26 -0400
Subject: [PATCH 0691/1390] Update

---
 .../filesystem/plugins/gcs/gcs_helper.cc      |  34 ++
 .../filesystem/plugins/gcs/gcs_helper.h       |  33 ++
 .../tf_saved_model/remove_init_variable_v1.py |  74 +++
 .../transforms/device_index_selector.cc       |  85 +++
 .../tests/tf_device_index_selector.mlir       |  25 +
 tensorflow/compiler/tests/case_test.py        |  87 +++
 .../compiler/tf2tensorrt/common/utils.h       |  35 ++
 .../service/cpu/test_target_triple_helper.h   |  28 +
 .../xla/service/gpu/reduction_splitter.cc     | 117 ++++
 .../xla/service/gpu/reduction_splitter.h      |  49 ++
 .../service/gpu/reduction_splitter_test.cc    | 140 +++++
 .../xla/tests/manifest_checking_test.cc       | 129 +++++
 .../xla/tests/manifest_checking_test.h        |  35 ++
 .../api_def_BandedTriangularSolve.pbtxt       |   4 +
 .../api_def/base_api/api_def_BesselI0.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselI1.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselJ0.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselJ1.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselK0.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselK0e.pbtxt  |   4 +
 .../api_def/base_api/api_def_BesselK1.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselK1e.pbtxt  |   4 +
 .../api_def/base_api/api_def_BesselY0.pbtxt   |   4 +
 .../api_def/base_api/api_def_BesselY1.pbtxt   |   4 +
 ...tatelessParameterizedTruncatedNormal.pbtxt |  54 ++
 .../api_def_BandedTriangularSolve.pbtxt       |   4 +
 .../kernels/banded_triangular_solve_op.cc     | 293 ++++++++++
 .../banded_triangular_solve_op_test.cc        | 180 ++++++
 tensorflow/core/kernels/cwise_op_neg_1.cc     |  44 ++
 tensorflow/core/kernels/cwise_op_neg_2.cc     |  26 +
 .../mlir_generated_op_gpu_tanh_test.cc        |  85 +++
 .../special_math/special_math_op_bessel.cc    |  78 +++
 .../special_math_op_gpu_bessel.cu.cc          |  41 ++
 .../core/kernels/topk_op_gpu_uint32.cu.cc     |  28 +
 .../core/kernels/topk_op_gpu_uint64.cu.cc     |  28 +
 .../BandedTriangularSolve.pbtxt               |  42 ++
 .../ops/compat/ops_history_v2/BesselI0.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselI1.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselJ0.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselJ1.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselK0.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselK0e.pbtxt |  23 +
 .../ops/compat/ops_history_v2/BesselK1.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselK1e.pbtxt |  23 +
 .../ops/compat/ops_history_v2/BesselY0.pbtxt  |  23 +
 .../ops/compat/ops_history_v2/BesselY1.pbtxt  |  23 +
 ...tatelessParameterizedTruncatedNormal.pbtxt |  65 +++
 .../utils/op_metrics_db_utils_test.cc         |  46 ++
 .../tpu_compilation_cache_entry_impl.h        | 108 ++++
 .../tpu_compilation_cache_entry_unloader.h    |  69 +++
 .../kernels/tpu_compilation_cache_interface.h | 355 ++++++++++++
 tensorflow/core/tpu/kernels/tpu_op_consts.cc  |  24 +
 tensorflow/core/tpu/kernels/tpu_op_consts.h   |  39 ++
 tensorflow/core/tpu/kernels/tpu_op_util.cc    | 151 +++++
 tensorflow/core/tpu/kernels/tpu_op_util.h     |  40 ++
 tensorflow/core/tpu/tpu_library_init_fns.inc  | 166 ++++++
 .../support/metadata/build_defs.bzl           |  43 ++
 .../metadata/cc/metadata_parser.h.template    |  28 +
 .../metadata/cc/test/metadata_parser_test.cc  |  33 ++
 .../lite/support/metadata/MetadataParser.java |  27 +
 .../metadata/metadata_parser.py.template      |  26 +
 .../support/metadata/metadata_parser_test.py  |  38 ++
 .../himax_we1_evb/detection_responder.cc      |  33 ++
 .../himax_we1_evb/image_provider.cc           |  41 ++
 .../lite/micro/himax_we1_evb/debug_log.cc     |  32 ++
 .../make/targets/himax_we1_evb_makefile.inc   |  91 +++
 .../tools/optimize/testdata/mixed16x8.bin     | Bin 0 -> 1184 bytes
 .../tools/optimize/testdata/transpose.bin     | Bin 0 -> 544 bytes
 .../python/framework/python_op_gen_test.cc    |  42 ++
 .../distribute/mirrored_strategy_test.py      |  89 +++
 .../distribute/mirrored_variable_test.py      | 106 ++++
 .../integration_test/tpu_strategy_test.py     |  69 +++
 .../banded_triangular_solve_op_test.py        | 232 ++++++++
 .../parameterized_truncated_normal_op_test.py | 520 ++++++++++++++++++
 tensorflow/python/ops/numpy_ops/README.md     |  94 ++++
 .../python/ops/numpy_ops/np_accessor.py       |  32 ++
 .../python/ops/numpy_ops/np_interop_test.py   | 144 +++++
 tensorflow/security/fuzzing/op_fuzzing/BUILD  |  39 ++
 .../fuzzing/op_fuzzing/fuzz_session.h         | 156 ++++++
 .../fuzzing/op_fuzzing/identity_fuzz.cc       |  45 ++
 .../security/fuzzing/status_group_fuzz.cc     |  66 +++
 ...flow.distribute.-distributed-dataset.pbtxt |  16 +
 ...low.distribute.-distributed-iterator.pbtxt |  20 +
 .../linux/mkl/install_openmpi_horovod.sh      |  80 +++
 .../per_release/scripts/nonpip_gpu.sh         |  75 +++
 85 files changed, 5328 insertions(+)
 create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc
 create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h
 create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py
 create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc
 create mode 100644 tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir
 create mode 100644 tensorflow/compiler/tests/case_test.py
 create mode 100644 tensorflow/compiler/tf2tensorrt/common/utils.h
 create mode 100644 tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter.h
 create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc
 create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.cc
 create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.h
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI1.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselJ0.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselJ1.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK0.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK0e.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK1.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselK1e.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselY0.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselY1.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BandedTriangularSolve.pbtxt
 create mode 100644 tensorflow/core/kernels/banded_triangular_solve_op.cc
 create mode 100644 tensorflow/core/kernels/banded_triangular_solve_op_test.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_neg_1.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_neg_2.cc
 create mode 100644 tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc
 create mode 100644 tensorflow/core/kernels/special_math/special_math_op_bessel.cc
 create mode 100644 tensorflow/core/kernels/special_math/special_math_op_gpu_bessel.cu.cc
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BandedTriangularSolve.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselI0.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselI1.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselJ0.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselJ1.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselK0.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselK0e.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselK1.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselK1e.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselY0.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/BesselY1.pbtxt
 create mode 100644 tensorflow/core/ops/compat/ops_history_v2/StatelessParameterizedTruncatedNormal.pbtxt
 create mode 100644 tensorflow/core/profiler/utils/op_metrics_db_utils_test.cc
 create mode 100644 tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_impl.h
 create mode 100644 tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h
 create mode 100644 tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h
 create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.cc
 create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.h
 create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.cc
 create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.h
 create mode 100644 tensorflow/core/tpu/tpu_library_init_fns.inc
 create mode 100644 tensorflow/lite/experimental/support/metadata/build_defs.bzl
 create mode 100644 tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template
 create mode 100644 tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc
 create mode 100644 tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java
 create mode 100644 tensorflow/lite/experimental/support/metadata/metadata_parser.py.template
 create mode 100644 tensorflow/lite/experimental/support/metadata/metadata_parser_test.py
 create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/detection_responder.cc
 create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/himax_we1_evb/image_provider.cc
 create mode 100644 tensorflow/lite/micro/himax_we1_evb/debug_log.cc
 create mode 100644 tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc
 create mode 100644 tensorflow/lite/tools/optimize/testdata/mixed16x8.bin
 create mode 100644 tensorflow/lite/tools/optimize/testdata/transpose.bin
 create mode 100644 tensorflow/python/framework/python_op_gen_test.cc
 create mode 100644 tensorflow/python/keras/distribute/mirrored_strategy_test.py
 create mode 100644 tensorflow/python/keras/distribute/mirrored_variable_test.py
 create mode 100644 tensorflow/python/keras/integration_test/tpu_strategy_test.py
 create mode 100644 tensorflow/python/kernel_tests/banded_triangular_solve_op_test.py
 create mode 100644 tensorflow/python/kernel_tests/random/parameterized_truncated_normal_op_test.py
 create mode 100644 tensorflow/python/ops/numpy_ops/README.md
 create mode 100644 tensorflow/python/ops/numpy_ops/np_accessor.py
 create mode 100644 tensorflow/python/ops/numpy_ops/np_interop_test.py
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/BUILD
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
 create mode 100644 tensorflow/security/fuzzing/status_group_fuzz.cc
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-dataset.pbtxt
 create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt
 create mode 100755 tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh
 create mode 100644 tensorflow/tools/ci_build/per_release/scripts/nonpip_gpu.sh

diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc
new file mode 100644
index 00000000000..4504a9f3b35
--- /dev/null
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc
@@ -0,0 +1,34 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h"
+
+#include <stdio.h>
+
+#include <fstream>
+#include <string>
+#include <utility>
+
+TempFile::TempFile(const char* temp_file_name, std::ios::openmode mode)
+    : std::fstream(temp_file_name, mode), name_(temp_file_name) {}
+
+TempFile::TempFile(TempFile&& rhs)
+    : std::fstream(std::move(rhs)), name_(std::move(rhs.name_)) {}
+
+TempFile::~TempFile() {
+  std::fstream::close();
+  std::remove(name_.c_str());
+}
+
+const std::string TempFile::getName() const { return name_; }
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h
new file mode 100644
index 00000000000..1a521ca4f1e
--- /dev/null
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h
@@ -0,0 +1,33 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_
+#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_
+
+#include <fstream>
+#include <string>
+
+class TempFile : public std::fstream {
+ public:
+  // We should specify openmode each time we call TempFile.
+  TempFile(const char* temp_file_name, std::ios::openmode mode);
+  TempFile(TempFile&& rhs);
+  ~TempFile() override;
+  const std::string getName() const;
+
+ private:
+  const std::string name_;
+};
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_
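A minimal caller sketch for the class above (hypothetical code, not part of this
patch): TempFile behaves like any other std::fstream, and the backing file is
closed and deleted automatically when the object goes out of scope.

    #include <ios>
    #include <string>
    #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h"

    // Buffers data locally before an upload; the temporary file is removed by
    // ~TempFile() even if the upload path fails.
    void StageForUpload(const std::string& local_path) {
      TempFile staging(local_path.c_str(), std::ios::binary | std::ios::out);
      staging << "object contents";  // Regular fstream operations work.
      staging.flush();
      // ... hand staging.getName() to the GCS upload routine ...
    }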
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py
new file mode 100644
index 00000000000..117132649d7
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py
@@ -0,0 +1,74 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# RUN: %p/remove_init_variable_v1 | FileCheck %s
+
+# pylint: disable=missing-docstring,line-too-long
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1
+
+# Verify that the tf.versions attribute exists. It is difficult to enforce
+# contents, since the version numbers change over time. The conversion logic
+# itself is verified in the common graphdef converter, so here just assert
+# it is being invoked.
+# CHECK: module
+# CHECK-SAME: tf.versions
+# CHECK-SAME: bad_consumers
+# CHECK-SAME: min_consumer
+# CHECK-SAME: producer
+
+# CHECK: "tf_saved_model.global_tensor"() {is_mutable, sym_name = "[[VAR:[a-zA-Z_0-9]+]]", type = tensor<1x3xf32>, value = {{.*}} : tensor<1x3xf32>} : () -> ()
+# CHECK-NOT: session_initializer
+
+# CHECK:      func {{@[a-zA-Z_0-9]+}}(
+# CHECK-SAME:   [[ARG0:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]},
+# CHECK-SAME:   [[ARG1:%.*]]: tensor<!tf.resource<tensor<1x3xf32>>> {tf_saved_model.bound_input = @[[VAR]]})
+# CHECK-SAME:             -> (tensor<3x3xf32> {tf_saved_model.index_path = ["r"]})
+# CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"]
+
+# CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor<!tf.resource<tensor<1x3xf32>>>) -> tensor<1x3xf32>
+# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32>
+# CHECK-NEXT: return [[R1]] : tensor<3x3xf32>
+
+
+def Test():
+
+  x = tf.constant([[1.0], [1.0], [1.0]])
+  y = tf.compat.v1.get_variable(
+      name='y',
+      shape=(1, 3),
+      initializer=tf.random_normal_initializer(),
+      trainable=True)
+  r = tf.matmul(x, y)
+
+  tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x)
+  tensor_info_r = tf.compat.v1.saved_model.utils.build_tensor_info(r)
+
+  return {
+      'key': (tf.compat.v1.saved_model.signature_def_utils.build_signature_def(
+          inputs={'x': tensor_info_x},
+          outputs={'r': tensor_info_r},
+          method_name='some_function'))
+  }
+
+
+if __name__ == '__main__':
+  common_v1.set_tf_options()
+  common_v1.do_test(
+      Test(), tf.initializers.global_variables(), canonicalize=True)
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc
new file mode 100644
index 00000000000..550647a915a
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc
@@ -0,0 +1,85 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Converts DeviceIndex to constant device.
+
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+
+namespace mlir {
+namespace TF {
+namespace {
+
+// Folds the DeviceIndex op to a constant value. The DeviceIndex return the
+// index of the device the op should run on. The user can use this to provide
+// different op specializations. E.g.,
+//
+// ```mlir
+//  %1 = "tf.DeviceIndex"()
+//          {device = "", device_names = ["CPU", "GPU"]} : () -> tensor
+//  %4 = "tf.Case"(%1, %arg0, %arg1)
+//          {branches = [@foo, @baz], output_shapes = [#tf.shape<>]} :
+//            (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
+// ```
+//
+// Shows an example where there are 2 different functions which could be
+// executed to produce the same values but with different functions optimized
+// for CPU or GPU.
+struct DeviceIndexSelector
+    : public PassWrapper<DeviceIndexSelector, OperationPass<FuncOp>> {
+  void runOnOperation() override;
+};
+
+}  // namespace
+
+void DeviceIndexSelector::runOnOperation() {
+  FuncOp func = getOperation();
+  // Convert all the DeviceIndex ops to constant values.
+  func.getBody().walk([](TF::DeviceIndexOp op) {
+    // This just selects the default in all cases where DeviceIndex feeds into
+    // tf.Case. This could be enhanced to have some sort of policy in the
+    // future.
+    OpBuilder b(op);
+    RankedTensorType type = RankedTensorType::get({}, b.getIntegerType(32));
+    int index = op.device_names().size();
+    for (auto use : op.getOperation()->getUsers()) {
+      // Skip if it doesn't feed into case. Alternatively this could always
+      // return the CPU device index if it exists.
+      if (!isa<TF::CaseOp>(use)) return;
+    }
+    DenseElementsAttr attr =
+        DenseElementsAttr::get(type, b.getI32IntegerAttr(index));
+    auto constant = b.create<ConstantOp>(op.getLoc(), type, attr);
+    op.replaceAllUsesWith(constant.getOperation());
+    op.erase();
+  });
+}
+
+// Creates an instance of the TensorFlow DeviceIndex selector pass.
+std::unique_ptr<OperationPass<FuncOp>> CreateDeviceIndexSelectorPass() {
+  return std::make_unique<DeviceIndexSelector>();
+}
+
+static PassRegistration<DeviceIndexSelector> pass(
+    "tf-device-index-selector", "Fold tf.DeviceIndex to constant");
+
+}  // namespace TF
+}  // namespace mlir
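A hypothetical way to schedule this pass from C++, assuming an mlir::PassManager
and that CreateDeviceIndexSelectorPass() is declared in the included passes.h
(sketch only, not part of this patch):

    #include "mlir/Pass/PassManager.h"
    #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

    // Fold tf.DeviceIndex on every function in the module before lowering.
    void AddDeviceIndexFolding(mlir::PassManager& pm) {
      pm.addNestedPass<mlir::FuncOp>(mlir::TF::CreateDeviceIndexSelectorPass());
    }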
diff --git a/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir
new file mode 100644
index 00000000000..7fc2b210f91
--- /dev/null
+++ b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir
@@ -0,0 +1,25 @@
+// Test DeviceIndex selector.
+
+// RUN: tf-opt --tf-device-index-selector %s | FileCheck %s
+
+// CHECK-LABEL: func @select
+func @select(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<i32>, tensor<f32>) {
+  // CHECK:  %[[first:.*]] = "tf.DeviceIndex"
+  // CHECK: constant dense<2>
+  // CHECK:  return %[[first]],
+  %0 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor
+  %1 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor
+  %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>]} : (tensor, tensor, tensor) -> tensor
+
+  return %0, %4 : tensor<i32>, tensor<f32>
+}
+
+func @add(%i: tensor<i32>, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> {
+  %0 = "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
+
+func @sub(%i: tensor<i32>, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> {
+  %0 = "tf.Sub"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
diff --git a/tensorflow/compiler/tests/case_test.py b/tensorflow/compiler/tests/case_test.py
new file mode 100644
index 00000000000..3b2dff537da
--- /dev/null
+++ b/tensorflow/compiler/tests/case_test.py
@@ -0,0 +1,87 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for while loops in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+
+
+class CaseTest(xla_test.XLATestCase):
+
+  def testCaseBasic(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test(branch_index):
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        return array_ops.constant(31)
+
+      def f3():
+        return array_ops.constant(-1)
+
+      return control_flow_ops.switch_case(
+          branch_index, branch_fns={
+              0: f1,
+              1: f2
+          }, default=f3)
+
+    with ops.device(self.device):
+      self.assertEqual(switch_case_test(array_ops.constant(0)).numpy(), 17)
+      self.assertEqual(switch_case_test(array_ops.constant(1)).numpy(), 31)
+      self.assertEqual(switch_case_test(array_ops.constant(2)).numpy(), -1)
+      self.assertEqual(switch_case_test(array_ops.constant(3)).numpy(), -1)
+
+  def testBranchIsPruned(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test():
+      branch_index = array_ops.constant(0)
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        # Some operations that XLA cannot compile.
+        image_ops.decode_image(io_ops.read_file('/tmp/bmp'))
+        return array_ops.constant(31)
+
+      # This tests that we do not try to compile all branches if the branch
+      # index is trivially constant.
+      return control_flow_ops.switch_case(
+          branch_index, branch_fns={
+              0: f1,
+              1: f2
+          }, default=f2)
+
+    with ops.device(self.device):
+      self.assertEqual(switch_case_test().numpy(), 17)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h
new file mode 100644
index 00000000000..9ab0145e1ec
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/common/utils.h
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+#define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: "
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif
+#endif
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_
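The macro is used exactly like LOG(WARNING); it only prepends the "TF-TRT
Warning: " prefix to the message, for example:

    LOG_WARNING_WITH_PREFIX << "TensorRT version mismatch";
    // Logs: "TF-TRT Warning: TensorRT version mismatch"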
diff --git a/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h
new file mode 100644
index 00000000000..857de4a8143
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_
+#define TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_
+
+#if (defined(__powerpc__) || \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+static const char kTargetCpuForHost[] = "ppc";
+static const char kTargetTripleForHost[] = "ppc64le-ibm-linux-gnu";
+#else
+static const char kTargetCpuForHost[] = "";
+static const char kTargetTripleForHost[] = "x86_64-pc-linux";
+#endif
+
+#endif
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
new file mode 100644
index 00000000000..b68213ec35f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc
@@ -0,0 +1,117 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+namespace gpu {
+
+class ReductionSplitterVisitor : public DfsHloRewriteVisitor {
+ public:
+  Status HandleReduce(HloInstruction *reduce) override {
+    VLOG(4) << "Input: " << reduce->ToString();
+
+    // Reductions with contiguous dimensions are lowered to efficient code. No
+    // need to split such ops.
+    if (IsReductionFromOrToContiguousDimensions(*reduce)) {
+      return Status::OK();
+    }
+    if (reduce->dimensions().size() < 2) {
+      return Status::OK();
+    }
+    if (!reduce->shape().IsArray()) {
+      // TODO(cheshire): Handle variadic reduction.
+      return Status::OK();
+    }
+
+    HloInstruction *operand = reduce->mutable_operand(0);
+    const Shape &shape = operand->shape();
+    CHECK(shape == LayoutUtil::GetWithDefaultLayout(shape))
+        << "Default layout should be enforced on reduction operand";
+    // Verify that contiguous dimensions have been grouped by the
+    // ReductionDimensionGrouper pass.
+    for (int64 i = 0; i < reduce->dimensions().size(); ++i) {
+      for (int64 j = i + 1; j < reduce->dimensions().size(); ++j) {
+        CHECK(abs(reduce->dimensions(i) - reduce->dimensions(j)) > 1)
+            << "Reduction dimensions must not be consecutive";
+      }
+    }
+
+    // The reduce op has non-contiguous dimensions. Look for the dimension with
+    // the largest shape dimension. Reducing along this dimension first will
+    // reduce the output size most effectively.
+    int64 max_shape_dim = 0;
+    int64 max_reduce_dim = 0;
+    const auto &input_shape = reduce->operand(0)->shape();
+    for (int64 i = 0; i < reduce->dimensions().size(); ++i) {
+      if (input_shape.dimensions(reduce->dimensions(i)) > max_shape_dim) {
+        max_reduce_dim = reduce->dimensions(i);
+        max_shape_dim = input_shape.dimensions(max_reduce_dim);
+      }
+    }
+    // TODO(tjoerg): Run microbenchmarks to tune this threshold.
+    if (max_shape_dim < 128) {
+      return Status::OK();
+    }
+
+    // Split the reduction into a pre-reduction and a final reduction.
+    VLOG(3) << "Splitting reduction " << reduce->name() << " at dimension "
+            << max_reduce_dim;
+    std::vector<int64> pre_reduce_dims;
+    pre_reduce_dims.push_back(max_reduce_dim);
+    std::vector<int64> pre_reduce_shape_dims(input_shape.dimensions().begin(),
+                                             input_shape.dimensions().end());
+    pre_reduce_shape_dims.erase(pre_reduce_shape_dims.begin() + max_reduce_dim);
+    Shape pre_reduce_shape = ShapeUtil::MakeShape(
+        reduce->shape().element_type(), pre_reduce_shape_dims);
+    std::unique_ptr<HloInstruction> pre_reduce = HloInstruction::CreateReduce(
+        pre_reduce_shape, reduce->mutable_operand(0),
+        reduce->mutable_operand(1), pre_reduce_dims, reduce->to_apply());
+    pre_reduce->set_metadata(reduce->metadata());
+
+    std::vector<int64> final_reduce_dims(reduce->dimensions().begin(),
+                                         reduce->dimensions().end());
+    final_reduce_dims.erase(
+        std::remove(final_reduce_dims.begin(), final_reduce_dims.end(),
+                    max_reduce_dim),
+        final_reduce_dims.end());
+    for (int64 i = 0; i < final_reduce_dims.size(); ++i) {
+      if (final_reduce_dims[i] > max_reduce_dim) {
+        final_reduce_dims[i]--;
+      }
+    }
+    std::unique_ptr<HloInstruction> final_reduce = HloInstruction::CreateReduce(
+        reduce->shape(),
+        reduce->parent()->AddInstruction(std::move(pre_reduce)),
+        reduce->mutable_operand(1), final_reduce_dims, reduce->to_apply());
+    return ReplaceWithNewInstruction(reduce, std::move(final_reduce));
+  }
+};
+
+StatusOr<bool> ReductionSplitter::Run(HloModule *module) {
+  TF_ASSIGN_OR_RETURN(bool changed,
+                      ReductionSplitterVisitor().RunOnModule(module));
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.h b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h
new file mode 100644
index 00000000000..f161b579eb8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Splits a reduce op into two consecutive reduce ops if
+// * the reduce dimensions are not contiguous and
+// * at least one reduce dimension is large (i.e. corresponds to a large input
+//   shape dimension).
+//
+// Reductions with non-contiguous dimensions are emitted as simple element-wise
+// loops. This is inefficient when reducing large input shape dimensions.
+// Splitting such reductions allows using more efficient reduction emitters.
+//
+// This pass splits reduce ops into two consecutive reduce ops. Run it to a
+// fixpoint to split reduce ops along multiple large dimensions.
+//
+// Precondition: ReductionDimensionGrouper has been run and adjacent reduce
+// dimensions have been grouped. Reduction layouts have been normalized.
+
+class ReductionSplitter : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "reduction-splitter"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_
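The "run it to a fixpoint" note above maps onto XLA's HloPassFix wrapper; a
hypothetical pipeline registration (sketch only, not part of this patch):

    #include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
    #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
    #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

    // Re-runs the splitter until no reduce op has more than one large
    // non-contiguous dimension left to split off.
    void AddReductionSplitting(xla::HloPassPipeline* pipeline) {
      pipeline->AddPass<xla::HloPassFix<xla::gpu::ReductionSplitter>>();
    }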
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc
new file mode 100644
index 00000000000..1be55b84204
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ReductionSplitterTest : public HloTestBase {};
+
+TEST_F(ReductionSplitterTest, SplitReductionAtDimensionTwo) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f16[6,16,512,64]{3,2,1,0} parameter(0)
+    transpose.1781 = f16[6,512,16,64]{3,1,2,0} transpose(param_0), dimensions={0,2,1,3}
+    convert.6986 = f32[6,512,16,64]{3,1,2,0} convert(transpose.1781)
+    bitcast.2136 = f32[6,16,512,64]{3,2,1,0} bitcast(convert.6986)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[16,64]{1,0} reduce(bitcast.2136, constant_11111), dimensions={0,2}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* root_reduction =
+      module->entry_computation()->root_instruction();
+  ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant()));
+
+  auto* pre_reduction = root_reduction->operand(0);
+  EXPECT_THAT(pre_reduction->dimensions(), std::vector<int64>({2}));
+  EXPECT_THAT(pre_reduction->shape(), ShapeUtil::MakeShape(F32, {6, 16, 64}));
+  EXPECT_THAT(root_reduction->dimensions(), std::vector<int64>({0}));
+  EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64}));
+}
+
+TEST_F(ReductionSplitterTest, SplitReductionAtDimensionZero) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[1024,16,512,64,128]{4,3,2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[16,64]{1,0} reduce(param_0, constant_11111), dimensions={2,0,4}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* root_reduction =
+      module->entry_computation()->root_instruction();
+  ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant()));
+
+  auto* pre_reduction = root_reduction->operand(0);
+  EXPECT_THAT(pre_reduction->dimensions(), std::vector<int64>({0}));
+  EXPECT_THAT(pre_reduction->shape(),
+              ShapeUtil::MakeShape(F32, {16, 512, 64, 128}));
+  EXPECT_THAT(root_reduction->dimensions(), std::vector<int64>({1, 3}));
+  EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64}));
+}
+
+TEST_F(ReductionSplitterTest, DontSplitReductionWithSmallDimensions) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[8,1024,8]{2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    ROOT reduce.982 = f32[1024]{0} reduce(param_0, constant_11111), dimensions={2,0}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(ReductionSplitterTest, DontSplitReductionsWithContiguousDimensions) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule test
+
+  add_computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY entry_computation {
+    param_0 = f32[128,128,64,128]{3,2,1,0} parameter(0)
+    constant_11111 = f32[] constant(0)
+    // The dimensions to keep (1 and 2) are contiguous.
+    ROOT reduce.982 = f32[128,64]{1,0} reduce(param_0, constant_11111), dimensions={3,0}, to_apply=add_computation
+  }
+  )")
+                    .ValueOrDie();
+  EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.cc b/tensorflow/compiler/xla/tests/manifest_checking_test.cc
new file mode 100644
index 00000000000..8806290472d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/manifest_checking_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/manifest_checking_test.h"
+
+#include <fstream>
+#include <iterator>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace xla {
+
+namespace {
+
+// Maps a test name (e.g. "MyTest.MyTestCase") to the platforms on which it is
+// disabled, expressed as a sequence of regexps.
+using ManifestT = absl::flat_hash_map<std::string, std::vector<std::string>>;
+
+ManifestT ReadManifest() {
+  ManifestT manifest;
+
+  absl::string_view path = absl::NullSafeStringView(kDisabledManifestPath);
+  if (path.empty()) {
+    return manifest;
+  }
+
+  // Note: parens are required to disambiguate vs function decl.
+  std::ifstream file_stream((std::string(path)));
+  std::string contents((std::istreambuf_iterator<char>(file_stream)),
+                       std::istreambuf_iterator<char>());
+
+  std::vector<std::string> lines = absl::StrSplit(contents, '\n');
+  for (std::string& line : lines) {
+    auto comment = line.find("//");
+    if (comment != std::string::npos) {
+      line = line.substr(0, comment);
+    }
+    if (line.empty()) {
+      continue;
+    }
+    absl::StripTrailingAsciiWhitespace(&line);
+    std::vector<std::string> pieces = absl::StrSplit(line, ' ');
+    CHECK_GE(pieces.size(), 1);
+    auto& platforms = manifest[pieces[0]];
+    for (size_t i = 1; i < pieces.size(); ++i) {
+      platforms.push_back(pieces[i]);
+    }
+  }
+  return manifest;
+}
+
+}  // namespace
+
+void ManifestCheckingTest::SetUp() {
+  const testing::TestInfo* test_info =
+      testing::UnitTest::GetInstance()->current_test_info();
+  absl::string_view test_case_name = test_info->test_suite_name();
+  absl::string_view test_name = test_info->name();
+  VLOG(1) << "test_case_name: " << test_case_name;
+  VLOG(1) << "test_name: " << test_name;
+
+  // Remove the type suffix from the test case name.
+  if (const char* type_param = test_info->type_param()) {
+    VLOG(1) << "type_param: " << type_param;
+    size_t last_slash = test_case_name.rfind('/');
+    test_case_name = test_case_name.substr(0, last_slash);
+    VLOG(1) << "test_case_name: " << test_case_name;
+  }
+
+  // Remove the test instantiation name if it is present.
+  auto first_slash = test_case_name.find('/');
+  if (first_slash != test_case_name.npos) {
+    test_case_name.remove_prefix(first_slash + 1);
+    VLOG(1) << "test_case_name: " << test_case_name;
+  }
+
+  ManifestT manifest = ReadManifest();
+
+  // If the test name ends with a slash followed by one or more characters,
+  // strip that off.
+  auto last_slash = test_name.rfind('/');
+  if (last_slash != test_name.npos) {
+    test_name = test_name.substr(0, last_slash);
+    VLOG(1) << "test_name: " << test_name;
+  }
+
+  // First try full match: test_case_name.test_name
+  // If that fails, try to find just the test_case_name; this would disable all
+  // tests in the test case.
+  auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name));
+  if (it == manifest.end()) {
+    it = manifest.find(test_case_name);
+    if (it == manifest.end()) {
+      return;
+    }
+  }
+
+  // Expect a full match vs. one of the platform regexps to disable the test.
+  const std::vector<std::string>& disabled_platforms = it->second;
+  auto platform_string = kTestPlatform;
+  for (const auto& s : disabled_platforms) {
+    if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) {
+      GTEST_SKIP();
+      return;
+    }
+  }
+
+  // We didn't hit in the disabled manifest entries, so don't disable it.
+}
+
+}  // namespace xla
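For reference, the manifest format ReadManifest() accepts is one entry per line:
a "TestCase.TestName" (or a bare test case name to disable every test in that
case) followed by whitespace-separated platform regexps; "//" comments and blank
lines are ignored. The entries below are made up purely for illustration:

    // Disable one test on the interpreter backend only.
    DotOperationTest.LargeDot Interpreter
    // Disable an entire test case on CPU and on any CUDA platform.
    FusionTest CPU CUDA.*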
diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.h b/tensorflow/compiler/xla/tests/manifest_checking_test.h
new file mode 100644
index 00000000000..4f44ed76a3e
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/manifest_checking_test.h
@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_
+
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+
+// This class allows us to intercept the test name and use an arbitrary
+// heuristic to decide whether the test case should be disabled. We
+// determine whether the test case should be disabled by resolving the (test
+// case name, test name) in a manifest file.
+class ManifestCheckingTest : public ::testing::Test {
+ protected:
+  // This method runs before each test runs.
+  void SetUp() override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_
diff --git a/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt
new file mode 100644
index 00000000000..ba5e1bdcaf2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BandedTriangularSolve"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt
new file mode 100644
index 00000000000..2c47960429c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselI0"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI1.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI1.pbtxt
new file mode 100644
index 00000000000..e0007b44162
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselI1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselI1"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselJ0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselJ0.pbtxt
new file mode 100644
index 00000000000..4010afadcb8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselJ0.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselJ0"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselJ1.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselJ1.pbtxt
new file mode 100644
index 00000000000..12d16910227
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselJ1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselJ1"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselK0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselK0.pbtxt
new file mode 100644
index 00000000000..31d701c821b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselK0.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselK0"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselK0e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselK0e.pbtxt
new file mode 100644
index 00000000000..fac0c1b3459
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselK0e.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselK0e"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselK1.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselK1.pbtxt
new file mode 100644
index 00000000000..de80f304540
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselK1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselK1"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselK1e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselK1e.pbtxt
new file mode 100644
index 00000000000..c565a85def2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselK1e.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselK1e"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselY0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselY0.pbtxt
new file mode 100644
index 00000000000..af57e504d65
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselY0.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselY0"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselY1.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselY1.pbtxt
new file mode 100644
index 00000000000..b2cd9827f6f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselY1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselY1"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt
new file mode 100644
index 00000000000..15bd4670cef
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessParameterizedTruncatedNormal.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "StatelessParameterizedTruncatedNormal"
+  visibility: HIDDEN
+  in_arg {
+    name: "shape"
+    description: <
\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","colab":{}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","print(\"Training these words: %s\" % WANTED_WORDS)\n","print(\"Training steps in each stage: %s\" % TRAINING_STEPS)\n","print(\"Learning rate in each stage: %s\" % LEARNING_RATE)\n","print(\"Total number of training steps: %s\" % TOTAL_STEPS)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE = 20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = '1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 
'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 26.0\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begin training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UczQKtqLi7OJ","colab_type":"text"},"source":["# Skipping the training\n","\n","If you don't want to spend an hour or two training the model from scratch, you can download pretrained checkpoints by uncommenting the lines below (removing the '#'s at the start of each line) and running them."]},{"cell_type":"code","metadata":{"id":"RZw3VNlnla-J","colab_type":"code","colab":{}},"source":["#!curl -O \"https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_micro_train_2020_05_10.tgz\"\n","#!tar xzf speech_micro_train_2020_05_10.tgz"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc) into a single file for inference. 
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'{TOTAL_STEPS} \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.inference_input_type = tf.lite.constants.INT8\n"," converter.inference_output_type = tf.lite.constants.INT8\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = 
converter.convert()\n"," tflite_model_size = open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["# Testing the TensorFlow Lite model's accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," quantized_input = np.zeros((1960), np.int8)\n"," for index, input_value in enumerate(current_input.flatten()):\n"," # These scaling values are derived from those used in input_data.py in the\n"," # training pipeline.\n"," value = ((input_value - QUANT_INPUT_MIN) * 256) / QUANT_INPUT_RANGE\n"," value -= 128\n"," if value < -128:\n"," value = -128\n"," if value > 127:\n"," value = 127\n"," quantized_input[index] = value\n"," flattened_input = np.array(quantized_input.flatten(), dtype=np.int8).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for MicroControllers Model\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{}},"source":["# Install xxd if it 
is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","colab":{}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"iYlIKpO2mkhv","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} \ No newline at end of file +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "train_micro_speech_model.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "pO4-CY_TCZZS", + "colab_type": "text" + }, + "source": [ + "# Train a Simple Audio Recognition Model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BaFfr7DHRmGF", + "colab_type": "text" + }, + "source": [ + "This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n", + "\n", + "The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XaVtYN4nlCft", + "colab_type": "text" + }, + "source": [ + "**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n", + "\n", + "## Configure Defaults\n", + "\n", + "**MODIFY** the following constants for your specific use case." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ludfxbNIaegy", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# A comma-delimited list of the words you want to train for.\n", + "# The options are: yes,no,up,down,left,right,on,off,stop,go\n", + "# All the other words will be used to train an \"unknown\" label and silent\n", + "# audio data with no spoken words will be used to train a \"silence\" label.\n", + "WANTED_WORDS = \"yes,no\"\n", + "\n", + "# The number of steps and learning rates can be specified as comma-separated\n", + "# lists to define the rate at each stage. For example,\n", + "# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n", + "# will run 12,000 training loops in total, with a rate of 0.001 for the first\n", + "# 8,000, and 0.0001 for the final 3,000.\n", + "TRAINING_STEPS = \"12000,3000\"\n", + "LEARNING_RATE = \"0.001,0.0001\"\n", + "\n", + "# Calculate the total number of steps, which is used to identify the checkpoint\n", + "# file name.\n", + "TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n", + "\n", + "# Print the configuration to confirm it\n", + "print(\"Training these words: %s\" % WANTED_WORDS)\n", + "print(\"Training steps in each stage: %s\" % TRAINING_STEPS)\n", + "print(\"Learning rate in each stage: %s\" % LEARNING_RATE)\n", + "print(\"Total number of training steps: %s\" % TOTAL_STEPS)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gCgeOpvY9pAi", + "colab_type": "text" + }, + "source": [ + "**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Nd1iM1o2ymvA", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Calculate the percentage of 'silence' and 'unknown' training samples required\n", + "# to ensure that we have equal number of samples for each label.\n", + "number_of_labels = WANTED_WORDS.count(',') + 1\n", + "number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n", + "equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n", + "SILENT_PERCENTAGE = equal_percentage_of_training_samples\n", + "UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n", + "\n", + "# Constants which are shared during training and inference\n", + "PREPROCESS = 'micro'\n", + "WINDOW_STRIDE = 20\n", + "MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n", + " # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n", + "\n", + "# Constants used during training only\n", + "VERBOSITY = 'WARN'\n", + "EVAL_STEP_INTERVAL = '1000'\n", + "SAVE_STEP_INTERVAL = '1000'\n", + "\n", + "# Constants for training directories and filepaths\n", + "DATASET_DIR = 'dataset/'\n", + "LOGS_DIR = 'logs/'\n", + "TRAIN_DIR = 'train/' # for training checkpoints and other files.\n", + "\n", + "# Constants for inference directories and filepaths\n", + "import os\n", + "MODELS_DIR = 'models'\n", + "if not os.path.exists(MODELS_DIR):\n", + " os.mkdir(MODELS_DIR)\n", + "MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n", + "MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n", + "FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 'float_model.tflite')\n", + "MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n", + "SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n", + "\n", + "QUANT_INPUT_MIN = 0.0\n", + "QUANT_INPUT_MAX = 26.0\n", + "QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6rLYpvtg9P4o", + "colab_type": "text" + }, + "source": [ + "## Setup Environment\n", + "\n", + "Install Dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ed_XpUrU5DvY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%tensorflow_version 1.x\n", + "import tensorflow as tf" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T9Ty5mR58E4i", + "colab_type": "text" + }, + "source": [ + "**DELETE** any old data from previous runs\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "APGx0fEh7hFF", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GfEUlfFBizio", + "colab_type": "text" + }, + "source": [ + "Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yZArmzT85SLq", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!git clone -q --depth 1 https://github.com/tensorflow/tensorflow" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nS9swHLSi7Bi", + "colab_type": "text" + }, + "source": [ + "Load TensorBoard to visualize the accuracy and loss as training proceeds.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "q4qF1VxP3UE4", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir {LOGS_DIR}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x1J96Ron-O4R", + "colab_type": "text" + }, + "source": [ + "## Training\n", + "\n", + "The following script downloads the dataset and begin training." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VJsEZx6lynbY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n", + "--data_dir={DATASET_DIR} \\\n", + "--wanted_words={WANTED_WORDS} \\\n", + "--silence_percentage={SILENT_PERCENTAGE} \\\n", + "--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n", + "--preprocess={PREPROCESS} \\\n", + "--window_stride={WINDOW_STRIDE} \\\n", + "--model_architecture={MODEL_ARCHITECTURE} \\\n", + "--how_many_training_steps={TRAINING_STEPS} \\\n", + "--learning_rate={LEARNING_RATE} \\\n", + "--train_dir={TRAIN_DIR} \\\n", + "--summaries_dir={LOGS_DIR} \\\n", + "--verbosity={VERBOSITY} \\\n", + "--eval_step_interval={EVAL_STEP_INTERVAL} \\\n", + "--save_step_interval={SAVE_STEP_INTERVAL}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UczQKtqLi7OJ", + "colab_type": "text" + }, + "source": [ + "# Skipping the training\n", + "\n", + "If you don't want to spend an hour or two training the model from scratch, you can download pretrained checkpoints by uncommenting the lines below (removing the '#'s at the start of each line) and running them." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RZw3VNlnla-J", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#!curl -O \"https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_micro_train_2020_05_10.tgz\"\n", + "#!tar xzf speech_micro_train_2020_05_10.tgz" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XQUJLrdS-ftl", + "colab_type": "text" + }, + "source": [ + "## Generate a TensorFlow Model for Inference\n", + "\n", + "Combine relevant training results (graph, weights, etc) into a single file for inference. This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xyc3_eLh9sAg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!rm -rf {SAVED_MODEL}\n", + "!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n", + "--wanted_words=$WANTED_WORDS \\\n", + "--window_stride_ms=$WINDOW_STRIDE \\\n", + "--preprocess=$PREPROCESS \\\n", + "--model_architecture=$MODEL_ARCHITECTURE \\\n", + "--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'{TOTAL_STEPS} \\\n", + "--save_format=saved_model \\\n", + "--output_file={SAVED_MODEL}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_DBGDxVI-nKG", + "colab_type": "text" + }, + "source": [ + "## Generate a TensorFlow Lite Model\n", + "\n", + "Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n", + "\n", + "The following cell will also print the model size, which will be under 20 kilobytes." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RIitkqvGWmre", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import sys\n", + "# We add this path so we can import the speech processing modules.\n", + "sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n", + "import input_data\n", + "import models\n", + "import numpy as np" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "kzqECqMxgBh4", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SAMPLE_RATE = 16000\n", + "CLIP_DURATION_MS = 1000\n", + "WINDOW_SIZE_MS = 30.0\n", + "FEATURE_BIN_COUNT = 40\n", + "BACKGROUND_FREQUENCY = 0.8\n", + "BACKGROUND_VOLUME_RANGE = 0.1\n", + "TIME_SHIFT_MS = 100.0\n", + "\n", + "DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n", + "VALIDATION_PERCENTAGE = 10\n", + "TESTING_PERCENTAGE = 10" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rNQdAplJV1fz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "model_settings = models.prepare_model_settings(\n", + " len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n", + " SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n", + " WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n", + "audio_processor = input_data.AudioProcessor(\n", + " DATA_URL, DATASET_DIR,\n", + " SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n", + " WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n", + " TESTING_PERCENTAGE, model_settings, LOGS_DIR)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lBj_AyCh1cC0", + "colab_type": "code", + "colab": {} + }, + "source": [ + "with tf.Session() as sess:\n", + " float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n", + " float_tflite_model = float_converter.convert()\n", + " float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n", + " print(\"Float model is %d bytes\" % float_tflite_model_size)\n", + "\n", + " converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n", + " converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + " converter.inference_input_type = tf.lite.constants.INT8\n", + " converter.inference_output_type = tf.lite.constants.INT8\n", + " def representative_dataset_gen():\n", + " for i in range(100):\n", + " data, _ = audio_processor.get_data(1, i*1, model_settings,\n", + " BACKGROUND_FREQUENCY, \n", + " 
BACKGROUND_VOLUME_RANGE,\n", + " TIME_SHIFT_MS,\n", + " 'testing',\n", + " sess)\n", + " flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n", + " yield [flattened_data]\n", + " converter.representative_dataset = representative_dataset_gen\n", + " tflite_model = converter.convert()\n", + " tflite_model_size = open(MODEL_TFLITE, \"wb\").write(tflite_model)\n", + " print(\"Quantized model is %d bytes\" % tflite_model_size)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EeLiDZTbLkzv", + "colab_type": "text" + }, + "source": [ + "## Testing the TensorFlow Lite model's accuracy\n", + "\n", + "Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wQsEteKRLryJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Helper function to run inference\n", + "def run_tflite_inference(tflite_model_path, model_type=\"Float\"):\n", + " # Load test data\n", + " np.random.seed(0) # set random seed for reproducible test results.\n", + " with tf.Session() as sess:\n", + " test_data, test_labels = audio_processor.get_data(\n", + " -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n", + " TIME_SHIFT_MS, 'testing', sess)\n", + " test_data = np.expand_dims(test_data, axis=1).astype(np.float32)\n", + "\n", + " # Initialize the interpreter\n", + " interpreter = tf.lite.Interpreter(tflite_model_path)\n", + " interpreter.allocate_tensors()\n", + "\n", + " input_details = interpreter.get_input_details()[0]\n", + " output_details = interpreter.get_output_details()[0]\n", + "\n", + " # For quantized models, manually quantize the input data from float to integer\n", + " if model_type == \"Quantized\":\n", + " input_scale, input_zero_point = input_details[\"quantization\"]\n", + " test_data = test_data / input_scale + input_zero_point\n", + " test_data = test_data.astype(input_details[\"dtype\"])\n", + "\n", + " correct_predictions = 0\n", + " for i in range(len(test_data)):\n", + " interpreter.set_tensor(input_details[\"index\"], test_data[i])\n", + " interpreter.invoke()\n", + " output = interpreter.get_tensor(output_details[\"index\"])[0]\n", + " top_prediction = output.argmax()\n", + " correct_predictions += (top_prediction == test_labels[i])\n", + "\n", + " print('%s model accuracy is %f%% (Number of test samples=%d)' % (\n", + " model_type, (correct_predictions * 100) / len(test_data), len(test_data)))" + ], + "execution_count": 110, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "l-pD52Na6jRa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Compute float model accuracy\n", + "run_tflite_inference(FLOAT_MODEL_TFLITE)\n", + "\n", + "# Compute quantized model accuracy\n", + "run_tflite_inference(MODEL_TFLITE, model_type='Quantized')" + ], + "execution_count": 111, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dt6Zqbxu-wIi", + "colab_type": "text" + }, + "source": [ + "## Generate a TensorFlow Lite for MicroControllers Model\n", + "Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for Microcontrollers." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XohZOTjR8ZyE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Install xxd if it is not available\n", + "!apt-get update && apt-get -qq install xxd\n", + "# Convert to a C source file\n", + "!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n", + "# Update variable names\n", + "REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n", + "!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pQnN0i_-0L2", + "colab_type": "text" + }, + "source": [ + "## Deploy to a Microcontroller\n", + "\n", + "Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n", + "\n", + "**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook.\n", + "\n", + "**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eoYyh0VU8pca", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Print the C source file\n", + "!cat {MODEL_TFLITE_MICRO}" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 5d4a29eaf590b4a3068ef4d0b7bea9d4f7bd9369 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 22 Jun 2020 16:50:08 -0700 Subject: [PATCH 0836/1390] Special case wrapping of ndarrays in the gradient tape code. 
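
A minimal usage sketch of the intended effect (not part of the original change; it assumes the internal `tensorflow.python.ops.numpy_ops` module path, the `asarray` symbol, and the ndarray operator overloads present at this revision):

    import tensorflow as tf
    from tensorflow.python.ops import numpy_ops as tnp  # internal tf-numpy module

    x = tnp.asarray([1.0, 2.0, 3.0])
    with tf.GradientTape() as tape:
      tape.watch(x)   # watch the tf-numpy ndarray source
      y = x * x       # arithmetic dispatches to tf ops, so the tape records it
    grad = tape.gradient(y, x)
    # With this change, `grad` comes back wrapped as an ndarray because both the
    # target and the source are ndarrays; before, a raw tf.Tensor was returned
    # even when the inputs were ndarrays.
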
PiperOrigin-RevId: 317762474 Change-Id: Ie848ad90a88aff5b2faef4069c3f05887038c367 --- tensorflow/python/eager/BUILD | 1 + tensorflow/python/eager/backprop.py | 24 ++++++++++++++++++-- tensorflow/python/ops/numpy_ops/np_arrays.py | 8 ++----- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index f51bd97e488..408d784ae82 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -588,6 +588,7 @@ py_library( "//tensorflow/python:tensor_shape", "//tensorflow/python:unconnected_gradients", "//tensorflow/python:util", + "//tensorflow/python/ops/numpy_ops:numpy", "//tensorflow/python/ops/parallel_for:control_flow_ops", "@six_archive//:six", ], diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index 8da3f71360a..5800a51f89a 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -62,6 +62,9 @@ from tensorflow.python.util.tf_export import tf_export pfor_ops = LazyLoader( "pfor_ops", globals(), "tensorflow.python.ops.parallel_for.control_flow_ops") +np_arrays = LazyLoader( + "np_arrays", globals(), + "tensorflow.python.ops.numpy_ops.np_arrays") function = LazyLoader("function", globals(), "tensorflow.python.eager.function") @@ -721,9 +724,11 @@ pywrap_tfe.TFE_Py_RegisterVSpace(_default_vspace) def _handle_or_self(x): - """If x is ResourceVariable, return its handle, else x.""" + """Unwrap resource variable/ndarray to return tensors.""" if resource_variable_ops.is_resource_variable(x): - x = x.handle + return x.handle + if isinstance(x, np_arrays.ndarray): + return x.data return x @@ -1023,6 +1028,7 @@ class GradientTape(object): "gradient in order to compute higher order " "derivatives.", 1) + num_ndarrays = 0 flat_targets = [] for t in nest.flatten(target): if not backprop_util.IsTrainable(t): @@ -1033,7 +1039,12 @@ class GradientTape(object): if resource_variable_ops.is_resource_variable(t): with self: t = ops.convert_to_tensor(t) + elif isinstance(t, np_arrays.ndarray): + t = t.data + num_ndarrays += 1 flat_targets.append(t) + # Only rewrap if all targets are ndarray. If not, prefer tensors. + rewrap_as_ndarray = num_ndarrays == len(flat_targets) flat_sources = nest.flatten(sources) flat_sources_raw = flat_sources @@ -1066,6 +1077,9 @@ class GradientTape(object): self._watched_variables = self._tape.watched_variables() self._tape = None + if rewrap_as_ndarray: + flat_grad = nest.map_structure(np_arrays.tensor_to_ndarray, flat_grad) + grad = nest.pack_sequence_as(sources, flat_grad) return grad @@ -1120,6 +1134,10 @@ class GradientTape(object): ValueError: If vectorization of jacobian computation fails. """ flat_sources = nest.flatten(sources) + rewrap_as_ndarray = False + if isinstance(target, np_arrays.ndarray): + target = target.data + rewrap_as_ndarray = True target_static_shape = target.shape target_shape = array_ops.shape(target) # Note that we push and pop the tape here and below. 
This is needed since we @@ -1169,6 +1187,8 @@ class GradientTape(object): out = array_ops.reshape(out, new_shape) if context.executing_eagerly(): out.set_shape(target_static_shape.concatenate(flat_sources[i].shape)) + if rewrap_as_ndarray: + out = np_arrays.tensor_to_ndarray(out) output[i] = out return nest.pack_sequence_as(sources, output) diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py index fd26318bea9..eca84421d1b 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays.py @@ -82,10 +82,10 @@ class NdarraySpec(type_spec.BatchableTypeSpec): return (self._data_spec,) def _batch(self, batch_size): - return NdarraySpec(self._data_spec.batch(batch_size)) + return NdarraySpec(self._data_spec._batch(batch_size)) # pylint: disable=protected-access def _unbatch(self): - return NdarraySpec(self._data_spec.unbatch()) + return NdarraySpec(self._data_spec._unbatch()) # pylint: disable=protected-access class ndarray(composite_tensor.CompositeTensor): # pylint: disable=invalid-name @@ -306,10 +306,6 @@ class ndarray(composite_tensor.CompositeTensor): # pylint: disable=invalid-name def __repr__(self): return 'ndarray<{}>'.format(self.data.__repr__()) - @property - def _id(self): - return self.data._id # pylint: disable=protected-access - def tensor_to_ndarray(tensor): return ndarray.from_tensor(tensor) From 6b0b4876f8fe024485d02080d0be2d541ece6b17 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Mon, 22 Jun 2020 16:54:12 -0700 Subject: [PATCH 0837/1390] [TF-numpy] Adds __matmul__ method to ndarray. PiperOrigin-RevId: 317763209 Change-Id: I3a2ab8e07b040144239ecee58d4ba9267f5b8977 --- tensorflow/python/ops/numpy_ops/np_math_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py index b242e51a2e5..427aa96e5a4 100644 --- a/tensorflow/python/ops/numpy_ops/np_math_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py @@ -954,6 +954,7 @@ setattr(np_arrays.ndarray, '__pow__', _wrap(power)) setattr(np_arrays.ndarray, '__rpow__', _wrap(power, True)) setattr(np_arrays.ndarray, '__truediv__', _wrap(true_divide)) setattr(np_arrays.ndarray, '__rtruediv__', _wrap(true_divide, True)) +setattr(np_arrays.ndarray, '__matmul__', _wrap(matmul)) def _comparison(tf_fun, x1, x2, cast_bool_to_int=False): From 8bec96c5a8a14239a4b607e5e4c4ba1c5a9971f8 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Tue, 23 Jun 2020 00:44:39 +0000 Subject: [PATCH 0838/1390] Add tests to verify type annotations are correctly added --- .../python/framework/python_op_gen_test.cc | 251 +++++++++++++++++- 1 file changed, 249 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen_test.cc b/tensorflow/python/framework/python_op_gen_test.cc index 5185086fdd3..2561195c407 100644 --- a/tensorflow/python/framework/python_op_gen_test.cc +++ b/tensorflow/python/framework/python_op_gen_test.cc @@ -20,22 +20,269 @@ limitations under the License. 
#include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" + namespace tensorflow { namespace { +constexpr char kBaseOpDef[] = R"( +op { + name: "Foo" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "y" + type_attr: "T2" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_UINT8 + type: DT_INT8 + } + } + } + attr { + name: "T2" + type: "type" + allowed_values { + list { + type: DT_STRING + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + summary: "Summary for op Foo." + description: "Description for op Foo." +} +op { + name: "Bar" + input_arg { + name: "x" + type: DT_STRING + } + input_arg { + name: "y" + type: DT_QINT8 + } + output_arg { + name: "output" + type: DT_BOOL + } + summary: "Summary for op Bar." + description: "Description for op Bar." +} +op { + name: "FooBar" + input_arg { + name: "x" + type: DT_FLOAT + } + output_arg { + name: "output" + type: DT_BOOL + } + attr { + name: "t" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_INT8 + } + } + } + attr { + name: "var1" + type: "bool" + default_value { + b: false + } + } + attr { + name: "var2" + type: "int" + default_value { + i: 0 + } + } + summary: "Summary for op FooBar." + description: "Description for op FooBar." +} +op { + name: "Baz" + input_arg { + name: "inputs" + number_attr: "N" + type_list_attr: "T" + } + output_arg { + name: "output1" + type: DT_BOOL + } + output_arg { + name: "output2" + type: DT_BOOL + } + attr { + name: "T" + type: "bool" + } + attr { + name: "N" + type: "int" + } + summary: "Summary for op Baz." + description: "Description for op Baz." +} +)"; + +std::unordered_set type_annotate_ops { + "Foo", + "Bar", + "FooBar", + "Baz" +}; + + +void ExpectHasSubstr(const string& s, const string& expected) { + EXPECT_TRUE(absl::StrContains(s, expected)) + << "'Generated ops does not contain '" << expected << "'"; +} + +void ExpectDoesNotHaveSubstr(const string& s, const string& expected) { + EXPECT_FALSE(absl::StrContains(s, expected)) + << "'Generated ops contains '" << expected << "'"; +} + +void ExpectSubstrOrder(const string& s, const string& before, + const string& after) { + int before_pos = s.find(before); + int after_pos = s.find(after); + ASSERT_NE(std::string::npos, before_pos); + ASSERT_NE(std::string::npos, after_pos); + EXPECT_LT(before_pos, after_pos) + << before << "' is not before '" << after; +} + TEST(PythonOpGen, Basic) { OpList ops; OpRegistry::Global()->Export(false, &ops); ApiDefMap api_def_map(ops); - string code = GetPythonOps(ops, api_def_map, {}, ""); + string code = GetPythonOps(ops, api_def_map, {}, "", {}); EXPECT_TRUE(absl::StrContains(code, "def case")); - // TODO(mdan): Add tests to verify type annotations are correctly added. 
} +TEST(PythonOpGen, TypeAnnotateSingleTypeTensor) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string typed_bar = "def bar(x: _ops.Tensor[_dtypes.String], y: _ops.Tensor[_dtypes.QInt8], name=None) -> _ops.Tensor[_dtypes.Bool]:"; + ExpectHasSubstr(code, typed_bar); + + const string untyped_bar = "def bar(x, y, name=None):"; + ExpectDoesNotHaveSubstr(code, untyped_bar); +} + +TEST(PythonOpGen, TypeAnnotateMultiTypeTensor) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string typed_foo = "def foo(x: _ops.Tensor[TV_Foo_T], y: _ops.Tensor[TV_Foo_T2], name=None) -> _ops.Tensor[TV_Foo_T]:"; + ExpectHasSubstr(code, typed_foo); +} + +TEST(PythonOpGen, GenerateCorrectTypeVars) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string typevars_foo = R"( +TV_Foo_T = TypeVar("TV_Foo_T", _dtypes.Int8, _dtypes.UInt8) +TV_Foo_T2 = TypeVar("TV_Foo_T2", _dtypes.Float32, _dtypes.Float64, _dtypes.String) +)"; + + ExpectHasSubstr(code, typevars_foo); +} + +TEST(PythonOpGen, TypeAnnotateFallback) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string typed_foo_fallback = "def foo_eager_fallback(x: _ops.Tensor[TV_Foo_T], y: _ops.Tensor[TV_Foo_T2], name, ctx) -> _ops.Tensor[TV_Foo_T]:"; + ExpectHasSubstr(code, typed_foo_fallback); +} + +TEST(PythonOpGen, GenerateTypeVarAboveOp) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string typevar_foo = "TV_Foo_"; + const string def_foo = "def foo"; + ExpectSubstrOrder(code, typevar_foo, def_foo); +} + + +TEST(PythonOpGen, TypeAnnotateDefaultParams) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string params = "def foo_bar(x: _ops.Tensor[_dtypes.Float32], t: TV_FooBar_t, var1: bool = False, var2: int = 0, name=None)"; + const string params_fallback = "def foo_bar_eager_fallback(x: _ops.Tensor[_dtypes.Float32], t: TV_FooBar_t, var1: bool, var2: int, name, ctx)"; + ExpectHasSubstr(code, params); + ExpectHasSubstr(code, params_fallback); +} + +TEST(PythonOpGen, NoTypingSequenceTensors) { + OpList op_defs; + OpRegistry::Global()->Export(false, &op_defs); + protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs); // NOLINT + ApiDefMap api_def_map(op_defs); + + string code = GetPythonOps(op_defs, api_def_map, {}, "", type_annotate_ops); + + const string baz_def_line = "def 
baz(inputs, name=None):"; + + ExpectHasSubstr(code, baz_def_line); +} + // TODO(mdan): Include more tests with synhtetic ops and api defs. } // namespace From 23e387975e683a971dd0cbe2198cb60c5cf71fc4 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 22 Jun 2020 16:58:50 -0700 Subject: [PATCH 0839/1390] Rename tpu_load_library to tpu_api_dlsym_initializer. This better reflects what the load library module does. PiperOrigin-RevId: 317763973 Change-Id: I66169394c2f1d77b260323cb81d76f0f4e167579 --- tensorflow/core/BUILD | 2 +- tensorflow/core/framework/load_library.cc | 2 +- tensorflow/core/tpu/BUILD | 24 ++++++-- tensorflow/core/tpu/kernels/BUILD | 4 +- .../core/tpu/kernels/tpu_configuration_ops.cc | 2 +- .../tpu/kernels/tpu_mesh_state_interface.h | 2 +- tensorflow/core/tpu/tpu_api.cc | 57 +++++++++++++++++++ .../tpu/{tpu_library_loader.h => tpu_api.h} | 12 +--- ...loader.cc => tpu_api_dlsym_initializer.cc} | 40 +------------ .../core/tpu/tpu_api_dlsym_initializer.h | 38 +++++++++++++ ...c => tpu_api_dlsym_initializer_windows.cc} | 20 +------ tensorflow/stream_executor/tpu/BUILD | 12 ++-- .../stream_executor/tpu/tpu_executor.cc | 2 +- .../stream_executor/tpu/tpu_node_context.cc | 2 +- .../stream_executor/tpu/tpu_platform.cc | 2 +- tensorflow/stream_executor/tpu/tpu_stream.h | 2 +- tensorflow/stream_executor/tpu/tpu_timer.h | 2 +- .../tpu/tpu_transfer_manager.cc | 2 +- 18 files changed, 141 insertions(+), 86 deletions(-) create mode 100644 tensorflow/core/tpu/tpu_api.cc rename tensorflow/core/tpu/{tpu_library_loader.h => tpu_api.h} (77%) rename tensorflow/core/tpu/{tpu_library_loader.cc => tpu_api_dlsym_initializer.cc} (71%) create mode 100644 tensorflow/core/tpu/tpu_api_dlsym_initializer.h rename tensorflow/core/tpu/{tpu_library_loader_windows.cc => tpu_api_dlsym_initializer_windows.cc} (64%) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 695035c91e9..d0be6ee9597 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2259,7 +2259,7 @@ tf_cuda_library( "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api_dlsym_initializer", "//tensorflow/core/util:einsum_op_util", "//tensorflow/core/util:padding", "//tensorflow/core/util:port", diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc index ab08d644074..22181e1c8be 100644 --- a/tensorflow/core/framework/load_library.cc +++ b/tensorflow/core/framework/load_library.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mem.h" #if !defined(IS_MOBILE_PLATFORM) -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api_dlsym_initializer.h" #endif // IS_MOBILE_PLATFORM namespace tensorflow { diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 9e89cd69235..aa811f23672 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -107,15 +107,31 @@ cc_library( ) cc_library( - name = "tpu_library_loader", + name = "tpu_api", + srcs = ["tpu_api.cc"], + hdrs = ["tpu_api.h"], + deps = [ + ":libtftpu_header", + ":tpu_config_c_api", + "//tensorflow/core/tpu/kernels:tpu_compile_c_api_hdrs", + "//tensorflow/core/tpu/kernels:tpu_mesh_state_c_api_hdrs", + "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", + "//tensorflow/stream_executor/tpu:tpu_executor_c_api_hdrs", + "//tensorflow/stream_executor/tpu:tpu_node_context_c_api_hdrs", + ], +) + +cc_library( + name = "tpu_api_dlsym_initializer", srcs = if_windows( - ["tpu_library_loader_windows.cc"], - otherwise = ["tpu_library_loader.cc"], + ["tpu_api_dlsym_initializer_windows.cc"], + otherwise = ["tpu_api_dlsym_initializer.cc"], ), - hdrs = ["tpu_library_loader.h"], + hdrs = ["tpu_api_dlsym_initializer.h"], visibility = ["//visibility:public"], deps = [ ":libtftpu_header", + ":tpu_api", ":tpu_config_c_api", ":tpu_library_init_fns", "//tensorflow/core/platform:errors", diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index a9f2202cd45..f69c97b81de 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -72,10 +72,10 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:refcount", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/core/tpu:tpu_config_c_api", "//tensorflow/core/tpu:tpu_configuration", "//tensorflow/core/tpu:tpu_defs", - "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/stream_executor/tpu:proto_helper", ], alwayslink = 1, @@ -224,7 +224,7 @@ cc_library( "//tensorflow/compiler/xla/service", "//tensorflow/core:framework", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", ], ) diff --git a/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc b/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc index 12a3256a44f..583f1aec207 100644 --- a/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc +++ b/tensorflow/core/tpu/kernels/tpu_configuration_ops.cc @@ -24,10 +24,10 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/core/tpu/tpu_config_c_api.h" #include "tensorflow/core/tpu/tpu_configuration.h" #include "tensorflow/core/tpu/tpu_defs.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" namespace tensorflow { diff --git a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h index 3eff3be4915..e2ac38b5f84 100644 --- a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h +++ b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h @@ -21,7 +21,7 @@ limitations under the License. 
#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" namespace tensorflow { diff --git a/tensorflow/core/tpu/tpu_api.cc b/tensorflow/core/tpu/tpu_api.cc new file mode 100644 index 00000000000..8dad82b3029 --- /dev/null +++ b/tensorflow/core/tpu/tpu_api.cc @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_api.h" + +namespace tensorflow { +namespace tpu { + +TfTpu_BaseFn* InitializeApiFn() { + static TfTpu_BaseFn base_fn; + return &base_fn; +} + +TfTpu_ConfigApiFn* ConfigApiFn() { + static TfTpu_ConfigApiFn config_api_fn; + return &config_api_fn; +} + +TfTpu_MeshStateApiFn* MeshStateApiFn() { + static TfTpu_MeshStateApiFn mesh_state_api_fn; + return &mesh_state_api_fn; +} + +TfTpu_CompileApiFn* CompileApiFn() { + static TfTpu_CompileApiFn compile_api_fn; + return &compile_api_fn; +} + +TfTpu_ExecutorApiFn* ExecutorApiFn() { + static TfTpu_ExecutorApiFn executor_api_fn; + return &executor_api_fn; +} + +TfTpu_NodeContextApiFn* NodeContextApiFn() { + static TfTpu_NodeContextApiFn node_context_api_fn; + return &node_context_api_fn; +} + +TfTpu_UtilApiFn* UtilApiFn() { + static TfTpu_UtilApiFn util_api_fn; + return &util_api_fn; +} + +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_library_loader.h b/tensorflow/core/tpu/tpu_api.h similarity index 77% rename from tensorflow/core/tpu/tpu_library_loader.h rename to tensorflow/core/tpu/tpu_api.h index ba6c324707d..c47ace6601d 100644 --- a/tensorflow/core/tpu/tpu_library_loader.h +++ b/tensorflow/core/tpu/tpu_api.h @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ -#define TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ +#ifndef TENSORFLOW_CORE_TPU_TPU_API_H_ +#define TENSORFLOW_CORE_TPU_TPU_API_H_ -#include "tensorflow/core/platform/status.h" #include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" @@ -25,13 +24,9 @@ limitations under the License. #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" #include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" -// LINT.IfChange namespace tensorflow { namespace tpu { -Status InitializeTpuLibrary(void* library_handle); - -// TODO(frankchn): Separate out API functions from the loader. 
TfTpu_BaseFn* InitializeApiFn(); TfTpu_ConfigApiFn* ConfigApiFn(); @@ -48,6 +43,5 @@ TfTpu_UtilApiFn* UtilApiFn(); } // namespace tpu } // namespace tensorflow -// LINT.ThenChange(//tensorflow/core/tpu/tpu_library_loader_windows.cc) -#endif // TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ +#endif // TENSORFLOW_CORE_TPU_TPU_API_H_ diff --git a/tensorflow/core/tpu/tpu_library_loader.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc similarity index 71% rename from tensorflow/core/tpu/tpu_library_loader.cc rename to tensorflow/core/tpu/tpu_api_dlsym_initializer.cc index 834b86e68a7..c6666421327 100644 --- a/tensorflow/core/tpu/tpu_library_loader.cc +++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc @@ -13,14 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// TODO(frankchn): Rename to `tpu_api_dlsym_initializer` or similar. - -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api_dlsym_initializer.h" #include #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" #include "tensorflow/stream_executor/tpu/tpu_platform.h" @@ -39,41 +38,6 @@ namespace tpu { #include "tensorflow/core/tpu/tpu_library_init_fns.inc" -TfTpu_BaseFn* InitializeApiFn() { - static TfTpu_BaseFn base_fn; - return &base_fn; -} - -TfTpu_ConfigApiFn* ConfigApiFn() { - static TfTpu_ConfigApiFn config_api_fn; - return &config_api_fn; -} - -TfTpu_MeshStateApiFn* MeshStateApiFn() { - static TfTpu_MeshStateApiFn mesh_state_api_fn; - return &mesh_state_api_fn; -} - -TfTpu_CompileApiFn* CompileApiFn() { - static TfTpu_CompileApiFn compile_api_fn; - return &compile_api_fn; -} - -TfTpu_ExecutorApiFn* ExecutorApiFn() { - static TfTpu_ExecutorApiFn executor_api_fn; - return &executor_api_fn; -} - -TfTpu_NodeContextApiFn* NodeContextApiFn() { - static TfTpu_NodeContextApiFn node_context_api_fn; - return &node_context_api_fn; -} - -TfTpu_UtilApiFn* UtilApiFn() { - static TfTpu_UtilApiFn util_api_fn; - return &util_api_fn; -} - Status InitializeTpuLibrary(void* library_handle) { bool shared_object_loaded = true; if (library_handle == nullptr) { diff --git a/tensorflow/core/tpu/tpu_api_dlsym_initializer.h b/tensorflow/core/tpu/tpu_api_dlsym_initializer.h new file mode 100644 index 00000000000..257fa25ad37 --- /dev/null +++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_API_DLSYM_INITIALIZER_H_ +#define TENSORFLOW_CORE_TPU_TPU_API_DLSYM_INITIALIZER_H_ + +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" +#include "tensorflow/core/tpu/libtftpu.h" +#include "tensorflow/core/tpu/tpu_config_c_api.h" +#include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" +#include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" + +// LINT.IfChange +namespace tensorflow { +namespace tpu { + +Status InitializeTpuLibrary(void* library_handle); + +} // namespace tpu +} // namespace tensorflow +// LINT.ThenChange(//tensorflow/core/tpu/tpu_api_dlsym_initializer_windows.cc) + +#endif // TENSORFLOW_CORE_TPU_TPU_API_DLSYM_INITIALIZER_H_ diff --git a/tensorflow/core/tpu/tpu_library_loader_windows.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer_windows.cc similarity index 64% rename from tensorflow/core/tpu/tpu_library_loader_windows.cc rename to tensorflow/core/tpu/tpu_api_dlsym_initializer_windows.cc index 7cf1b5cdb1d..f453a98e558 100644 --- a/tensorflow/core/tpu/tpu_library_loader_windows.cc +++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer_windows.cc @@ -15,28 +15,14 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api_dlsym_initializer.h" -// Reminder: Update tpu_library_loader.cc if you are adding new publicly -// visible methods. +// Reminder: Update tpu_api_dlsym_initializer_windows.cc if you are adding new +// publicly visible methods. 
namespace tensorflow { namespace tpu { -TfTpu_BaseFn* InitializeApiFn() { return nullptr; } - -TfTpu_ConfigApiFn* ConfigApiFn() { return nullptr; } - -TfTpu_MeshStateApiFn* MeshStateApiFn() { return nullptr; } - -TfTpu_CompileApiFn* CompileApiFn() { return nullptr; } - -TfTpu_ExecutorApiFn* ExecutorApiFn() { return nullptr; } - -TfTpu_NodeContextApiFn* NodeContextApiFn() { return nullptr; } - -TfTpu_UtilApiFn* UtilApiFn() { return nullptr; } - Status InitializeTpuLibrary(void* library_handle) { return errors::Unimplemented( "Loading TPU library is not supported on Windows."); diff --git a/tensorflow/stream_executor/tpu/BUILD b/tensorflow/stream_executor/tpu/BUILD index bf88e9809d0..720ba6bc0c3 100644 --- a/tensorflow/stream_executor/tpu/BUILD +++ b/tensorflow/stream_executor/tpu/BUILD @@ -70,7 +70,7 @@ cc_library( ":status_helper", ":tpu_executor_c_api_hdrs", ":tpu_stream_interface", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", ], ) @@ -81,7 +81,7 @@ cc_library( deps = [ ":tpu_executor_c_api_hdrs", "//tensorflow/core/platform:types", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", ], ) @@ -101,7 +101,7 @@ cc_library( ":tpu_timer", "//tensorflow/c:tf_status", "//tensorflow/core:lib", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/container:flat_hash_map", @@ -151,7 +151,7 @@ cc_library( "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:framework", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/memory", @@ -169,7 +169,7 @@ cc_library( ":tpu_platform_interface", "//tensorflow/c:tf_status", "//tensorflow/core/platform:types", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", "@com_google_absl//absl/container:flat_hash_map", ], @@ -201,7 +201,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", ], ) diff --git a/tensorflow/stream_executor/tpu/tpu_executor.cc b/tensorflow/stream_executor/tpu/tpu_executor.cc index cb1410880eb..95c32714732 100644 --- a/tensorflow/stream_executor/tpu/tpu_executor.cc +++ b/tensorflow/stream_executor/tpu/tpu_executor.cc @@ -17,7 +17,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" #include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" diff --git a/tensorflow/stream_executor/tpu/tpu_node_context.cc b/tensorflow/stream_executor/tpu/tpu_node_context.cc index 356ede40fb3..b502264cfc7 100644 --- a/tensorflow/stream_executor/tpu/tpu_node_context.cc +++ b/tensorflow/stream_executor/tpu/tpu_node_context.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" #include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" diff --git a/tensorflow/stream_executor/tpu/tpu_platform.cc b/tensorflow/stream_executor/tpu/tpu_platform.cc index 4bccd822e91..97a97a63351 100644 --- a/tensorflow/stream_executor/tpu/tpu_platform.cc +++ b/tensorflow/stream_executor/tpu/tpu_platform.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/stream_executor/tpu/tpu_platform.h" #include "tensorflow/c/tf_status.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/tpu/status_helper.h" #include "tensorflow/stream_executor/tpu/tpu_executor.h" diff --git a/tensorflow/stream_executor/tpu/tpu_stream.h b/tensorflow/stream_executor/tpu/tpu_stream.h index e1aa1164248..209a624b462 100644 --- a/tensorflow/stream_executor/tpu/tpu_stream.h +++ b/tensorflow/stream_executor/tpu/tpu_stream.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_STREAM_H_ #define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_STREAM_H_ -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" #include "tensorflow/stream_executor/tpu/status_helper.h" diff --git a/tensorflow/stream_executor/tpu/tpu_timer.h b/tensorflow/stream_executor/tpu/tpu_timer.h index d7f8f660b37..0ad48ce8a80 100644 --- a/tensorflow/stream_executor/tpu/tpu_timer.h +++ b/tensorflow/stream_executor/tpu/tpu_timer.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_TIMER_H_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" diff --git a/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc b/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc index 934fabbf54d..c55af7d58b9 100644 --- a/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc +++ b/tensorflow/stream_executor/tpu/tpu_transfer_manager.cc @@ -17,7 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/tpu/tpu_library_loader.h" +#include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/tpu/c_api_conversions.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" From 3cc9264e307a1ec221680542c79e86e0f3d13807 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Mon, 22 Jun 2020 17:00:43 -0700 Subject: [PATCH 0840/1390] [TF-numpy] Adds an accessor class for numpy_ops, in order to be tf_exported'ed. 
PiperOrigin-RevId: 317764327 Change-Id: I7af7e421c89b0c754e836a2a84cb67a1a82687a1 --- tensorflow/python/ops/numpy_ops/BUILD | 1 - .../python/ops/numpy_ops/np_accessor.py | 32 ------------------- 2 files changed, 33 deletions(-) delete mode 100644 tensorflow/python/ops/numpy_ops/np_accessor.py diff --git a/tensorflow/python/ops/numpy_ops/BUILD b/tensorflow/python/ops/numpy_ops/BUILD index a70e3f3918d..c4203840c61 100644 --- a/tensorflow/python/ops/numpy_ops/BUILD +++ b/tensorflow/python/ops/numpy_ops/BUILD @@ -11,7 +11,6 @@ py_library( name = "numpy", srcs = [ "__init__.py", - "np_accessor.py", "np_array_ops.py", "np_arrays.py", "np_dtypes.py", diff --git a/tensorflow/python/ops/numpy_ops/np_accessor.py b/tensorflow/python/ops/numpy_ops/np_accessor.py deleted file mode 100644 index 64786d2c50a..00000000000 --- a/tensorflow/python/ops/numpy_ops/np_accessor.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""An accessor class for numpy_ops contents.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.ops import numpy_ops - - -class Numpy: - """An accessor class that forwards attribute accesses to module `numpy_ops`. - """ - - def __getattr__(self, attr): - return getattr(numpy_ops, attr) - - -numpy = Numpy() From 1a3b7af373ab365780729d927ff0117951b3ddfb Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 22 Jun 2020 17:05:24 -0700 Subject: [PATCH 0841/1390] [tfdbg2] Support reading multiple DebugEvent file sets from the same tfdbg run TensorFlow jobs that involve multiple hosts (e.g., parameter-server setups and TPU coordinator-worker setups) can generate >1 DebugEvent file sets when instrumented with tfdbg2's `tf.debugging.experimental.enable_dump_debug_info()`. This CL adds capability to load these multiple file sets belonging to the same tfdbg_run_id to DebugEventsReader and DebugDataReader. PiperOrigin-RevId: 317765159 Change-Id: Ifcf593bd8b404e3e1c3a6f3f3be70bd6b8b73555 --- .../python/debug/lib/debug_events_reader.py | 290 +++++++++++------- .../debug/lib/debug_events_writer_test.py | 87 +++++- .../python/debug/lib/debug_v2_ops_test.py | 19 +- .../debug/lib/dumping_callback_test_lib.py | 7 +- 4 files changed, 282 insertions(+), 121 deletions(-) diff --git a/tensorflow/python/debug/lib/debug_events_reader.py b/tensorflow/python/debug/lib/debug_events_reader.py index 743cea7103a..915c65f7594 100644 --- a/tensorflow/python/debug/lib/debug_events_reader.py +++ b/tensorflow/python/debug/lib/debug_events_reader.py @@ -46,28 +46,37 @@ class DebugEventsReader(object): # penalty. 
_READER_RELEASE_PER = 100 + _METADATA_SUFFIX = ".metadata" + _SOURCE_FILE_SUFFIX = ".source_files" + _STACK_FRAMES_SUFFIX = ".stack_frames" + _GRAPHS_SUFFIX = ".graphs" + _EXECUTION_SUFFIX = ".execution" + _GRAPH_EXECUTION_TRACES_SUFFIX = ".graph_execution_traces" + def __init__(self, dump_root): if not file_io.is_directory(dump_root): raise ValueError("Specified dump_root is not a directory: %s" % dump_root) - metadata_paths = file_io.get_matching_files( - os.path.join(dump_root, "*.metadata")) - if not metadata_paths: - raise ValueError("Cannot find any metadata file in directory: %s" % - dump_root) - elif len(metadata_paths) > 1: - raise ValueError( - "Unexpected: Found multiple (%d) metadata in directory: %s" % - (len(metadata_paths), dump_root)) - self._metadata_path = compat.as_bytes(metadata_paths[0]) - self._metadata_reader = None + self._dump_root = dump_root + self._metadata_paths = self._load_metadata_files() - prefix = metadata_paths[0][:-len(".metadata")] - self._source_files_path = compat.as_bytes("%s.source_files" % prefix) - self._stack_frames_path = compat.as_bytes("%s.stack_frames" % prefix) - self._graphs_path = compat.as_bytes("%s.graphs" % prefix) - self._execution_path = compat.as_bytes("%s.execution" % prefix) - self._graph_execution_traces_path = compat.as_bytes( - "%s.graph_execution_traces" % prefix) + prefixes = [ + metadata_path[:-len(self._METADATA_SUFFIX)] + for metadata_path in self._metadata_paths + ] + prefix = prefixes[0] # This is the prefix of the main file set. + self._source_files_path = compat.as_bytes(prefix + self._SOURCE_FILE_SUFFIX) + self._stack_frames_path = compat.as_bytes(prefix + + self._STACK_FRAMES_SUFFIX) + self._graphs_path = compat.as_bytes(prefix + self._GRAPHS_SUFFIX) + self._execution_path = compat.as_bytes(prefix + self._EXECUTION_SUFFIX) + # There can be multiple .graph_execution_trace files each belonging + # to a file set generated on an individual host, in the case of + # a distributed TensorFlow job. + # This is different from the other debug event files in the file set. + self._graph_execution_traces_paths = [ + compat.as_bytes(prefix + self._GRAPH_EXECUTION_TRACES_SUFFIX) + for prefix in prefixes + ] self._readers = dict() # A map from file path to reader. # A map from file path to current reading offset. self._reader_offsets = dict() @@ -78,6 +87,91 @@ class DebugEventsReader(object): self._offsets = dict() + def _load_metadata_files(self): + """Load and parse metadata files in the dump root. + + Check that all metadata files have a common tfdbg_run_id, and raise + a ValueError if their tfdbg_run_ids differ. + + Returns: + A list of metadata file paths in ascending order of their starting + wall_time timestamp. 
+ """ + + metadata_paths = file_io.get_matching_files( + os.path.join(self._dump_root, "*%s" % self._METADATA_SUFFIX)) + if not metadata_paths: + raise ValueError("Cannot find any tfdbg metadata file in directory: %s" % + self._dump_root) + wall_times = [] + run_ids = [] + tensorflow_versions = [] + file_versions = [] + for metadata_path in metadata_paths: + reader = tf_record.tf_record_random_reader(metadata_path) + try: + record = reader.read(0)[0] + debug_event = debug_event_pb2.DebugEvent.FromString(record) + wall_times.append(debug_event.wall_time) + run_ids.append(debug_event.debug_metadata.tfdbg_run_id) + tensorflow_versions.append( + debug_event.debug_metadata.tensorflow_version) + file_versions.append(debug_event.debug_metadata.file_version) + finally: + reader.close() + self._starting_wall_time = wall_times[0] + self._tfdbg_run_id = run_ids[0] + self._tensorflow_version = tensorflow_versions[0] + self._file_version = file_versions[0] + if len(metadata_paths) == 1: + # Fast path for a common case (only one DebugEvent file set.) + return metadata_paths + + num_no_id = len([run_id for run_id in run_ids if not run_id]) + if num_no_id: + paths_without_run_id = [ + metadata_path + for metadata_path, run_id in zip(metadata_paths, run_ids) + if not run_id + ] + raise ValueError( + "Found %d tfdbg metadata files and %d of them do not " + "have tfdbg run ids. The metadata files without run ids are: %s" % + (len(run_ids), num_no_id, paths_without_run_id)) + elif len(set(run_ids)) != 1: + raise ValueError( + "Unexpected: Found multiple (%d) tfdbg2 runs in directory %s" % + (len(set(run_ids)), self._dump_root)) + # Return the metadata files in ascending order of their timestamps. + paths_and_timestamps = sorted( + zip(metadata_paths, wall_times), key=lambda t: t[1]) + self._starting_wall_time = paths_and_timestamps[0][1] + return [path[0] for path in paths_and_timestamps] + + def starting_wall_time(self): + """Get the starting timestamp of the instrumented TensorFlow program. + + When there are multiple hosts (i.e., multiple tfdbg file sets), the earliest + timestamp among the file sets is returned. It is assumed to be the job that + starts first (e.g., the coordinator). + + Returns: + Starting timestamp in seconds since the epoch, as a float. 
+ """ + return self._starting_wall_time + + def tfdbg_run_id(self): + """Get the run ID of the instrumented TensorFlow program.""" + return self._tfdbg_run_id + + def tensorflow_version(self): + """Get the version string of TensorFlow that the debugged program ran on.""" + return self._tensorflow_version + + def tfdbg_file_version(self): + """Get the tfdbg file format version.""" + return self._file_version + def __enter__(self): return self @@ -139,9 +233,6 @@ class DebugEventsReader(object): self._reader_offsets[file_path] = 0 return self._readers[file_path] - def metadata_iterator(self): - return self._generic_iterator(self._metadata_path) - def source_files_iterator(self): return self._generic_iterator(self._source_files_path) @@ -193,14 +284,18 @@ class DebugEventsReader(object): proto_string = self._get_reader(self._execution_path).read(offset)[0] return debug_event_pb2.DebugEvent.FromString(proto_string) - def graph_execution_traces_iterator(self): - return self._generic_iterator(self._graph_execution_traces_path) + def graph_execution_traces_iterators(self): + return [ + self._generic_iterator(path) + for path in self._graph_execution_traces_paths + ] - def read_graph_execution_traces_event(self, offset): - """Read DebugEvent at given offset from .graph_execution_traces file. + def read_graph_execution_traces_event(self, locator): + """Read DebugEvent at given offset from given .graph_execution_traces file. Args: - offset: Offset to read the DebugEvent proto from. + locator: A (file_index, offset) tuple that locates the DebugEvent + containing the graph execution trace. Returns: A DebugEventProto. @@ -209,9 +304,11 @@ class DebugEventsReader(object): `errors.DataLossError` if offset is at a wrong location. `IndexError` if offset is out of range of the file. """ - with self._reader_read_locks[self._graph_execution_traces_path]: - proto_string = self._get_reader( - self._graph_execution_traces_path).read(offset)[0] + file_index, offset = locator + graph_execution_traces_path = self._graph_execution_traces_paths[file_index] + with self._reader_read_locks[graph_execution_traces_path]: + proto_string = self._get_reader(graph_execution_traces_path).read( + offset)[0] return debug_event_pb2.DebugEvent.FromString(proto_string) def close(self): @@ -227,21 +324,27 @@ class BaseDigest(object): Properties: wall_time: A timestamp for the digest as a `float` (unit: s). - offset: A offset number in the corresponding file that can be used for - fast random read access. + locator: A datum that allows tracng the digest to its original + location. It can be either of the two: + 1. Bytes offset from the beginning of the file as a single integer, + for the case of all digests of the same kind coming from the same + file. + 2. A tuple of a file index and a byte offset. This applies to case + in which the same type of debugger data may come from multple files, + e.g., graph execution traces. 
""" - def __init__(self, wall_time, offset): + def __init__(self, wall_time, locator): self._wall_time = wall_time - self._offset = offset + self._locator = locator @property def wall_time(self): return self._wall_time @property - def offset(self): - return self._offset + def locator(self): + return self._locator def to_json(self): return {"wall_time": self.wall_time} @@ -265,10 +368,10 @@ class ExecutionDigest(BaseDigest): def __init__(self, wall_time, - offset, + locator, op_type, output_tensor_device_ids=None): - super(ExecutionDigest, self).__init__(wall_time, offset) + super(ExecutionDigest, self).__init__(wall_time, locator) self._op_type = op_type self._output_tensor_device_ids = _tuple_or_none(output_tensor_device_ids) @@ -332,7 +435,7 @@ class Execution(ExecutionDigest): debug_tensor_values=None): super(Execution, self).__init__( execution_digest.wall_time, - execution_digest.offset, + execution_digest.locator, execution_digest.op_type, output_tensor_device_ids=execution_digest.output_tensor_device_ids) self._host_name = host_name @@ -556,7 +659,7 @@ class GraphOpCreationDigest(BaseDigest): def __init__(self, wall_time, - offset, + locator, graph_id, op_type, op_name, @@ -565,7 +668,7 @@ class GraphOpCreationDigest(BaseDigest): stack_frame_ids, input_names=None, device_name=None): - super(GraphOpCreationDigest, self).__init__(wall_time, offset) + super(GraphOpCreationDigest, self).__init__(wall_time, locator) self._graph_id = graph_id self._op_type = op_type self._op_name = op_name @@ -640,14 +743,9 @@ class GraphExecutionTraceDigest(BaseDigest): graph. """ - def __init__(self, - wall_time, - offset, - op_type, - op_name, - output_slot, + def __init__(self, wall_time, locator, op_type, op_name, output_slot, graph_id): - super(GraphExecutionTraceDigest, self).__init__(wall_time, offset) + super(GraphExecutionTraceDigest, self).__init__(wall_time, locator) self._op_type = op_type self._op_name = op_name self._output_slot = output_slot @@ -701,13 +799,13 @@ class GraphExecutionTrace(GraphExecutionTraceDigest): tensor_debug_mode, debug_tensor_value=None, device_name=None): - super(GraphExecutionTrace, self).__init__( - graph_execution_trace_digest.wall_time, - graph_execution_trace_digest.offset, - graph_execution_trace_digest.op_type, - graph_execution_trace_digest.op_name, - graph_execution_trace_digest.output_slot, - graph_execution_trace_digest.graph_id) + super(GraphExecutionTrace, + self).__init__(graph_execution_trace_digest.wall_time, + graph_execution_trace_digest.locator, + graph_execution_trace_digest.op_type, + graph_execution_trace_digest.op_name, + graph_execution_trace_digest.output_slot, + graph_execution_trace_digest.graph_id) self._graph_ids = tuple(graph_ids) self._tensor_debug_mode = tensor_debug_mode self._debug_tensor_value = debug_tensor_value @@ -780,17 +878,17 @@ def _parse_tensor_value(tensor_proto, return_list=False): return None -def _execution_digest_from_debug_event_proto(debug_event, offset): +def _execution_digest_from_debug_event_proto(debug_event, locator): """Convert a DebugEvent proto into an ExecutionDigest data object.""" return ExecutionDigest( debug_event.wall_time, - offset, + locator, debug_event.execution.op_type, - output_tensor_device_ids=( - debug_event.execution.output_tensor_device_ids or None)) + output_tensor_device_ids=(debug_event.execution.output_tensor_device_ids + or None)) -def _execution_from_debug_event_proto(debug_event, offset): +def _execution_from_debug_event_proto(debug_event, locator): """Convert a DebugEvent proto into 
an Execution data object.""" execution_proto = debug_event.execution @@ -806,7 +904,7 @@ def _execution_from_debug_event_proto(debug_event, offset): debug_tensor_values.append( _parse_tensor_value(tensor_proto, return_list=True)) return Execution( - _execution_digest_from_debug_event_proto(debug_event, offset), + _execution_digest_from_debug_event_proto(debug_event, locator), execution_proto.code_location.host_name, tuple(execution_proto.code_location.stack_frame_ids), execution_proto.tensor_debug_mode, @@ -832,7 +930,6 @@ class DebugDataReader(object): def __init__(self, dump_root): self._reader = DebugEventsReader(dump_root) - self._load_metadata() # TODO(cais): Implement pagination for memory constraints. self._execution_digests = [] @@ -858,13 +955,6 @@ class DebugDataReader(object): def _add_monitor(self, monitor): self._monitors.append(monitor) - def _load_metadata(self): - metadata_iter = self._reader.metadata_iterator() - debug_event = next(metadata_iter).debug_event - self._starting_wall_time = debug_event.wall_time - self._tensorflow_version = debug_event.debug_metadata.tensorflow_version - self._tfdbg_run_id = debug_event.debug_metadata.tfdbg_run_id - def _load_source_files(self): """Incrementally read the .source_files DebugEvent file.""" source_files_iter = self._reader.source_files_iterator() @@ -944,37 +1034,32 @@ class DebugDataReader(object): def _load_graph_execution_traces(self): """Incrementally load the .graph_execution_traces file.""" - traces_iter = self._reader.graph_execution_traces_iterator() - for debug_event, offset in traces_iter: - self._graph_execution_trace_digests.append( - self._graph_execution_trace_digest_from_debug_event_proto( - debug_event, offset)) - if self._monitors: - graph_execution_trace = ( - self._graph_execution_trace_from_debug_event_proto( - debug_event, offset)) - for monitor in self._monitors: - monitor.on_graph_execution_trace( - len(self._graph_execution_trace_digests) - 1, - graph_execution_trace) + for i, traces_iter in enumerate( + self._reader.graph_execution_traces_iterators()): + for debug_event, offset in traces_iter: + self._graph_execution_trace_digests.append( + self._graph_execution_trace_digest_from_debug_event_proto( + debug_event, (i, offset))) + if self._monitors: + graph_execution_trace = ( + self._graph_execution_trace_from_debug_event_proto( + debug_event, (i, offset))) + for monitor in self._monitors: + monitor.on_graph_execution_trace( + len(self._graph_execution_trace_digests) - 1, + graph_execution_trace) - def _graph_execution_trace_digest_from_debug_event_proto(self, - debug_event, - offset): + def _graph_execution_trace_digest_from_debug_event_proto( + self, debug_event, locator): trace_proto = debug_event.graph_execution_trace op_name = trace_proto.op_name op_type = self._lookup_op_type(trace_proto.tfdbg_context_id, op_name) return GraphExecutionTraceDigest( - debug_event.wall_time, - offset, - op_type, - op_name, + debug_event.wall_time, locator, op_type, op_name, trace_proto.output_slot, debug_event.graph_execution_trace.tfdbg_context_id) - def _graph_execution_trace_from_debug_event_proto(self, - debug_event, - offset): + def _graph_execution_trace_from_debug_event_proto(self, debug_event, locator): """Convert a DebugEvent proto into a GraphExecutionTrace data object.""" trace_proto = debug_event.graph_execution_trace graph_ids = [trace_proto.tfdbg_context_id] @@ -995,7 +1080,7 @@ class DebugDataReader(object): trace_proto.tensor_proto, return_list=True) return GraphExecutionTrace( 
self._graph_execution_trace_digest_from_debug_event_proto( - debug_event, offset), + debug_event, locator), graph_ids=graph_ids, tensor_debug_mode=trace_proto.tensor_debug_mode, debug_tensor_value=debug_tensor_value, @@ -1059,7 +1144,7 @@ class DebugDataReader(object): Returns: Stating wall time as seconds since the epoch, as a `float`. """ - return self._starting_wall_time + return self._reader.starting_wall_time() def tensorflow_version(self): """TensorFlow version used in the debugged TensorFlow program. @@ -1070,11 +1155,11 @@ class DebugDataReader(object): Returns: TensorFlow version used by the debugged program, as a `str`. """ - return self._tensorflow_version + return self._reader.tensorflow_version() def tfdbg_run_id(self): """Get the debugger run ID of the debugged TensorFlow program.""" - return self._tfdbg_run_id + return self._reader.tfdbg_run_id() def outermost_graphs(self): """Get the number of outer most graphs read so far.""" @@ -1171,9 +1256,9 @@ class DebugDataReader(object): def read_execution(self, execution_digest): """Read a detailed Execution object.""" - debug_event = self._reader.read_execution_event(execution_digest.offset) - return _execution_from_debug_event_proto( - debug_event, execution_digest.offset) + debug_event = self._reader.read_execution_event(execution_digest.locator) + return _execution_from_debug_event_proto(debug_event, + execution_digest.locator) def read_graph_execution_trace(self, graph_execution_trace_digest): """Read the detailed graph execution trace. @@ -1185,9 +1270,9 @@ class DebugDataReader(object): The corresponding `GraphExecutionTrace` object. """ debug_event = self._reader.read_graph_execution_traces_event( - graph_execution_trace_digest.offset) + graph_execution_trace_digest.locator) return self._graph_execution_trace_from_debug_event_proto( - debug_event, graph_execution_trace_digest.offset) + debug_event, graph_execution_trace_digest.locator) def read_execution_stack_trace(self, execution): """Read the stack trace of a given Execution object. @@ -1234,7 +1319,7 @@ class DebugDataReader(object): A list of numpy arrays representing the output tensor values of the execution event. """ - debug_event = self._reader.read_execution_event(execution.offset) + debug_event = self._reader.read_execution_event(execution.locator) return [_parse_tensor_value(tensor_proto) for tensor_proto in debug_event.execution.tensor_protos] @@ -1248,8 +1333,7 @@ class DebugDataReader(object): A numpy array representing the output tensor value of the intra-graph tensor execution event. 
""" - debug_event = self._reader.read_graph_execution_traces_event( - trace.offset) + debug_event = self._reader.read_graph_execution_traces_event(trace.locator) return _parse_tensor_value(debug_event.graph_execution_trace.tensor_proto) def symbolic_tensor_id(self, graph_id, op_name, output_slot): diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py index 7b06bf772be..3f3f9179e5d 100644 --- a/tensorflow/python/debug/lib/debug_events_writer_test.py +++ b/tensorflow/python/debug/lib/debug_events_writer_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import glob import json as json_lib import os +import re import threading import time @@ -264,14 +265,14 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, writer.WriteGraphExecutionTrace(trace) with debug_events_reader.DebugEventsReader(self.dump_root) as reader: - actuals = list(reader.graph_execution_traces_iterator()) + actuals = list(reader.graph_execution_traces_iterators()[0]) # Before FlushExecutionFiles() is called. No data should have been written # to the file. self.assertEmpty(actuals) writer.FlushExecutionFiles() actuals = list(item.debug_event.graph_execution_trace - for item in reader.graph_execution_traces_iterator()) + for item in reader.graph_execution_traces_iterators()[0]) self.assertLen(actuals, debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE) for i in range(debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE): self.assertEqual( @@ -291,7 +292,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, with debug_events_reader.DebugEventsReader(self.dump_root) as reader: actuals = list(item.debug_event.graph_execution_trace - for item in reader.graph_execution_traces_iterator()) + for item in reader.graph_execution_traces_iterators()[0]) self.assertLen(actuals, num_execution_events) for i in range(num_execution_events): self.assertEqual(actuals[i].op_name, "Op%d" % i) @@ -598,6 +599,86 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertEqual(traces[-1].op_name, "Op_%d" % (expected_end - 1)) +class MultiSetReaderTest(dumping_callback_test_lib.DumpingCallbackTestBase): + """Test for DebugDataReader for multiple file sets under a dump root.""" + + def testReadingTwoFileSetsWithTheSameDumpRootSucceeds(self): + # To simulate a multi-host data dump, we first generate file sets in two + # different directories, with the same tfdbg_run_id, and then combine them. + tfdbg_run_id = "foo" + for i in range(2): + writer = debug_events_writer.DebugEventsWriter( + os.path.join(self.dump_root, str(i)), + tfdbg_run_id, + circular_buffer_size=-1) + if i == 0: + debugged_graph = debug_event_pb2.DebuggedGraph( + graph_id="graph1", graph_name="graph1") + writer.WriteDebuggedGraph(debugged_graph) + op_name = "Op_0" + graph_op_creation = debug_event_pb2.GraphOpCreation( + op_type="FooOp", op_name=op_name, graph_id="graph1") + writer.WriteGraphOpCreation(graph_op_creation) + op_name = "Op_1" + graph_op_creation = debug_event_pb2.GraphOpCreation( + op_type="FooOp", op_name=op_name, graph_id="graph1") + writer.WriteGraphOpCreation(graph_op_creation) + for _ in range(10): + trace = debug_event_pb2.GraphExecutionTrace( + op_name="Op_%d" % i, tfdbg_context_id="graph1") + writer.WriteGraphExecutionTrace(trace) + writer.FlushNonExecutionFiles() + writer.FlushExecutionFiles() + + # Move all files from the subdirectory /1 to subdirectory /0. 
+ dump_root_0 = os.path.join(self.dump_root, "0") + src_paths = glob.glob(os.path.join(self.dump_root, "1", "*")) + for src_path in src_paths: + dst_path = os.path.join( + dump_root_0, + # Rename the file set to avoid file name collision. + re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path))) + os.rename(src_path, dst_path) + + with debug_events_reader.DebugDataReader(dump_root_0) as reader: + reader.update() + # Verify the content of the .graph_execution_traces file. + trace_digests = reader.graph_execution_traces(digest=True) + self.assertLen(trace_digests, 20) + for _ in range(10): + trace = reader.read_graph_execution_trace(trace_digests[i]) + self.assertEqual(trace.op_name, "Op_0") + for _ in range(10): + trace = reader.read_graph_execution_trace(trace_digests[i + 10]) + self.assertEqual(trace.op_name, "Op_1") + + def testReadingTwoFileSetsWithTheDifferentRootsLeadsToError(self): + # To simulate a multi-host data dump, we first generate file sets in two + # different directories, with different tfdbg_run_ids, and then combine + # them. + for i in range(2): + writer = debug_events_writer.DebugEventsWriter( + os.path.join(self.dump_root, str(i)), + "run_id_%d" % i, + circular_buffer_size=-1) + writer.FlushNonExecutionFiles() + writer.FlushExecutionFiles() + + # Move all files from the subdirectory /1 to subdirectory /0. + dump_root_0 = os.path.join(self.dump_root, "0") + src_paths = glob.glob(os.path.join(self.dump_root, "1", "*")) + for src_path in src_paths: + dst_path = os.path.join( + dump_root_0, + # Rename the file set to avoid file name collision. + re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path))) + os.rename(src_path, dst_path) + + with self.assertRaisesRegexp(ValueError, + r"Found multiple \(2\) tfdbg2 runs"): + debug_events_reader.DebugDataReader(dump_root_0) + + class DataObjectsTest(test_util.TensorFlowTestCase, parameterized.TestCase): def jsonRoundTripCheck(self, obj): diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index d715869f359..d70c505d3fc 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -92,16 +92,13 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): write_debug_trace(x), [9.0 + np.sqrt(3.0), 16.0 + 2.0]) with debug_events_reader.DebugEventsReader(self.dump_root) as reader: - metadata_iter = reader.metadata_iterator() # Check that the .metadata DebugEvents data file has been created, even # before FlushExecutionFiles() is called. - debug_event = next(metadata_iter).debug_event - self.assertGreater(debug_event.wall_time, 0) - self.assertTrue(debug_event.debug_metadata.tensorflow_version) - self.assertTrue( - debug_event.debug_metadata.file_version.startswith("debug.Event:")) + self.assertGreater(reader.starting_wall_time(), 0) + self.assertTrue(reader.tensorflow_version()) + self.assertTrue(reader.tfdbg_file_version().startswith("debug.Event")) - graph_trace_iter = reader.graph_execution_traces_iterator() + graph_trace_iter = reader.graph_execution_traces_iterators()[0] # Before FlushExecutionFiles() is called, the .graph_execution_traces file # ought to be empty. with self.assertRaises(StopIteration): @@ -109,7 +106,7 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): # Flush the circular buffer. 
self.writer.FlushExecutionFiles() - graph_trace_iter = reader.graph_execution_traces_iterator() + graph_trace_iter = reader.graph_execution_traces_iterators()[0] # The circular buffer has a size of 4. So only the data from the # last two iterations should have been written to self.dump_root. @@ -167,7 +164,7 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.writer.FlushExecutionFiles() with debug_events_reader.DebugEventsReader(self.dump_root) as reader: - graph_trace_iter = reader.graph_execution_traces_iterator() + graph_trace_iter = reader.graph_execution_traces_iterators()[0] try: x_values = [] timestamp = 0 @@ -216,7 +213,7 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): for debug_root in (self.dump_root, another_dump_root): with debug_events_reader.DebugEventsReader(debug_root) as reader: - graph_trace_iter = reader.graph_execution_traces_iterator() + graph_trace_iter = reader.graph_execution_traces_iterators()[0] debug_event = next(graph_trace_iter).debug_event trace = debug_event.graph_execution_trace @@ -272,7 +269,7 @@ class DebugIdentityV2OpUninitializedWriterTest( writer.FlushExecutionFiles() with debug_events_reader.DebugEventsReader(self.dump_root) as reader: - graph_trace_iter = reader.graph_execution_traces_iterator() + graph_trace_iter = reader.graph_execution_traces_iterators()[0] graph_execution_traces = [] while True: try: diff --git a/tensorflow/python/debug/lib/dumping_callback_test_lib.py b/tensorflow/python/debug/lib/dumping_callback_test_lib.py index 05bf3aeb6da..e58ffdbd79f 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test_lib.py +++ b/tensorflow/python/debug/lib/dumping_callback_test_lib.py @@ -48,7 +48,6 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): def _readAndCheckMetadataFile(self): """Read and check the .metadata debug-events file.""" with debug_events_reader.DebugEventsReader(self.dump_root) as reader: - metadata_iter = reader.metadata_iterator() - metadata = next(metadata_iter).debug_event.debug_metadata - self.assertEqual(metadata.tensorflow_version, versions.__version__) - self.assertTrue(metadata.file_version.startswith("debug.Event")) + self.assertTrue(reader.tfdbg_run_id()) + self.assertEqual(reader.tensorflow_version(), versions.__version__) + self.assertTrue(reader.tfdbg_file_version().startswith("debug.Event")) From c692c45daef9e4a1d5937f705fe8a3977b04fddd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jun 2020 17:10:15 -0700 Subject: [PATCH 0842/1390] tf numpy: some changes to ndarray constructor logic. PiperOrigin-RevId: 317765968 Change-Id: Iea4338ad18707ff36fc49b450d0defad5c13a6a2 --- tensorflow/python/ops/numpy_ops/np_arrays.py | 7 +++---- tensorflow/python/ops/numpy_ops/np_arrays_test.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py index eca84421d1b..77157544e8f 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays.py @@ -141,13 +141,12 @@ class ndarray(composite_tensor.CompositeTensor): # pylint: disable=invalid-name raise ValueError('Unexpected type for `buffer` {}. Must be an ndarray,' ' Tensor or np.ndarray.'.format(type(buffer))) - if shape is not None and tuple(shape) != buffer._shape_tuple(): # pylint: disable=protected-access - # TODO(srbs): NumPy allows this. Investigate if/how to support this. 
- raise ValueError('shape arg must match buffer.shape.') + if shape is not None: + buffer.set_shape(shape) assert isinstance(buffer, ops.Tensor) if dtype and dtype != buffer.dtype: - buffer = array_ops.bitcast(buffer, dtype) + buffer = math_ops.cast(buffer, dtype) self._data = buffer self._type_spec_internal = None diff --git a/tensorflow/python/ops/numpy_ops/np_arrays_test.py b/tensorflow/python/ops/numpy_ops/np_arrays_test.py index 412addc0ad7..ab407d2bfcf 100644 --- a/tensorflow/python/ops/numpy_ops/np_arrays_test.py +++ b/tensorflow/python/ops/numpy_ops/np_arrays_test.py @@ -22,6 +22,7 @@ import collections import numpy as np +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -51,6 +52,19 @@ class ArrayTest(test.TestCase): self.assertIs(a.dtype.type, np.bool_) self.assertAllEqual([False, True], a) + def testConstructor(self): + t = constant_op.constant([[1], [1]]) + a = np_arrays.ndarray(shape=(2, 1), buffer=t) + self.assertAllEqual(t, a) + self.assertEqual(dtypes.float64, a.dtype) + + a = np_arrays.ndarray(shape=(2, 1), dtype=dtypes.int32, buffer=t) + self.assertAllEqual(t, a) + self.assertEqual(dtypes.int32, a.dtype) + + with self.assertRaises(ValueError): # bad shape + _ = np_arrays.ndarray((2, 2), buffer=t) + def testNeg(self): a = t2a(ops.convert_to_tensor(value=[1.0, 2.0])) self.assertAllEqual([-1.0, -2.0], -a) From fe6e64b09853ac9dbb234ce9b7d4b10da11c7fe9 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Mon, 22 Jun 2020 17:15:42 -0700 Subject: [PATCH 0843/1390] Refactor eager placement logic into three util methods: - MaybePinSmallOpsToCpu - MaybePinToResourceDevice - MaybePinToCustomDevice We are going to reuse MaybePinSmallOpsToCpu in TFRT but not the other two, because TFRT has neither native Resources nor Custom Devices.
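For orientation, a condensed sketch of the placement order this patch establishes in EagerOperation::Execute, simplified from the core.cc hunk below (here `args` is shorthand for the operation's input handles, not an identifier introduced by the patch): custom devices are consulted first, then resource-touching inputs, then the small-integer-tensor CPU-pinning heuristic.

    // Simplified: each helper is tried only if no earlier one picked a device.
    VariantDevice device;
    TF_RETURN_IF_ERROR(eager::MaybePinToCustomDevice(&device, *this));
    if (device == kVariantDeviceNull) {
      TF_RETURN_IF_ERROR(eager::MaybePinToResourceDevice(&device, *this));
    }
    if (device == kVariantDeviceNull) {
      bool pin_to_cpu;
      TF_RETURN_IF_ERROR(
          eager::MaybePinSmallOpsToCpu(&pin_to_cpu, op_name(), args, ctx_));
      if (pin_to_cpu) device = ctx_.HostCPU();
    }
    if (device != kVariantDeviceNull) SetDevice(device);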
PiperOrigin-RevId: 317766813 Change-Id: I43241b5786120ddf39dc4bfff6071239afdfd785 --- tensorflow/core/common_runtime/eager/BUILD | 31 +++ .../core/common_runtime/eager/context.h | 2 +- tensorflow/core/common_runtime/eager/core.cc | 22 ++ .../common_runtime/eager/eager_operation.h | 2 +- .../core/common_runtime/eager/execute.cc | 169 ------------- .../common_runtime/eager/placement_utils.cc | 228 ++++++++++++++++++ .../common_runtime/eager/placement_utils.h | 55 +++++ .../eager/eager_service_impl.cc | 6 +- tensorflow/lite/delegates/flex/kernel.cc | 9 +- 9 files changed, 349 insertions(+), 175 deletions(-) create mode 100644 tensorflow/core/common_runtime/eager/placement_utils.cc create mode 100644 tensorflow/core/common_runtime/eager/placement_utils.h diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index fb69bcb7ab5..911b59eed17 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -29,6 +29,7 @@ tf_cuda_library( ":context", ":eager_operation", ":execute", + ":placement_utils", ":tensor_handle", "//tensorflow/c:c_api_internal", "//tensorflow/c:tf_tensor_internal", @@ -489,6 +490,7 @@ cc_library( ":eager_op_rewrite_registry", ":eager_operation", ":kernel_and_device", + ":placement_utils", ":tensor_handle", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:inlined_vector", @@ -521,6 +523,35 @@ cc_library( }), ) +tf_cuda_library( + name = "placement_utils", + srcs = [ + "placement_utils.cc", + ], + hdrs = [ + "placement_utils.h", + ], + visibility = ["//tensorflow:internal"], + deps = [ + ":context", + ":attr_builder", + ":eager_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + ] + select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + ], + }), +) + tf_cuda_library( name = "attr_builder", srcs = ["attr_builder.cc"], diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index c16e1f0f4ad..e6769279558 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -478,7 +478,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // On mobile, it just cleans the caches. void WaitForAndCloseRemoteContexts(); - bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; } + bool PinSmallOpsToCPU() const { return pin_small_ops_to_cpu_; } tensorflow::Env* TFEnv() const { return env_; } diff --git a/tensorflow/core/common_runtime/eager/core.cc b/tensorflow/core/common_runtime/eager/core.cc index 3d37250a4fe..77d2b665f5e 100644 --- a/tensorflow/core/common_runtime/eager/core.cc +++ b/tensorflow/core/common_runtime/eager/core.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/placement_utils.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/platform/errors.h" @@ -187,6 +188,27 @@ Status EagerContext::RegisterFunction(AbstractFunction* f) { // eager_operation.cc we can avoid a circular dependency between them. Status EagerOperation::Execute(absl::Span retvals, int* num_retvals) { + // Run eager placement logic. + VariantDevice device; + TF_RETURN_IF_ERROR(eager::MaybePinToCustomDevice(&device, *this)); + if (device == kVariantDeviceNull) { + TF_RETURN_IF_ERROR(eager::MaybePinToResourceDevice(&device, *this)); + } + if (device == kVariantDeviceNull) { + bool pin_to_cpu; + TF_RETURN_IF_ERROR(eager::MaybePinSmallOpsToCpu( + &pin_to_cpu, op_name(), + absl::MakeSpan( + reinterpret_cast(inputs_.data()), + inputs_.size()), + ctx_)); + if (pin_to_cpu) { + device = ctx_.HostCPU(); + } + } + if (device != kVariantDeviceNull) { + SetDevice(device); + } return EagerExecute( this, reinterpret_cast(retvals.data()), num_retvals); diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index 963aed25733..fa245649d5c 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -126,7 +126,7 @@ class EagerOperation : public ImmediateExecutionOperation { bool is_function() const { return is_function_; } bool colocation_exempt() const { return colocation_exempt_; } - tensorflow::EagerContext& EagerContext() { return ctx_; } + tensorflow::EagerContext& EagerContext() const { return ctx_; } AttrBuilder* MutableAttrs() { return &attrs_; } const AttrBuilder& Attrs() const { return attrs_; } diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index a94c882b3b3..a030f4d0356 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -870,173 +870,6 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } #endif // IS_MOBILE_PLATFORM -// These ops are not pinnable since they generate data. It can be slower to -// generate and then copy the data instead of just generating the data on the -// device directly. -bool IsPinnableOp(const string& op_type) { - static const gtl::FlatSet* unpinnable_ops = new gtl::FlatSet({ - "RandomUniform", - "RandomUniformInt", - "RandomStandardNormal", - "StatelessRandomUniform", - "StatelessRandomUniformInt", - "StatelessRandomUniformFullInt", - "StatelessRandomNormal", - }); - - // XRT ops refer to per-device handles that are not safe to move between - // devices. - return unpinnable_ops->find(op_type) == unpinnable_ops->end() && - !absl::StartsWith(op_type, "XRT"); -} - -// Validate if the remote device with the given incarnation is valid in the -// remote device manager of the current eager context. -Status ValidateTensorHandleRemoteDevice(EagerContext* ctx, - int64 device_incarnation) { - if (ctx->remote_device_mgr()->ContainsDevice(device_incarnation)) { - return Status::OK(); - } - return errors::InvalidArgument( - "Resource input tensor contains an invalid device. 
This might happen " - "when the client has connected to a different cluster, or some remote " - "workers have been restarted."); -} - -// The Op device may be updated if: -// - A resource touching input is specified: all resource-touching ops run in -// the device the resource is, regardless of anything else that has been -// specified. This is identical to the graph mode behavior. -// -// - All op inputs are on the CPU, small (<64 elements) and integers -// (int32/int64). This can be disabled by setting the environment variable -// "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false". -// -// TODO(b/154234908): Unify placement logic. -Status MaybeUpdateOpDevice(EagerOperation* op) { - // If operation was already placed on a custom device, use it. - if (VariantDeviceIsCustom(op->Device())) { - return Status::OK(); - } - - // If all the inputs are on the same custom device, use that custom - // device. Otherwise, it is an error to have a custom device as an input. - if (!op->Inputs().empty()) { - // We keep track of what we've seen with devices instead of booleans to be - // able to provide a meaningful error message below. - VariantDevice first = op->Inputs()[0]->device(); - VariantDevice different = first; // A different input device, if any. - VariantDevice custom = first; // The first custom device seen, or an - // arbitrary non-custom device otherwise. - for (size_t i = 1; first == different && i < op->Inputs().size(); ++i) { - VariantDevice device = op->Inputs()[i]->device(); - if (device != first) { - different = device; - } - if (!VariantDeviceIsCustom(custom) && VariantDeviceIsCustom(device)) { - custom = device; - } - if (different != first && VariantDeviceIsCustom(custom)) { - return errors::InvalidArgument(absl::StrCat( - "If an operation has one of its inputs in a custom device, then " - "all inputs should be on that same device. Operation ", - op->Name(), " has one input in custom device ", - VariantDeviceName(custom), - " and at least one input in a different device ", - VariantDeviceName(custom == first ? different : first))); - } - } - if (different == first && VariantDeviceIsCustom(custom)) { - op->SetDevice(first); - return Status::OK(); - } - } - - if (op->colocation_exempt()) { - return Status::OK(); - } - EagerContext& ctx = op->EagerContext(); - bool all_inputs_eligible_for_cpu_pinning = - ctx.PinSmallOpsToCPU() && !op->is_function() && IsPinnableOp(op->Name()); - Device* op_device = op->Device() == kVariantDeviceNull - ? ctx.HostCPU() - : absl::get(op->Device()); - for (int i = 0; i < op->Inputs().size(); ++i) { - TensorHandle* tensor_handle = op->Inputs()[i]; - if (tensor_handle->dtype == DT_RESOURCE) { - if (tensor_handle->resource_remote_device_incarnation() != 0) { - TF_RETURN_IF_ERROR(ValidateTensorHandleRemoteDevice( - &ctx, tensor_handle->resource_remote_device_incarnation())); - } - Device* resource_device = tensor_handle->resource_device(); - DVLOG(2) << "for op " << op->Name() << " input " << i << " " - << DataTypeString(tensor_handle->dtype) - << " input device = " << resource_device->name() - << ", op device = " << op_device->name(); - // We check for `op->Device() == nullptr` because it can be later - // interpreted as unspecified device and a different device can - // be selected based on device priority. If any input to an op - // is a resource we must pin it to prevent different device selection. - // TODO(iga): null device can mean "unspecified" or "CPU". Clean this up. 
- if (resource_device != op_device || op->Device() == kVariantDeviceNull) { - DVLOG(1) << (resource_device != op_device ? "Changing " : "Setting ") - << "device of operation " << op->Name() << " to " - << resource_device->name() << " because input #" << i - << " is a resource in this device."; - op->SetDevice(resource_device); - } - all_inputs_eligible_for_cpu_pinning = false; - // No point in looking at other inputs. If there are other resources, - // they must have the same device and we already declared the op to be - // ineligible for CPU pinning. - break; - } else if (all_inputs_eligible_for_cpu_pinning) { - auto input_device_variant = tensor_handle->DeviceOrHostCPU(ctx); - if (VariantDeviceIsCustom(input_device_variant)) { - all_inputs_eligible_for_cpu_pinning = false; - continue; - } - Device* input_device = absl::get(input_device_variant); - DVLOG(2) << "for op " << op->Name() << " input " << i << " " - << DataTypeString(tensor_handle->dtype) - << " input device = " << input_device->name() - << ", op device = " << op_device->name(); - - // Input is on CPU. - if (input_device != ctx.HostCPU()) { - all_inputs_eligible_for_cpu_pinning = false; - continue; - } - - if (tensor_handle->dtype != DataType::DT_INT32 && - tensor_handle->dtype != DataType::DT_INT64) { - all_inputs_eligible_for_cpu_pinning = false; - continue; - } - - int64 num_elements; - TF_RETURN_IF_ERROR(tensor_handle->NumElements(&num_elements)); - if (num_elements > 64) { - all_inputs_eligible_for_cpu_pinning = false; - } - } - } - - // Ops without inputs are usually ops that generate a tensor in some way and - // usually require being present on whatever device they are scheduled on - // - for e.g. VarHandleOp or _Recv). - // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for - // an op, but there is a GPU kernel? - if (!op->Inputs().empty() && all_inputs_eligible_for_cpu_pinning) { - DVLOG(1) << "Forcing op " << op->Name() - << " to be on the CPU since all input tensors have an " - "int32/int64 dtype, and are small (less than 64 elements)."; - op->SetDevice(ctx.HostCPU()); - } - - return Status::OK(); -} - Status GetKernelOutputs(std::vector* outputs, int num_outputs, TensorHandle** retvals, EagerContext* ctx, KernelAndDevice* kernel) { @@ -1099,8 +932,6 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, [&] { return absl::StrCat("EagerExecute: ", op->Name()); }, profiler::TraceMeLevel::kInfo); - TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op)); - if (VariantDeviceIsCustom(op->Device())) { return absl::get(op->Device()) ->Execute(op, retvals, num_retvals); diff --git a/tensorflow/core/common_runtime/eager/placement_utils.cc b/tensorflow/core/common_runtime/eager/placement_utils.cc new file mode 100644 index 00000000000..8898516612f --- /dev/null +++ b/tensorflow/core/common_runtime/eager/placement_utils.cc @@ -0,0 +1,228 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/eager/placement_utils.h" + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/input_colocation_exemption_registry.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace eager { + +// These ops are not pinnable since they generate data. It can be slower to +// generate and then copy the data instead of just generating the data on the +// device directly. +static bool IsPinnableOp(StringPiece op_name) { + static const gtl::FlatSet* unpinnable_ops = new gtl::FlatSet({ + "RandomUniform", + "RandomUniformInt", + "RandomStandardNormal", + "StatelessRandomUniform", + "StatelessRandomUniformInt", + "StatelessRandomUniformFullInt", + "StatelessRandomNormal", + }); + + // XRT ops refer to per-device handles that are not safe to move between + // devices. + return unpinnable_ops->find(string(op_name)) == unpinnable_ops->end() && + !absl::StartsWith(op_name, "XRT"); +} +// Validate if the remote device with the given incarnation is valid in the +// remote device manager of the current eager context. +static Status ValidateTensorHandleRemoteDevice(EagerContext* ctx, + int64 device_incarnation) { + if (ctx->remote_device_mgr()->ContainsDevice(device_incarnation)) { + return Status::OK(); + } + return errors::InvalidArgument( + "Resource input tensor contains an invalid device. This might happen " + "when the client has connected to a different cluster, or some remote " + "workers have been restarted."); +} + +bool IsColocationExempt(StringPiece op_name) { + const auto& exempt_ops = InputColocationExemptionRegistry::Global()->Get(); + return exempt_ops.find(string(op_name)) != exempt_ops.end(); +} + +bool IsFunction(StringPiece op_name) { + const OpDef* op_def = nullptr; + Status s = OpDefForOp(string(op_name), &op_def); + if (!s.ok()) { + if (!errors::IsNotFound(s)) { + LOG(WARNING) << "Looking up OpDef failed with error: " << s.ToString(); + } + // Cannot find OpDef, it is a function. + return true; + } + return false; +} + +bool IsCustomDevice(StringPiece device_name, const EagerContext& ctx) { + CustomDevice* custom_device; + return ctx.FindCustomDeviceFromName(string(device_name), &custom_device).ok(); +} + +Status MaybePinSmallOpsToCpu(bool* result, StringPiece op_name, + absl::Span args, + const EagerContext& ctx) { + if (!ctx.PinSmallOpsToCPU() || IsFunction(op_name) || + IsColocationExempt(op_name) || !IsPinnableOp(op_name)) { + *result = false; + return Status::OK(); + } + + // Ops without inputs are usually ops that generate a tensor in some way and + // usually require being present on whatever device they are scheduled on + // - for e.g. VarHandleOp or _Recv). + if (args.empty()) { + *result = false; + return Status::OK(); + } + + int i = 0; + for (auto* arg : args) { + Status s; + const char* device_name = arg->DeviceName(&s); + DataType dtype = arg->DataType(); + TF_RETURN_IF_ERROR(s); + if (IsCustomDevice(device_name, ctx)) { + *result = false; + return Status::OK(); + } + + DVLOG(2) << "for op " << op_name << " input " << i << " " + << DataTypeString(dtype) << " input device = " << device_name; + + // Input is on CPU. 
+ if (device_name != ctx.HostCPU()->name()) { + *result = false; + return Status::OK(); + } + + if (dtype != DataType::DT_INT32 && dtype != DataType::DT_INT64) { + *result = false; + return Status::OK(); + } + + int64 num_elements; + TF_RETURN_IF_ERROR(arg->NumElements(&num_elements)); + if (num_elements > 64) { + *result = false; + return Status::OK(); + } + i++; + } + + // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for + // an op, but there is a GPU kernel? + DVLOG(1) << "Forcing op " << op_name + << " to be on the CPU since all input tensors have an " + "int32/int64 dtype, and are small (less than 64 elements)."; + *result = true; + return Status::OK(); +} + +Status MaybePinToResourceDevice(VariantDevice* device, + const EagerOperation& op) { + if (op.colocation_exempt()) { + return Status::OK(); + } + EagerContext& ctx = op.EagerContext(); + Device* op_device = op.Device() == kVariantDeviceNull + ? ctx.HostCPU() + : absl::get(op.Device()); + for (int i = 0; i < op.Inputs().size(); ++i) { + TensorHandle* tensor_handle = op.Inputs()[i]; + if (tensor_handle->dtype == DT_RESOURCE) { + if (tensor_handle->resource_remote_device_incarnation() != 0) { + TF_RETURN_IF_ERROR(ValidateTensorHandleRemoteDevice( + &ctx, tensor_handle->resource_remote_device_incarnation())); + } + Device* resource_device = tensor_handle->resource_device(); + DVLOG(2) << "for op " << op.Name() << " input " << i << " " + << DataTypeString(tensor_handle->dtype) + << " input device = " << resource_device->name() + << ", op device = " << op_device->name(); + // We check for `op->Device() == nullptr` because it can be later + // interpreted as unspecified device and a different device can + // be selected based on device priority. If any input to an op + // is a resource we must pin it to prevent different device selection. + // TODO(iga): null device can mean "unspecified" or "CPU". Clean this up. + if (resource_device != op_device || op.Device() == kVariantDeviceNull) { + DVLOG(1) << (resource_device != op_device ? "Changing " : "Setting ") + << "device of operation " << op.Name() << " to " + << resource_device->name() << " because input #" << i + << " is a resource in this device."; + *device = resource_device; + return Status::OK(); + // No point in looking at other inputs. If there are other resources, + // they must have the same device and we already declared the op to be + // ineligible for CPU pinning. + } + } + } + return Status::OK(); +} + +Status MaybePinToCustomDevice(VariantDevice* device, const EagerOperation& op) { + // If operation was already placed on a custom device, use it. + if (VariantDeviceIsCustom(op.Device())) { + *device = op.Device(); + return Status::OK(); + } + + if (!op.Inputs().empty()) { + // We keep track of what we've seen with devices instead of booleans to be + // able to provide a meaningful error message below. + VariantDevice first = op.Inputs()[0]->device(); + VariantDevice different = first; // A different input device, if any. + VariantDevice custom = first; // The first custom device seen, or an + // arbitrary non-custom device otherwise. 
+ for (size_t i = 1; first == different && i < op.Inputs().size(); ++i) { + VariantDevice device = op.Inputs()[i]->device(); + if (device != first) { + different = device; + } + if (!VariantDeviceIsCustom(custom) && VariantDeviceIsCustom(device)) { + custom = device; + } + if (different != first && VariantDeviceIsCustom(custom)) { + return errors::InvalidArgument(absl::StrCat( + "If an operation has one of its inputs in a custom device, then " + "all inputs should be on that same device. Operation ", + op.Name(), " has one input in custom device ", + VariantDeviceName(custom), + " and at least one input in a different device ", + VariantDeviceName(custom == first ? different : first))); + } + } + if (different == first && VariantDeviceIsCustom(custom)) { + *device = first; + return Status::OK(); + } + } + + return Status::OK(); +} + +} // namespace eager +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/placement_utils.h b/tensorflow/core/common_runtime/eager/placement_utils.h new file mode 100644 index 00000000000..d58bd304b27 --- /dev/null +++ b/tensorflow/core/common_runtime/eager/placement_utils.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ + +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" + +namespace tensorflow { +namespace eager { + +bool IsColocationExempt(StringPiece op_name); + +bool IsFunction(StringPiece op_name); + +bool IsCustomDevice(StringPiece device_name, const EagerContext& ctx); + +// TODO(b/154234908): Unify placement logic. +// TODO(b/159647422): Add C++ unit tests for placement logic. + +// Pin the op to cpu if all op inputs are on the CPU, small (<64 elements) and +// integers (int32/int64). This can be disabled by setting the environment +// variable "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false". +Status MaybePinSmallOpsToCpu(bool* result, StringPiece op_name, + absl::Span args, + const EagerContext& ctx); + +// If a resource touching input is specified, all resource-touching ops run in +// the device the resource is, regardless of anything else that has been +// specified. This is identical to the graph mode behavior. +Status MaybePinToResourceDevice(VariantDevice* device, + const EagerOperation& op); + +// If all the inputs are on the same custom device, use that custom +// device. Otherwise, it is an error to have a custom device as an input. 
+Status MaybePinToCustomDevice(VariantDevice* device, const EagerOperation& op); + +} // namespace eager +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PLACEMENT_UTILS_H_ diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index fd0606538c4..4735ff6eaf6 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -491,7 +491,11 @@ Status EagerServiceImpl::ExecuteOp(const Operation& operation, absl::FixedArray retvals(num_retvals); VLOG(3) << "ServerContext: Calling EagerExecute for op " << operation.id(); - TF_RETURN_IF_ERROR(EagerExecute(&op, retvals.data(), &num_retvals)); + TF_RETURN_IF_ERROR(op.Execute( + absl::MakeSpan( + reinterpret_cast(retvals.data()), + num_retvals), + &num_retvals)); return AddOpRetvalsToResponse( eager_context, operation.id(), num_retvals, retvals.data(), diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index e7705ecf3ce..b3e978908bd 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -331,9 +331,12 @@ tensorflow::Status ExecuteFlexOp(TfLiteContext* context, BufferMap* buffer_map, node_data->mutable_outputs()->ResetTensorHandles(); int num_retvals = node_data->NumOutputs(); TF_RETURN_WITH_CONTEXT_IF_ERROR( - EagerExecute(node_data->op(), - node_data->mutable_outputs()->GetTensorHandles()->data(), - &num_retvals), + node_data->op()->Execute( + absl::MakeSpan( + reinterpret_cast( + node_data->mutable_outputs()->GetTensorHandles()->data()), + num_retvals), + &num_retvals), " (while executing '", node_data->name(), "' via Eager)"); if (num_retvals != node_data->NumOutputs()) { From 34b4fab30a40efbdd22c5fb4532e818c4d8e441d Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Mon, 22 Jun 2020 17:39:59 -0700 Subject: [PATCH 0844/1390] Remove deprecated AddBuiltin API from MicroMutableOpResolver. * Added new API hooks for all the OPs currently supported in TFLM. * These new APIs still need to be implemented with operator specific parse functions but this change allows us to remove the old API and incrementally update the implementations. PiperOrigin-RevId: 317770205 Change-Id: Idaaa687401f2bac5fbf9925e27c04bf536b154ea --- tensorflow/lite/micro/all_ops_resolver.cc | 120 +++---- .../image_recognition_test.cc | 12 +- .../image_recognition_experimental/main.cc | 12 +- .../examples/magic_wand/magic_wand_test.cc | 16 +- .../examples/magic_wand/main_functions.cc | 16 +- .../examples/micro_speech/main_functions.cc | 16 +- .../micro_speech/micro_speech_test.cc | 13 +- .../person_detection/main_functions.cc | 10 +- .../person_detection/person_detection_test.cc | 10 +- .../main_functions.cc | 16 +- .../person_detection_test.cc | 16 +- .../lite/micro/micro_mutable_op_resolver.h | 332 ++++++++++++++++-- .../micro/micro_mutable_op_resolver_test.cc | 33 +- 13 files changed, 408 insertions(+), 214 deletions(-) diff --git a/tensorflow/lite/micro/all_ops_resolver.cc b/tensorflow/lite/micro/all_ops_resolver.cc index b0021a2e771..e728a95360a 100644 --- a/tensorflow/lite/micro/all_ops_resolver.cc +++ b/tensorflow/lite/micro/all_ops_resolver.cc @@ -26,74 +26,60 @@ const char* GetString_ETHOSU(); AllOpsResolver::AllOpsResolver() { // Please keep this list of Builtin Operators in alphabetical order. 
- AddBuiltin(BuiltinOperator_ABS, tflite::ops::micro::Register_ABS()); - AddBuiltin(BuiltinOperator_ADD, tflite::ops::micro::Register_ADD()); - AddBuiltin(BuiltinOperator_ARG_MAX, tflite::ops::micro::Register_ARG_MAX()); - AddBuiltin(BuiltinOperator_ARG_MIN, tflite::ops::micro::Register_ARG_MIN()); - AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); - AddBuiltin(BuiltinOperator_CEIL, tflite::ops::micro::Register_CEIL()); - AddBuiltin(BuiltinOperator_CONCATENATION, - tflite::ops::micro::Register_CONCATENATION()); - AddBuiltin(BuiltinOperator_CONV_2D, tflite::ops::micro::Register_CONV_2D()); - AddBuiltin(BuiltinOperator_COS, tflite::ops::micro::Register_COS()); - AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - AddBuiltin(BuiltinOperator_DEQUANTIZE, - tflite::ops::micro::Register_DEQUANTIZE()); - AddBuiltin(BuiltinOperator_EQUAL, tflite::ops::micro::Register_EQUAL()); - AddBuiltin(BuiltinOperator_FLOOR, tflite::ops::micro::Register_FLOOR()); - AddBuiltin(BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - AddBuiltin(BuiltinOperator_GREATER, tflite::ops::micro::Register_GREATER()); - AddBuiltin(BuiltinOperator_GREATER_EQUAL, - tflite::ops::micro::Register_GREATER_EQUAL()); - AddBuiltin(BuiltinOperator_L2_NORMALIZATION, - tflite::ops::micro::Register_L2_NORMALIZATION()); - AddBuiltin(BuiltinOperator_LESS, tflite::ops::micro::Register_LESS()); - AddBuiltin(BuiltinOperator_LESS_EQUAL, - tflite::ops::micro::Register_LESS_EQUAL()); - AddBuiltin(BuiltinOperator_LOG, tflite::ops::micro::Register_LOG()); - AddBuiltin(BuiltinOperator_LOGICAL_AND, - tflite::ops::micro::Register_LOGICAL_AND()); - AddBuiltin(BuiltinOperator_LOGICAL_NOT, - tflite::ops::micro::Register_LOGICAL_NOT()); - AddBuiltin(BuiltinOperator_LOGICAL_OR, - tflite::ops::micro::Register_LOGICAL_OR()); - AddBuiltin(BuiltinOperator_LOGISTIC, tflite::ops::micro::Register_LOGISTIC()); - AddBuiltin(BuiltinOperator_MAX_POOL_2D, - tflite::ops::micro::Register_MAX_POOL_2D()); - AddBuiltin(BuiltinOperator_MAXIMUM, tflite::ops::micro::Register_MAXIMUM()); - AddBuiltin(BuiltinOperator_MEAN, tflite::ops::micro::Register_MEAN()); - AddBuiltin(BuiltinOperator_MINIMUM, tflite::ops::micro::Register_MINIMUM()); - AddBuiltin(BuiltinOperator_MUL, tflite::ops::micro::Register_MUL()); - AddBuiltin(BuiltinOperator_NEG, tflite::ops::micro::Register_NEG()); - AddBuiltin(BuiltinOperator_NOT_EQUAL, - tflite::ops::micro::Register_NOT_EQUAL()); - AddBuiltin(BuiltinOperator_PACK, tflite::ops::micro::Register_PACK()); - AddBuiltin(BuiltinOperator_PAD, tflite::ops::micro::Register_PAD()); - AddBuiltin(BuiltinOperator_PADV2, tflite::ops::micro::Register_PADV2()); - AddBuiltin(BuiltinOperator_PRELU, tflite::ops::micro::Register_PRELU()); - AddBuiltin(BuiltinOperator_QUANTIZE, tflite::ops::micro::Register_QUANTIZE()); - AddBuiltin(BuiltinOperator_RELU, tflite::ops::micro::Register_RELU()); - AddBuiltin(BuiltinOperator_RELU6, tflite::ops::micro::Register_RELU6()); - AddBuiltin(BuiltinOperator_RESHAPE, tflite::ops::micro::Register_RESHAPE()); - AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, - tflite::ops::micro::Register_RESIZE_NEAREST_NEIGHBOR()); - AddBuiltin(BuiltinOperator_ROUND, tflite::ops::micro::Register_ROUND()); - AddBuiltin(BuiltinOperator_RSQRT, tflite::ops::micro::Register_RSQRT()); - AddBuiltin(BuiltinOperator_SIN, tflite::ops::micro::Register_SIN()); - AddBuiltin(BuiltinOperator_SOFTMAX, 
tflite::ops::micro::Register_SOFTMAX()); - AddBuiltin(BuiltinOperator_SPLIT, tflite::ops::micro::Register_SPLIT()); - AddBuiltin(BuiltinOperator_SQRT, tflite::ops::micro::Register_SQRT()); - AddBuiltin(BuiltinOperator_SQUARE, tflite::ops::micro::Register_SQUARE()); - AddBuiltin(BuiltinOperator_STRIDED_SLICE, - tflite::ops::micro::Register_STRIDED_SLICE()); - AddBuiltin(BuiltinOperator_SUB, tflite::ops::micro::Register_SUB()); - AddBuiltin(BuiltinOperator_SVDF, tflite::ops::micro::Register_SVDF()); - AddBuiltin(BuiltinOperator_TANH, tflite::ops::micro::Register_TANH()); - AddBuiltin(BuiltinOperator_UNPACK, tflite::ops::micro::Register_UNPACK()); + AddAbs(); + AddAdd(); + AddArgMax(); + AddArgMin(); + AddAveragePool2D(); + AddCeil(); + AddConcatenation(); + AddConv2D(); + AddCos(); + AddDepthwiseConv2D(); + AddDequantize(); + AddEqual(); + AddFloor(); + AddFullyConnected(); + AddGreater(); + AddGreaterEqual(); + AddL2Normalization(); + AddLess(); + AddLessEqual(); + AddLog(); + AddLogicalAnd(); + AddLogicalNot(); + AddLogicalOr(); + AddLogistic(); + AddMaximum(); + AddMaxPool2D(); + AddMean(); + AddMinimum(); + AddMul(); + AddNeg(); + AddNotEqual(); + AddPack(); + AddPad(); + AddPadV2(); + AddPrelu(); + AddQuantize(); + AddRelu(); + AddRelu6(); + AddReshape(); + AddResizeNearestNeighbor(); + AddRound(); + AddRsqrt(); + AddSin(); + AddSoftmax(); + AddSplit(); + AddSqrt(); + AddSquare(); + AddStridedSlice(); + AddSub(); + AddSvdf(); + AddTanh(); + AddUnpack(); + // TODO(b/159644355): Figure out if custom Ops belong in AllOpsResolver. TfLiteRegistration* registration = tflite::ops::micro::custom::Register_ETHOSU(); if (registration) { diff --git a/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc b/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc index ac4de118834..5ad2fb2acbe 100644 --- a/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc +++ b/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc @@ -44,14 +44,10 @@ TF_LITE_MICRO_TEST(TestImageRecognitionInvoke) { tflite::MicroMutableOpResolver<4> micro_op_resolver; - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_MAX_POOL_2D, - tflite::ops::micro::Register_MAX_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddMaxPool2D(); + micro_op_resolver.AddFullyConnected(); + micro_op_resolver.AddSoftmax(); const int tensor_arena_size = 50 * 1024; uint8_t tensor_arena[tensor_arena_size]; diff --git a/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc b/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc index becdbdf1bd7..fcf7b41b827 100644 --- a/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc +++ b/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc @@ -58,14 +58,10 @@ int main(int argc, char** argv) { tflite::MicroMutableOpResolver<4> micro_op_resolver; - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_MAX_POOL_2D, - tflite::ops::micro::Register_MAX_POOL_2D()); - 
micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddFullyConnected(); + micro_op_resolver.AddMaxPool2D(); + micro_op_resolver.AddSoftmax(); constexpr int tensor_arena_size = 50 * 1024; uint8_t tensor_arena[tensor_arena_size]; diff --git a/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc b/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc index 88bfad860e2..fb75afee309 100644 --- a/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc +++ b/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc @@ -47,17 +47,11 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { // incur some penalty in code space for op implementations that are not // needed by this graph. static tflite::MicroMutableOpResolver<5> micro_op_resolver; // NOLINT - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_MAX_POOL_2D, - tflite::ops::micro::Register_MAX_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); + micro_op_resolver.AddFullyConnected(); + micro_op_resolver.AddMaxPool2D(); + micro_op_resolver.AddSoftmax(); // Create an area of memory to use for input, output, and intermediate arrays. // Finding the minimum value for your model may require some trial and error. diff --git a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc index 26c2eb44747..8defeaad866 100644 --- a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc +++ b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc @@ -66,17 +66,11 @@ void setup() { // incur some penalty in code space for op implementations that are not // needed by this graph. static tflite::MicroMutableOpResolver<5> micro_op_resolver; // NOLINT - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_MAX_POOL_2D, - tflite::ops::micro::Register_MAX_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); + micro_op_resolver.AddFullyConnected(); + micro_op_resolver.AddMaxPool2D(); + micro_op_resolver.AddSoftmax(); // Build an interpreter to run the model with. 
static tflite::MicroInterpreter static_interpreter( diff --git a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc index 30c5022b2d6..d09c4c7af06 100644 --- a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc +++ b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc @@ -75,24 +75,16 @@ void setup() { // tflite::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) static tflite::MicroMutableOpResolver<4> micro_op_resolver(error_reporter); - if (micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()) != kTfLiteOk) { + if (micro_op_resolver.AddDepthwiseConv2D() != kTfLiteOk) { return; } - if (micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()) != kTfLiteOk) { + if (micro_op_resolver.AddFullyConnected() != kTfLiteOk) { return; } - if (micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()) != - kTfLiteOk) { + if (micro_op_resolver.AddSoftmax() != kTfLiteOk) { return; } - if (micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, - tflite::ops::micro::Register_RESHAPE()) != - kTfLiteOk) { + if (micro_op_resolver.AddReshape() != kTfLiteOk) { return; } diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index 2c442f955cc..0f6a2afd527 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -49,15 +49,10 @@ TF_LITE_MICRO_TEST(TestInvoke) { // // tflite::AllOpsResolver resolver; tflite::MicroMutableOpResolver<4> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, - tflite::ops::micro::Register_RESHAPE()); + micro_op_resolver.AddDepthwiseConv2D(); + micro_op_resolver.AddFullyConnected(); + micro_op_resolver.AddReshape(); + micro_op_resolver.AddSoftmax(); // Create an area of memory to use for input, output, and intermediate arrays. 
const int tensor_arena_size = 10 * 1024; diff --git a/tensorflow/lite/micro/examples/person_detection/main_functions.cc b/tensorflow/lite/micro/examples/person_detection/main_functions.cc index aa4d83a3334..d7e9f6826c4 100644 --- a/tensorflow/lite/micro/examples/person_detection/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection/main_functions.cc @@ -66,13 +66,9 @@ void setup() { // tflite::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) static tflite::MicroMutableOpResolver<3> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); + micro_op_resolver.AddAveragePool2D(); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); // Build an interpreter to run the model with. static tflite::MicroInterpreter static_interpreter( diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc index bc53a8410da..7e706d49fcc 100644 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc @@ -57,13 +57,9 @@ TF_LITE_MICRO_TEST(TestInvoke) { // // tflite::AllOpsResolver resolver; tflite::MicroMutableOpResolver<3> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); + micro_op_resolver.AddAveragePool2D(); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); // Build an interpreter to run the model with. 
tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index ac47e36ff8f..09a9cb2c6c4 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -73,17 +73,11 @@ void setup() { // tflite::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) static tflite::MicroMutableOpResolver<5> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, - tflite::ops::micro::Register_RESHAPE()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddAveragePool2D(); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); + micro_op_resolver.AddReshape(); + micro_op_resolver.AddSoftmax(); // Build an interpreter to run the model with. // NOLINTNEXTLINE(runtime-global-variables) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index ddec8951596..270a427b1df 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -53,17 +53,11 @@ TF_LITE_MICRO_TEST(TestInvoke) { // incur some penalty in code space for op implementations that are not // needed by this graph. tflite::MicroMutableOpResolver<5> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, - tflite::ops::micro::Register_RESHAPE()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + micro_op_resolver.AddAveragePool2D(); + micro_op_resolver.AddConv2D(); + micro_op_resolver.AddDepthwiseConv2D(); + micro_op_resolver.AddReshape(); + micro_op_resolver.AddSoftmax(); // Build an interpreter to run the model with. tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index b9ce2bb4bba..1b76f440a61 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -104,39 +104,72 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteOk; } - // Registers a Builtin Operator with the MicroOpResolver. - // - // Only the first call for a given BuiltinOperator enum will be successful. - // i.e. 
if this function is called again for a previously added - // BuiltinOperator, the MicroOpResolver will be unchanged and this function - // will return kTfLiteError. - // - // TODO(b/149408647): remove this API once the BuiltinOperator specific Add - // functions are fully implemented. - TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, - TfLiteRegistration* registration) { - TFLITE_DCHECK(registration != nullptr); - // For code that is not switched over to the new selective registration of - // the parse function, we pass in ParseOpData. This allows for backwards - // compatibility. - return AddBuiltin(op, *registration, ParseOpData); - } - // The Add* functions below add the various Builtin operators to the // MicroMutableOpResolver object. - // - // This API is currently experimental (and only supported for a small subset - // of operators). It will soon be preferred over the AddBuiltin function for - // the following reason: - // * If all calls to AddBuiltin for an application use this API, the code - // size will be smaller by 5-8K (compared to the using the AddBuiltin - // override). + + TfLiteStatus AddAbs() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_ABS, *tflite::ops::micro::Register_ABS(), + ParseOpData); + } + + TfLiteStatus AddAdd() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_ADD, *tflite::ops::micro::Register_ADD(), + ParseOpData); + } + + TfLiteStatus AddArgMax() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_ARG_MAX, + *tflite::ops::micro::Register_ARG_MAX(), ParseOpData); + } + + TfLiteStatus AddArgMin() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_ARG_MIN, + *tflite::ops::micro::Register_ARG_MIN(), ParseOpData); + } + + TfLiteStatus AddAveragePool2D() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, + *tflite::ops::micro::Register_AVERAGE_POOL_2D(), + ParseOpData); + } + + TfLiteStatus AddCeil() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_CEIL, + *tflite::ops::micro::Register_CEIL(), ParseOpData); + } + + TfLiteStatus AddConcatenation() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_CONCATENATION, + *tflite::ops::micro::Register_CONCATENATION(), + ParseOpData); + } TfLiteStatus AddConv2D() { return AddBuiltin(BuiltinOperator_CONV_2D, *tflite::ops::micro::Register_CONV_2D(), ParseConv2D); } + TfLiteStatus AddCos() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_COS, *tflite::ops::micro::Register_COS(), + ParseOpData); + } + TfLiteStatus AddDepthwiseConv2D() { return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, *tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), @@ -149,12 +182,91 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseDequantize); } + TfLiteStatus AddEqual() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. 
+ return AddBuiltin(BuiltinOperator_EQUAL, + *tflite::ops::micro::Register_EQUAL(), ParseOpData); + } + + TfLiteStatus AddFloor() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_FLOOR, + *tflite::ops::micro::Register_FLOOR(), ParseOpData); + } + TfLiteStatus AddFullyConnected() { return AddBuiltin(BuiltinOperator_FULLY_CONNECTED, *tflite::ops::micro::Register_FULLY_CONNECTED(), ParseFullyConnected); } + TfLiteStatus AddGreater() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_GREATER, + *tflite::ops::micro::Register_GREATER(), ParseOpData); + } + + TfLiteStatus AddGreaterEqual() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_GREATER_EQUAL, + *tflite::ops::micro::Register_GREATER_EQUAL(), + ParseOpData); + } + + TfLiteStatus AddL2Normalization() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_L2_NORMALIZATION, + *tflite::ops::micro::Register_L2_NORMALIZATION(), + ParseOpData); + } + + TfLiteStatus AddLess() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LESS, + *tflite::ops::micro::Register_LESS(), ParseOpData); + } + + TfLiteStatus AddLessEqual() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LESS_EQUAL, + *tflite::ops::micro::Register_LESS_EQUAL(), ParseOpData); + } + + TfLiteStatus AddLog() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LOG, *tflite::ops::micro::Register_LOG(), + ParseOpData); + } + + TfLiteStatus AddLogicalAnd() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LOGICAL_AND, + *tflite::ops::micro::Register_LOGICAL_AND(), ParseOpData); + } + + TfLiteStatus AddLogicalNot() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LOGICAL_NOT, + *tflite::ops::micro::Register_LOGICAL_NOT(), ParseOpData); + } + + TfLiteStatus AddLogicalOr() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_LOGICAL_OR, + *tflite::ops::micro::Register_LOGICAL_OR(), ParseOpData); + } + TfLiteStatus AddLogistic() { // TODO(b/149408647): Replace ParseOpData with the operator specific parse // function. @@ -162,26 +274,196 @@ class MicroMutableOpResolver : public MicroOpResolver { *tflite::ops::micro::Register_LOGISTIC(), ParseOpData); } + TfLiteStatus AddMaximum() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_MAXIMUM, + *tflite::ops::micro::Register_MAXIMUM(), ParseOpData); + } + + TfLiteStatus AddMaxPool2D() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_MAX_POOL_2D, + *tflite::ops::micro::Register_MAX_POOL_2D(), ParseOpData); + } + + TfLiteStatus AddMean() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. 
+ return AddBuiltin(BuiltinOperator_MEAN, + *tflite::ops::micro::Register_MEAN(), ParseOpData); + } + + TfLiteStatus AddMinimum() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_MINIMUM, + *tflite::ops::micro::Register_MINIMUM(), ParseOpData); + } + + TfLiteStatus AddMul() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_MUL, *tflite::ops::micro::Register_MUL(), + ParseOpData); + } + + TfLiteStatus AddNeg() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_NEG, *tflite::ops::micro::Register_NEG(), + ParseOpData); + } + + TfLiteStatus AddNotEqual() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_NOT_EQUAL, + *tflite::ops::micro::Register_NOT_EQUAL(), ParseOpData); + } + + TfLiteStatus AddPack() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_PACK, + *tflite::ops::micro::Register_PACK(), ParseOpData); + } + + TfLiteStatus AddPad() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_PAD, *tflite::ops::micro::Register_PAD(), + ParseOpData); + } + + TfLiteStatus AddPadV2() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_PADV2, + *tflite::ops::micro::Register_PADV2(), ParseOpData); + } + + TfLiteStatus AddPrelu() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_PRELU, + *tflite::ops::micro::Register_PRELU(), ParseOpData); + } + TfLiteStatus AddQuantize() { return AddBuiltin(BuiltinOperator_QUANTIZE, *tflite::ops::micro::Register_QUANTIZE(), ParseQuantize); } + TfLiteStatus AddRelu() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_RELU, + *tflite::ops::micro::Register_RELU(), ParseOpData); + } + + TfLiteStatus AddRelu6() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_RELU6, + *tflite::ops::micro::Register_RELU6(), ParseOpData); + } + TfLiteStatus AddReshape() { return AddBuiltin(BuiltinOperator_RESHAPE, *tflite::ops::micro::Register_RESHAPE(), ParseReshape); } + TfLiteStatus AddResizeNearestNeighbor() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + *tflite::ops::micro::Register_RESIZE_NEAREST_NEIGHBOR(), + ParseOpData); + } + + TfLiteStatus AddRound() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_ROUND, + *tflite::ops::micro::Register_ROUND(), ParseOpData); + } + + TfLiteStatus AddRsqrt() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_RSQRT, + *tflite::ops::micro::Register_RSQRT(), ParseOpData); + } + + TfLiteStatus AddSin() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. 
+ return AddBuiltin(BuiltinOperator_SIN, *tflite::ops::micro::Register_SIN(), + ParseOpData); + } + TfLiteStatus AddSoftmax() { return AddBuiltin(BuiltinOperator_SOFTMAX, *tflite::ops::micro::Register_SOFTMAX(), ParseSoftmax); } + TfLiteStatus AddSplit() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_SPLIT, + *tflite::ops::micro::Register_SPLIT(), ParseOpData); + } + + TfLiteStatus AddSqrt() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_SQRT, + *tflite::ops::micro::Register_SQRT(), ParseOpData); + } + + TfLiteStatus AddSquare() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_SQUARE, + *tflite::ops::micro::Register_SQUARE(), ParseOpData); + } + + TfLiteStatus AddStridedSlice() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_STRIDED_SLICE, + *tflite::ops::micro::Register_STRIDED_SLICE(), + ParseOpData); + } + + TfLiteStatus AddSub() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_SUB, *tflite::ops::micro::Register_SUB(), + ParseOpData); + } + TfLiteStatus AddSvdf() { return AddBuiltin(BuiltinOperator_SVDF, *tflite::ops::micro::Register_SVDF(), ParseSvdf); } + TfLiteStatus AddTanh() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_TANH, + *tflite::ops::micro::Register_TANH(), ParseOpData); + } + + TfLiteStatus AddUnpack() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_UNPACK, + *tflite::ops::micro::Register_UNPACK(), ParseOpData); + } + unsigned int GetRegistrationLength() { return registrations_len_; } private: diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index ff5dfdf3a9a..fe9c8de5959 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -68,14 +68,7 @@ TF_LITE_MICRO_TEST(TestOperations) { static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; - MicroMutableOpResolver<2> micro_op_resolver; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); - - // Only one AddBuiltin per operator should return kTfLiteOk. 
- TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); - + MicroMutableOpResolver<1> micro_op_resolver; TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddCustom("mock_custom", &r)); @@ -85,16 +78,10 @@ TF_LITE_MICRO_TEST(TestOperations) { tflite::MicroOpResolver* resolver = µ_op_resolver; + TF_LITE_MICRO_EXPECT_EQ(1, micro_op_resolver.GetRegistrationLength()); + const TfLiteRegistration* registration = - resolver->FindOp(BuiltinOperator_CONV_2D); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - - TF_LITE_MICRO_EXPECT_EQ(2, micro_op_resolver.GetRegistrationLength()); - - registration = resolver->FindOp(BuiltinOperator_RELU); + resolver->FindOp(BuiltinOperator_RELU); TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); registration = resolver->FindOp("mock_custom"); @@ -116,12 +103,7 @@ TF_LITE_MICRO_TEST(TestErrorReporting) { tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - mock_reporter.ResetState(); - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); + MicroMutableOpResolver<1> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); mock_reporter.ResetState(); @@ -132,10 +114,7 @@ TF_LITE_MICRO_TEST(TestErrorReporting) { // Attempting to Add more operators than the class template parameter for // MicroMutableOpResolver should result in errors. - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_RELU, &r)); - TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); - mock_reporter.ResetState(); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddRelu()); TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddCustom("mock_custom_1", &r)); From e74a115bcf5cd27f476b46161a639e9ec599491d Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Mon, 22 Jun 2020 17:47:47 -0700 Subject: [PATCH 0845/1390] [TF-numpy] Adds __rmatmul__ method to ndarray. 
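A rough sketch of what the reflected operator enables (illustrative only; the
`tnp` alias assumes the numpy_ops package is importable as
`tf.experimental.numpy`, which may not yet be publicly exposed at this
revision):

    import tensorflow.experimental.numpy as tnp

    a = [[1., 2.], [3., 4.]]               # plain Python list, no __matmul__
    b = tnp.asarray([[1., 0.], [0., 1.]])  # TF-numpy ndarray
    c = a @ b  # Python falls back to b.__rmatmul__(a), i.e. matmul(a, b)

Without `__rmatmul__`, the last line would raise a TypeError because the left
operand does not implement `__matmul__`.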
PiperOrigin-RevId: 317771125 Change-Id: I719c46d97ae1c68ac59dcd1cf8f65d067ddc7658 --- tensorflow/python/ops/numpy_ops/np_math_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/numpy_ops/np_math_ops.py b/tensorflow/python/ops/numpy_ops/np_math_ops.py index 427aa96e5a4..03329bbdbf1 100644 --- a/tensorflow/python/ops/numpy_ops/np_math_ops.py +++ b/tensorflow/python/ops/numpy_ops/np_math_ops.py @@ -950,11 +950,12 @@ setattr(np_arrays.ndarray, '__sub__', _wrap(subtract)) setattr(np_arrays.ndarray, '__rsub__', _wrap(subtract, True)) setattr(np_arrays.ndarray, '__mul__', _wrap(multiply)) setattr(np_arrays.ndarray, '__rmul__', _wrap(multiply, True)) +setattr(np_arrays.ndarray, '__matmul__', _wrap(matmul)) +setattr(np_arrays.ndarray, '__rmatmul__', _wrap(matmul, True)) setattr(np_arrays.ndarray, '__pow__', _wrap(power)) setattr(np_arrays.ndarray, '__rpow__', _wrap(power, True)) setattr(np_arrays.ndarray, '__truediv__', _wrap(true_divide)) setattr(np_arrays.ndarray, '__rtruediv__', _wrap(true_divide, True)) -setattr(np_arrays.ndarray, '__matmul__', _wrap(matmul)) def _comparison(tf_fun, x1, x2, cast_bool_to_int=False): From 4d13d6416da49b82b375d631082607be3c58b57f Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Mon, 22 Jun 2020 17:49:06 -0700 Subject: [PATCH 0846/1390] Make cluster_resolver standard property in tf.distribute strategies. PiperOrigin-RevId: 317771299 Change-Id: I71b5c585cef7bd7ef80e66b75e30287fddcf89e2 --- tensorflow/python/distribute/BUILD | 5 +- .../collective_all_reduce_strategy.py | 12 ++++ .../collective_all_reduce_strategy_test.py | 15 ++++- .../python/distribute/distribute_lib.py | 59 +++++++++++++++++++ .../python/distribute/distribute_lib_test.py | 13 ++++ .../python/distribute/strategy_common_test.py | 35 +++++++++++ tensorflow/python/distribute/tpu_strategy.py | 12 ++++ .../python/distribute/tpu_strategy_test.py | 7 +++ ...orflow.distribute.-mirrored-strategy.pbtxt | 4 ++ ...flow.distribute.-one-device-strategy.pbtxt | 4 ++ .../v1/tensorflow.distribute.-strategy.pbtxt | 4 ++ ...perimental.-central-storage-strategy.pbtxt | 4 ++ ...ntal.-multi-worker-mirrored-strategy.pbtxt | 4 ++ ...erimental.-parameter-server-strategy.pbtxt | 4 ++ ...tribute.experimental.-t-p-u-strategy.pbtxt | 4 ++ ...orflow.distribute.-mirrored-strategy.pbtxt | 4 ++ ...flow.distribute.-one-device-strategy.pbtxt | 4 ++ .../v2/tensorflow.distribute.-strategy.pbtxt | 4 ++ ...ensorflow.distribute.-t-p-u-strategy.pbtxt | 4 ++ ...perimental.-central-storage-strategy.pbtxt | 4 ++ ...ntal.-multi-worker-mirrored-strategy.pbtxt | 4 ++ ...erimental.-parameter-server-strategy.pbtxt | 4 ++ ...tribute.experimental.-t-p-u-strategy.pbtxt | 4 ++ 23 files changed, 215 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 74d80b63e12..85ee8de5635 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -204,6 +204,7 @@ py_test( "//tensorflow/python:variables", "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//third_party/py/numpy", ], ) @@ -1847,10 +1848,11 @@ py_test( ], ) -cuda_py_test( +distribute_py_test( name = "strategy_common_test", srcs = ["strategy_common_test.py"], python_version = "PY3", + shard_count = 12, tags = [ "multi_and_single_gpu", # TODO(b/155301154): Enable this test on multi-gpu guitar once multi process @@ -1859,6 +1861,7 @@ cuda_py_test( 
], xla_enable_strict_auto_jit = True, deps = [ + ":collective_all_reduce_strategy", ":combinations", ":multi_worker_test_base", ":reduce_util", diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index e2b039ceb23..e754cc43a41 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -138,6 +138,18 @@ class CollectiveAllReduceStrategy(distribute_lib.Strategy): """ return super(CollectiveAllReduceStrategy, self).scope() + @property + def cluster_resolver(self): + """Returns the cluster resolver associated with this strategy. + + As a multi-worker strategy, + `tf.distribute.experimental.MultiWorkerMirroredStrategy` provides the + associated `tf.distribute.cluster_resolver.ClusterResolver`. If the user + provides one in `__init__`, that instance is returned; if the user does + not, a default `TFConfigClusterResolver` is provided. + """ + return self.extended._cluster_resolver # pylint: disable=protected-access + @tf_export(v1=["distribute.experimental.MultiWorkerMirroredStrategy"]) # pylint: disable=missing-docstring class CollectiveAllReduceStrategyV1(distribute_lib.StrategyV1): diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy_test.py b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py index 87212c85fc4..9d16c6f5a1f 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy_test.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py @@ -505,8 +505,7 @@ class DistributedCollectiveAllReduceStrategyTest( self.assertEqual(['CollectiveReduce'], new_rewrite_options.scoped_allocator_opts.enable_op) - @combinations.generate(combinations.combine(mode=['eager'])) - def testEnableCollectiveOps(self): + def _get_strategy_with_mocked_methods(self): mock_called = [False] # pylint: disable=dangerous-default-value @@ -525,9 +524,21 @@ class DistributedCollectiveAllReduceStrategyTest( mock_configure_collective_ops): strategy, _, _ = self._get_test_object( task_type='worker', task_id=1, num_gpus=2) + + return strategy, mock_called + + @combinations.generate(combinations.combine(mode=['eager'])) + def testEnableCollectiveOps(self): + strategy, mock_called = self._get_strategy_with_mocked_methods() self.assertTrue(strategy.extended._std_server_started) self.assertTrue(mock_called[0]) + @combinations.generate(combinations.combine(mode=['eager'])) + def testEnableCollectiveOpsAndClusterResolver(self): + strategy, _ = self._get_strategy_with_mocked_methods() + self.assertEqual(strategy.cluster_resolver.task_type, 'worker') + self.assertEqual(strategy.cluster_resolver.task_id, 1) + class DistributedCollectiveAllReduceStrategyTestWithChief( CollectiveAllReduceStrategyTestBase, parameterized.TestCase): diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 5abfb6e1c09..f32427b88e0 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1439,6 +1439,65 @@ class StrategyBase(object): def __copy__(self): raise RuntimeError("Must only deepcopy DistributionStrategy.") + @property + def cluster_resolver(self): + """Returns the cluster resolver associated with this strategy. 
+ + In general, when using a multi-worker `tf.distribute` strategy such as + `tf.distribute.experimental.MultiWorkerMirroredStrategy` or + `tf.distribute.experimental.TPUStrategy()`, there is a + `tf.distribute.cluster_resolver.ClusterResolver` associated with the + strategy used, and such an instance is returned by this property. + + Strategies that intend to have an associated + `tf.distribute.cluster_resolver.ClusterResolver` must set the + relevant attribute, or override this property; otherwise, `None` is returned + by default. Those strategies should also provide information regarding what + is returned by this property. + + Single-worker strategies usually do not have a + `tf.distribute.cluster_resolver.ClusterResolver`, and in those cases this + property will return `None`. + + The `tf.distribute.cluster_resolver.ClusterResolver` may be useful when the + user needs to access information such as the cluster spec, task type or task + id. For example, + + ```python + + os.environ['TF_CONFIG'] = json.dumps({ + 'cluster': { + 'worker': ["localhost:12345", "localhost:23456"], + 'ps': ["localhost:34567"] + }, + 'task': {'type': 'worker', 'index': 0} + }) + + # This implicitly uses TF_CONFIG for the cluster and current task info. + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + ... + + if strategy.cluster_resolver.task_type == 'worker': + # Perform something that's only applicable on workers. Since we set this + # as a worker above, this block will run on this particular instance. + elif strategy.cluster_resolver.task_type == 'ps': + # Perform something that's only applicable on parameter servers. Since we + # set this as a worker above, this block will not run on this particular + # instance. + ``` + + For more information, please see + `tf.distribute.cluster_resolver.ClusterResolver`'s API docstring. + + Returns: + The cluster resolver associated with this strategy. Returns `None` if a + cluster resolver is not applicable or available in this strategy. 
+ """ + if hasattr(self.extended, "_cluster_resolver"): + return self.extended._cluster_resolver # pylint: disable=protected-access + return None + @tf_export("distribute.Strategy", v1=[]) # pylint: disable=g-missing-docstring class Strategy(StrategyBase): diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py index 8ea1cac6f02..b5924ec3b67 100644 --- a/tensorflow/python/distribute/distribute_lib_test.py +++ b/tensorflow/python/distribute/distribute_lib_test.py @@ -28,6 +28,7 @@ from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import input_lib from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op @@ -36,6 +37,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test +from tensorflow.python.training import server_lib from tensorflow.python.util import nest @@ -422,6 +424,17 @@ class TestStrategyTest(test.TestCase): test_fn() + def testClusterResolverDefaultNotImplemented(self): + dist = _TestStrategy() + self.assertIsNone(dist.cluster_resolver) + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + cluster_resolver = SimpleClusterResolver(base_cluster_spec) + dist.extended._cluster_resolver = cluster_resolver + self.assertIs(dist.cluster_resolver, cluster_resolver) + # _TestStrategy2 is like _TestStrategy, except it doesn't change variable # creation. diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py index b823e2de331..d1a72da7e7c 100644 --- a/tensorflow/python/distribute/strategy_common_test.py +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -27,6 +27,8 @@ from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import strategy_test_lib +from tensorflow.python.distribute.collective_all_reduce_strategy import CollectiveAllReduceStrategy +from tensorflow.python.distribute.tpu_strategy import TPUStrategy from tensorflow.python.eager import def_function from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops @@ -184,5 +186,38 @@ class DistributedCollectiveAllReduceStrategyTest( # worker strategy combinations can run on a fixed number of GPUs. +class StrategyClusterResolverTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + strategy=[strategy_combinations.multi_worker_mirrored_two_workers] + + strategy_combinations.all_strategies, + mode=['eager'])) + def testClusterResolverProperty(self, strategy): + # CollectiveAllReduceStrategy and TPUStrategy must have a cluster resolver. + # `None` otherwise. 
+ resolver = strategy.cluster_resolver + if not isinstance(strategy, CollectiveAllReduceStrategy) and not isinstance( + strategy, TPUStrategy): + self.assertIsNone(resolver) + return + + with strategy.scope(): + self.assertIs(strategy.cluster_resolver, resolver) + self.assertTrue(hasattr(resolver, 'cluster_spec')) + self.assertTrue(hasattr(resolver, 'environment')) + self.assertTrue(hasattr(resolver, 'master')) + self.assertTrue(hasattr(resolver, 'num_accelerators')) + self.assertIsNone(resolver.rpc_layer) + if isinstance(strategy, CollectiveAllReduceStrategy): + self.assertGreaterEqual(resolver.task_id, 0) + self.assertLessEqual(resolver.task_id, 1) + self.assertEqual(resolver.task_type, 'worker') + elif isinstance(strategy, TPUStrategy): + # TPUStrategy does not have task_id and task_type applicable. + self.assertIsNone(resolver.task_id) + self.assertIsNone(resolver.task_type) + + if __name__ == '__main__': combinations.main() diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 4b3c4be0ccd..df393c61dbb 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -345,6 +345,18 @@ class TPUStrategy(distribute_lib.Strategy): options = options or distribute_lib.RunOptions() return self.extended.tpu_run(fn, args, kwargs, options) + @property + def cluster_resolver(self): + """Returns the cluster resolver associated with this strategy. + + `tf.distribute.experimental.TPUStrategy` provides the + associated `tf.distribute.cluster_resolver.ClusterResolver`. If the user + provides one in `__init__`, that instance is returned; if the user does + not, a default + `tf.distribute.cluster_resolver.TPUClusterResolver` is provided. + """ + return self.extended._tpu_cluster_resolver # pylint: disable=protected-access + @tf_export(v1=["distribute.experimental.TPUStrategy"]) class TPUStrategyV1(distribute_lib.StrategyV1): diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 5e47e750d87..142743a6ec2 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -555,6 +555,13 @@ class TPUStrategyTest(test.TestCase, parameterized.TestCase): update_variable.get_concrete_function() self.assertLen(strategy.extended.worker_devices, trace_count[0]) + def test_cluster_resolver_available(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + tpu_strategy_util.initialize_tpu_system(resolver) + strategy = tpu_lib.TPUStrategy(resolver) + self.assertIs(strategy.cluster_resolver, resolver) + class TPUStrategyDataPrefetchTest(test.TestCase): diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt index 36c78c406b7..0c5db602029 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt index 09865ab02ee..ae62acffa44 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt index 0e6c10bd533..9285405ea4f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt index fbc4c107a1a..3c3d785ac7c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt index cd67e7d27c4..e1f8bea251b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt index 0eff82474ff..6ae83d18589 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt index 2af9a5ad095..0e548eca9b5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt index be4c841aed7..8817f16d808 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt index 4557fe1060b..992243ffe8a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt index 9f6a2ac32be..8140088e701 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-t-p-u-strategy.pbtxt index f41a08454e2..29947a1c9c5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-t-p-u-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-t-p-u-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt index dd61960c66f..695fb52358b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt index 500ae362e5f..43632e17b6d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt @@ -4,6 +4,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "cluster_resolver" + mtype: "" + } member { name: "extended" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt index d6dc9627d9a..39181625469 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
+  member {
+    name: "cluster_resolver"
+    mtype: ""
+  }
   member {
     name: "extended"
     mtype: ""
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
index 82a4362a597..855cdbfb175 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
+  member {
+    name: "cluster_resolver"
+    mtype: ""
+  }
   member {
     name: "extended"
     mtype: ""

From e60cf08994a837d1dc37ec2f85063e44b3987a81 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Mon, 22 Jun 2020 18:11:15 -0700
Subject: [PATCH 0847/1390] [NFC] Re-instate use of FuncOp::isPublic()

PiperOrigin-RevId: 317774876
Change-Id: I6a832236377d403b1ebd24ecbc26025c37dc1c13
---
 .../compiler/mlir/tensorflow/ir/tf_saved_model.cc | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index ef55761686e..5a7d81d4c0c 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -232,22 +232,19 @@ static LogicalResult VerifySavedModelModule(
   for (auto func : module.getOps()) {
     const bool is_exported = IsExported(func);

-    if (is_exported && func.getVisibility() != FuncOp::Visibility::Public) {
+    if (is_exported && !func.isPublic()) {
       return func.emitError()
              << "exported function @" << func.getName() << " should be public";
     }

-    if (!is_exported && func.getVisibility() == FuncOp::Visibility::Public) {
+    if (!is_exported && func.isPublic()) {
       return func.emitError()
              << "non-exported function @" << func.getName()
              << " should be private";
     }

-    if (HasAnyTfSavedModelArgAttr(func)) {
-      if (!is_exported) {
-        return func.emitError()
-               << "can only apply 'tf_saved_model' argument attributes "
-                  "to exported functions";
-      }
+    if (!is_exported && HasAnyTfSavedModelArgAttr(func)) {
+      return func.emitError() << "can only apply 'tf_saved_model' argument "
+                                 "attributes to exported functions";
     }
   }

From 265de52331a10af793f733d21e2152123819a269 Mon Sep 17 00:00:00 2001
From: Scott Zhu
Date: Mon, 22 Jun 2020 18:20:43 -0700
Subject: [PATCH 0848/1390] Fix input mapping issue when a model is
 constructed/tested with dict input tensors.

The mapping of dict input tensors was not correct: it was still keyed by the
tensor name rather than by the dict key used when the model was built. This
caused issues downstream when the inputs were provided with unknown keys. We
had some backup logic that would probably do the right thing, e.g. flattening
the dict to keep the original order, which was correct most of the time but
not reliable.

This change makes two behavior changes:
1. When a model is built with dict input tensors, the dict key, instead of
   the tensor name, is used to map the input data to the tensors.
2. Unknown keys in the input data now result in a warning rather than an
   error, since the user might provide them intentionally, e.g. when using
   part of the model to test with the full input data.
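For example, with the new behavior (a sketch using the public tf.keras API,
mirroring the unit test added in this change; layer names and shapes are
illustrative):

    import tensorflow as tf

    inputs = {
        'sentence1': tf.keras.Input(shape=(1,), name='b'),
        'sentence2': tf.keras.Input(shape=(1,), name='a'),
    }
    outputs = tf.keras.layers.Subtract()(
        [inputs['sentence1'], inputs['sentence2']])
    model = tf.keras.Model(inputs, outputs)

    data = {
        'sentence1': tf.ones((2, 1)),
        'sentence2': tf.zeros((2, 1)),
        'label': tf.zeros((2, 1)),  # unknown key: ignored, with a warning
    }
    model(data)  # matched by dict key ('sentence1'/'sentence2'),
                 # not by the underlying tensor names ('a'/'b')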
PiperOrigin-RevId: 317776370 Change-Id: I91983443f2b770cb0b45ddb7726f52708cb91d61 --- tensorflow/python/keras/engine/functional.py | 24 ++++++++-- .../python/keras/engine/functional_test.py | 46 +++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 0612d70044d..fd80e7f8bb4 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -22,6 +22,7 @@ from __future__ import print_function import collections import copy import itertools +import warnings from six.moves import zip # pylint: disable=redefined-builtin @@ -131,10 +132,10 @@ class Functional(training_lib.Model): # Models constructed with a single Tensor or list of Tensors can # be called with a dict, where the keys of the dict are the names - # of the `Input` objects. Extra keys are ignored. + # of the `Input` objects. Extra keys are ignored with warning. self._enable_dict_to_input_mapping = ( not nest.is_sequence(self._nested_inputs) or - (isinstance(self._nested_inputs, (list, tuple)) and + (isinstance(self._nested_inputs, (list, tuple, dict)) and not any(nest.is_sequence(t) for t in self._nested_inputs))) if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs): @@ -524,10 +525,27 @@ class Functional(training_lib.Model): ref_inputs = self._nested_inputs if not nest.is_sequence(ref_inputs): ref_inputs = [self._nested_inputs] + if isinstance(ref_inputs, dict): + # In the case that the graph is constructed with dict input tensors, + # We will use the original dict key to map with the keys in the input + # data. Note that the model.inputs is using nest.flatten to process the + # input tensors, which means the dict input tensors are ordered by their + # keys. + ref_input_names = sorted(ref_inputs.keys()) + else: + ref_input_names = [inp._keras_history.layer.name for inp in ref_inputs] + + # Raise an warning if there are more input data comparing to input tensor + if len(tensors) > len(ref_input_names): + warnings.warn( + 'Input dict contained keys {} which did not match any model input. ' + 'They will be ignored by the model.'.format( + [n for n in tensors.keys() if n not in ref_input_names]) + ) try: # Flatten in the order `Input`s were passed during Model construction. 
- return [tensors[inp._keras_history.layer.name] for inp in ref_inputs] + return [tensors[n] for n in ref_input_names] except KeyError: # TODO(b/151582614) return nest.flatten(tensors) diff --git a/tensorflow/python/keras/engine/functional_test.py b/tensorflow/python/keras/engine/functional_test.py index 3c14411deb9..0e82d95d3de 100644 --- a/tensorflow/python/keras/engine/functional_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -18,8 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import warnings + import numpy as np + from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op @@ -43,6 +46,7 @@ from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import string_ops from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test from tensorflow.python.training.tracking.util import Checkpoint @@ -1565,6 +1569,48 @@ class DefaultShapeInferenceBehaviorTest(keras_parameterized.TestCase): self.assertEqual(config['layers'][2]['inbound_nodes'], [[['in1', 0, 0, {}], ['in2', 0, 0, {}]]]) + @combinations.generate(combinations.combine(mode=['eager'])) + def test_dict_inputs_tensors(self): + # Note that this test is running with v2 eager only, since the v1 + # will behave differently wrt to dict input for training. + inputs = { + 'sentence2': input_layer_lib.Input( + shape=(), name='a', dtype=dtypes.string), + 'sentence1': input_layer_lib.Input( + shape=(), name='b', dtype=dtypes.string), + } + strlen = layers.Lambda(string_ops.string_length_v2) + diff = layers.Subtract()( + [strlen(inputs['sentence1']), strlen(inputs['sentence2'])]) + diff = math_ops.cast(diff, dtypes.float32) + model = training_lib.Model(inputs, diff) + + extra_keys = { + 'sentence1': constant_op.constant(['brown fox', 'lazy dog']), + 'sentence2': constant_op.constant(['owl', 'cheeky cat']), + 'label': constant_op.constant([0, 1]), + } + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model(extra_keys) + self.assertIn('ignored by the model', str(w[-1].message)) + + model.compile('sgd', 'mse') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model.fit(extra_keys, y=constant_op.constant([0, 1]), steps_per_epoch=1) + self.assertIn('ignored by the model', str(w[-1].message)) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model.evaluate(extra_keys, constant_op.constant([0, 1])) + self.assertIn('ignored by the model', str(w[-1].message)) + + # Make sure the model inputs are sorted with the dict keys. + self.assertEqual(model.inputs[0]._keras_history.layer.name, 'b') + self.assertEqual(model.inputs[1]._keras_history.layer.name, 'a') + class GraphUtilsTest(test.TestCase): From d8b90acf95bdff09e29b1065739e28e8bd7500d9 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 22 Jun 2020 18:46:12 -0700 Subject: [PATCH 0849/1390] Enable type annotations for python/ops. 
PiperOrigin-RevId: 317779660 Change-Id: Ife6b7319ef394b611798f5d01b64ebdb3c0a01cc --- tensorflow/python/ops/logging_ops.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 8ca63f55987..02fce277690 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -54,11 +54,9 @@ except NameError: # call relies on certain conditionals for its dependencies. Use # control_flow_ops.Assert. -# Assert and Print are special symbols in python, so we must -# have an upper-case version of them. -# -# For users with Python 3 or Python 2.7 -# with `from __future__ import print_function`, we could also allow lowercase. +# Assert and Print are special symbols in Python 2, so we must +# have an upper-case version of them. When support for it is dropped, +# we can allow lowercase. # See https://github.com/tensorflow/tensorflow/issues/18053 @@ -83,11 +81,6 @@ def Print(input_, data, message=None, first_n=None, summarize=None, name=None): with jupyter notebook (printing to the notebook *server's* output, not into the notebook). - Additionally, to use tf.print in python 2.7, users must make sure to import - the following: - - `from __future__ import print_function` - Args: input_: A tensor passed through this op. data: A list of tensors to print out when op is evaluated. @@ -148,11 +141,6 @@ def print_v2(*inputs, **kwargs): Python objects. Printed tensors will recursively show the first and last elements of each dimension to summarize. - @compatibility(python2) - In python 2.7, make sure to import the following: - `from __future__ import print_function` - @end_compatibility - Example: Single-input usage: From 7c669e5f795a75bf6665c9596d730e438ab36511 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 22 Jun 2020 19:13:19 -0700 Subject: [PATCH 0850/1390] [TF:TRT] Add #if to logger_registry.h for consistency. Add #if GOOGLE_TENSORRT to logger_registry.h to make it consistent with logger_registry.cc. Rewrite two lines of #if into #if GOOGLE_CUDA && GOOGLE_TENSORRT. 
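The mechanical pattern applied across the diffs below is simply collapsing the nested guards into a single conjunction. A minimal before/after sketch, using a placeholder declaration rather than actual TF-TRT code:

```cpp
// Before: nested single-condition guards around TensorRT-only code.
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
void UseTensorRtOnlyCode();  // placeholder declaration
#endif  // GOOGLE_TENSORRT
#endif  // GOOGLE_CUDA

// After: one guard, with the combined condition echoed on the #endif.
#if GOOGLE_CUDA && GOOGLE_TENSORRT
void UseTensorRtOnlyCode();  // placeholder declaration
#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
```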
PiperOrigin-RevId: 317782636 Change-Id: Icaf7116b5013f57bb37b80511c7c195f8bd7e2d5 --- tensorflow/compiler/tf2tensorrt/common/utils.h | 6 ++---- .../compiler/tf2tensorrt/convert/convert_graph.cc | 6 ++---- .../compiler/tf2tensorrt/convert/convert_graph.h | 6 ++---- .../tf2tensorrt/convert/convert_graph_test.cc | 6 ++---- .../compiler/tf2tensorrt/convert/convert_nodes.cc | 6 ++---- .../compiler/tf2tensorrt/convert/convert_nodes.h | 6 ++---- .../tf2tensorrt/convert/convert_nodes_test.cc | 6 ++---- .../compiler/tf2tensorrt/convert/logger_registry.cc | 6 ++---- .../compiler/tf2tensorrt/convert/logger_registry.h | 5 +++-- .../tf2tensorrt/convert/trt_optimization_pass.cc | 6 ++---- .../tf2tensorrt/convert/trt_optimization_pass.h | 6 ++---- .../tf2tensorrt/kernels/get_calibration_data_op.cc | 6 ++---- .../compiler/tf2tensorrt/kernels/trt_engine_op.cc | 6 ++---- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 6 ++---- .../tf2tensorrt/kernels/trt_engine_resource_ops.cc | 6 ++---- .../kernels/trt_engine_resource_ops_test.cc | 6 ++---- .../tf2tensorrt/ops/get_calibration_data_op.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc | 6 ++---- .../tf2tensorrt/ops/trt_engine_resource_ops.cc | 6 ++---- .../compiler/tf2tensorrt/plugin/plugin_cast.cu.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h | 6 ++---- tensorflow/compiler/tf2tensorrt/segment/segment.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/segment/segment.h | 6 ++---- .../compiler/tf2tensorrt/segment/segment_test.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/segment/union_find.h | 6 ++---- tensorflow/compiler/tf2tensorrt/tensorrt_test.cc | 6 ++---- .../compiler/tf2tensorrt/utils/trt_allocator.cc | 12 ++++-------- .../compiler/tf2tensorrt/utils/trt_allocator.h | 12 ++++-------- .../compiler/tf2tensorrt/utils/trt_engine_utils.cc | 6 ++---- .../compiler/tf2tensorrt/utils/trt_engine_utils.h | 6 ++---- .../tf2tensorrt/utils/trt_int8_calibrator.cc | 6 ++---- .../compiler/tf2tensorrt/utils/trt_int8_calibrator.h | 6 ++---- tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc | 6 ++---- tensorflow/compiler/tf2tensorrt/utils/trt_logger.h | 6 ++---- .../compiler/tf2tensorrt/utils/trt_lru_cache.cc | 6 ++---- .../compiler/tf2tensorrt/utils/trt_lru_cache.h | 6 ++---- .../utils/trt_shape_optimization_profiles.h | 6 ++---- .../utils/trt_shape_optimization_profiles_test.cc | 6 ++---- 39 files changed, 83 insertions(+), 162 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h index 9ab0145e1ec..b428733ecd4 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.h +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -16,8 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/platform/logging.h" @@ -29,7 +28,6 @@ namespace tensorrt { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 1c51d51f1c9..5429aaf3362 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -53,8 +53,7 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/tools/graph_transforms/transform_utils.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -884,5 +883,4 @@ Status ConvertAfterShapes(const ConversionParams& params) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 53ab84a6fa9..d3897e864fa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -24,8 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -66,7 +65,6 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index a1f523d6bfa..54fb1d56441 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -34,8 +34,7 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -231,5 +230,4 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 96cec556942..2ec616ba621 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -59,8 +59,7 @@ limitations under the License. 
#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/strided_slice_op.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -6258,5 +6257,4 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 7a1276c645c..a621735fad1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -33,8 +33,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -694,7 +693,6 @@ BinaryOperationMap(); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index c24b169f651..53ec9ee7ada 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -21,8 +21,7 @@ limitations under the License. #include #include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include #include @@ -6636,5 +6635,4 @@ TEST_F(OpConverterTest, ConvertPad) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc index 82e68cbb28d..07c9c2f1ea0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" @@ -58,5 +57,4 @@ LoggerRegistry* GetLoggerRegistry() { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h index 45b302742d0..2a265cf7caa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h @@ -19,7 +19,8 @@ limitations under the License. 
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA +#if GOOGLE_CUDA && GOOGLE_TENSORRT + #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -53,5 +54,5 @@ class RegisterLogger { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 72f4fe5ef9b..1cf98d135cb 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -28,8 +28,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stacktrace.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { @@ -302,5 +301,4 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar( } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index f79048bb5f6..e0aaa5500ab 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -23,8 +23,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -83,6 +82,5 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 3143b06817e..76fb40b9520 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -22,8 +22,7 @@ limitations under the License. #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/refcount.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,5 +66,4 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 98d199ca9ab..1094555a622 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -48,8 +48,7 @@ limitations under the License. 
#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/lib/statusor.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -1009,5 +1008,4 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index a06010de1c7..71193dc24cf 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -50,8 +50,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/public/version.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -306,5 +305,4 @@ TYPED_TEST(TRTEngineOpTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 2c5821df6ac..3b6e7e91d3b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -33,8 +33,7 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -251,5 +250,4 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc index 4a24160569d..6a073ee24d0 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc @@ -48,8 +48,7 @@ limitations under the License. #include "tensorflow/core/platform/tstring.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -246,5 +245,4 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc index 573172b92e6..2af3164c3e2 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,5 +33,4 @@ Returns calibration data for the given resource name } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc index bd3c2b299a9..2527fe9b910 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -59,5 +58,4 @@ REGISTER_OP("TRTEngineOp") .Attr("static_engine: bool = true"); } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc index 01911de66ec..3141092de03 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -46,5 +45,4 @@ REGISTER_OP("SerializeTRTResource") } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc index 4c0d8b0392a..141a7d1f462 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc @@ -17,8 +17,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #define EIGEN_USE_GPU // For definition of Eigen::GpuDevice. #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/core/util/gpu_kernel_helper.h" @@ -234,5 +233,4 @@ REGISTER_TFTRT_PLUGIN(CastPluginCreator); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc index 563ce724f43..83d5f9b5965 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc @@ -17,8 +17,7 @@ limitations under the License. 
#include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -30,5 +29,4 @@ const char* kTfTrtPluginNamespace = "TF"; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h index bdb046e6c71..600ac6683da 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -90,7 +89,6 @@ class TrtPluginRegistrar { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 32e30006f58..d9080b6f69a 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -35,8 +35,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -1062,5 +1061,4 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 7295c8f0d9d..3f79983cfd2 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -25,8 +25,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,7 +66,6 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index 2437481a9c4..f3bc5bfbee6 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -26,8 +26,7 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -522,5 +521,4 @@ TEST_F(SegmentTest, IncompatibleBatchSizes) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index 70e83c12fca..b53615ec019 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -19,8 +19,7 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/types/optional.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -217,7 +216,6 @@ UnionFind* UnionFind::FindRoot() { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc index 510591bfe00..e994d20df33 100644 --- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc +++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc @@ -18,8 +18,7 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/test.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -164,5 +163,4 @@ TEST(TensorrtTest, BasicFunctions) { } // namespace } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index 617ea7fad5c..d4f3a524577 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -17,11 +17,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -52,8 +50,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) { } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -113,5 +110,4 @@ void TRTDeviceAllocator::free(void* memory) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h index 4ab8b52f523..d219a8a14e8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h @@ -20,11 +20,9 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -33,8 +31,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space); } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -69,6 +66,5 @@ class TRTDeviceAllocator : public TRTBaseAllocator { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc index ed997b267b1..8ccfb8b06f0 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -25,8 +25,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -257,5 +256,4 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h index a471749877a..1ea4fe28cb4 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h @@ -24,8 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -91,7 +90,6 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc index 554c127fa37..24271e352a7 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc @@ -20,8 +20,7 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { @@ -147,5 +146,4 @@ TRTInt8Calibrator::~TRTInt8Calibrator() { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h index 06b39716490..4c670e85f52 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -22,8 +22,7 @@ limitations under the License. #include #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -101,6 +100,5 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc index 193687ebc8c..e34bf5e7397 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc @@ -15,8 +15,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" #include "tensorflow/core/platform/logging.h" @@ -68,5 +67,4 @@ REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger()); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h index 2ade1b48f47..ce6552e8fe9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h @@ -18,8 +18,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -40,7 +39,6 @@ class Logger : public nvinfer1::ILogger { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index fbcdaad52c0..ee7e6272372 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -23,8 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -141,5 +140,4 @@ EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 8e345254f75..991b9a949e4 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -115,8 +115,7 @@ class LRUCache { } }; -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT struct EngineContext { EngineContext() {} // Creates an empty context. @@ -223,8 +222,7 @@ class TRTEngineCacheResource : public ResourceBase { TrtShapeOptimizationProfile profiles_; }; -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index 40c7f5dcf31..fc688b14139 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -29,8 +29,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" @@ -173,6 +172,5 @@ class TrtShapeOptimizationProfile { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc index 501810587e0..32c2200fb71 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include @@ -214,5 +213,4 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT From d8e73a2f303ddebb7da3694fe7ae20a752eea01b Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Mon, 22 Jun 2020 19:16:42 -0700 Subject: [PATCH 0851/1390] Add element tracing for prefetch. 
PiperOrigin-RevId: 317782986 Change-Id: Ia6e6331a59559423f5a0eac76bf2e7ffd677f876 --- tensorflow/core/kernels/data/BUILD | 2 ++ .../core/kernels/data/prefetch_dataset_op.cc | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 6d0351202df..0972dc83ccf 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -674,6 +674,8 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", ], ) diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc index 0230bcd146d..20b78ba14ad 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc @@ -28,6 +28,8 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/stringprintf.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/core/protobuf/error_codes.pb.h" namespace tensorflow { @@ -303,6 +305,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { // The buffered data element. std::vector value; int64 created_us; + int64 id; }; int64 buffer_limit() const TF_EXCLUSIVE_LOCKS_REQUIRED(*mu_) { @@ -339,6 +342,13 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { // (if we successfully got an element) the output values. Status s = buffer_.front().status; if (s.ok()) { + int64 buffer_element_id = buffer_.front().id; + profiler::TraceMe traceme( + [&] { + return profiler::TraceMeEncode( + "PrefetchConsume", {{"element_id", buffer_element_id}}); + }, + profiler::kInfo); if (dataset()->slack_period_ > 0 && (num_elements() + 1) % dataset()->slack_period_ == 0) { // TODO(rachelim): Consider doing something more sophisticated @@ -423,8 +433,16 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { mutex_lock input_l(input_mu_); bool end_of_sequence; BufferElement buffer_element; - buffer_element.status = input_impl_->GetNext( - ctx.get(), &buffer_element.value, &end_of_sequence); + { + profiler::TraceMe traceme( + [&] { + return profiler::TraceMeEncode("PrefetchProduce", + {{"element_id", num_produced}}); + }, + profiler::kInfo); + buffer_element.status = input_impl_->GetNext( + ctx.get(), &buffer_element.value, &end_of_sequence); + } if (buffer_element.status.ok() && end_of_sequence) { mutex_lock l(*mu_); prefetch_thread_finished_ = true; @@ -437,6 +455,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { mutex_lock l(*mu_); RecordBufferEnqueue(ctx.get(), buffer_element.value); buffer_element.created_us = EnvTime::NowMicros(); + buffer_element.id = num_produced; buffer_.push_back(std::move(buffer_element)); cond_var_->notify_all(); } From 90078a19ee1ce5c7bc644bac1ceff7c16d12e4bb Mon Sep 17 00:00:00 2001 From: David Rim Date: Mon, 22 Jun 2020 19:16:48 -0700 Subject: [PATCH 0852/1390] Update quantize weights to use hybrid per-channel quantization and asymmetric quantization by default PiperOrigin-RevId: 317783008 Change-Id: I3c0f026ded030407c434bca70fe15d5fe723c744 --- tensorflow/lite/kernels/register.cc | 18 ++-- .../lite/tools/optimize/quantize_weights.cc | 92 ++++++++++++++++--- .../tools/optimize/quantize_weights_test.cc | 2 + 
.../lite/tools/versioning/op_version.cc | 63 ++++++++++++- tensorflow/lite/tools/versioning/op_version.h | 9 ++ .../lite/tools/versioning/op_version_test.cc | 27 ++++++ .../lite/tools/versioning/runtime_version.cc | 12 +++ 7 files changed, 195 insertions(+), 28 deletions(-) diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index c3a4aaad16d..452ce35ec78 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -54,24 +54,24 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D()); AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(), /* min_version = */ 1, - /* max_version = */ 4); + /* max_version = */ 5); AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(), /* min_version = */ 1, - /* max_version = */ 5); + /* max_version = */ 6); AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(), /* min_version = */ 1, - /* max_version = */ 3); + /* max_version = */ 4); AddBuiltin(BuiltinOperator_RNN, Register_RNN(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, Register_BIDIRECTIONAL_SEQUENCE_RNN(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, Register_UNIDIRECTIONAL_SEQUENCE_RNN(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP(), /* min_version = */ 1, /* max_version = */ 3); @@ -79,7 +79,7 @@ BuiltinOpResolver::BuiltinOpResolver() { Register_EMBEDDING_LOOKUP_SPARSE()); AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), /* min_version = */ 1, - /* max_version = */ 8); + /* max_version = */ 9); AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION()); AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP()); AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(), @@ -105,13 +105,13 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, Register_LOCAL_RESPONSE_NORMALIZATION()); AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version = */ 1, - /* max_version = */ 3); + /* max_version = */ 4); AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, Register_BIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version = */ 1, /* max_version = */ 3); AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_PAD, Register_PAD(), /* min_version = */ 1, /* max_version = */ 2); AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), /* min_version = */ 1, diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc index 7e3853c645c..8bef019a83e 100644 --- a/tensorflow/lite/tools/optimize/quantize_weights.cc +++ b/tensorflow/lite/tools/optimize/quantize_weights.cc @@ -43,6 +43,12 @@ typedef struct { int32_t op_input_idx; } ConsumerOpInfo; +typedef struct { + TensorT* t; + bool is_per_channel; + int channel_dim; +} TensorPerChannel; + // The default minimum number of elements a weights array must have to be // quantized by this transformation. 
const int kWeightsMinNumElementsDefault = 1024; @@ -138,6 +144,7 @@ bool IsHybridEvaluationOp(const OperatorT* op, const OperatorCodeT* op_code, } } else if (builtin_op_code == BuiltinOperator_FULLY_CONNECTED || builtin_op_code == BuiltinOperator_CONV_2D || + builtin_op_code == BuiltinOperator_DEPTHWISE_CONV_2D || builtin_op_code == BuiltinOperator_SVDF || builtin_op_code == BuiltinOperator_RNN || builtin_op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM || @@ -181,9 +188,10 @@ bool CheckAllOpInputsQuantized(const SubGraphT* subgraph, const OperatorT* op, // Inserts Tensors for each input tensor of op that should be // quantized into tensor_map. TfLiteStatus InsertQuantizableInputTensorsFromOperator( - const ModelT* model, const OperatorT* op, uint64_t weights_min_num_elements, + const ModelT* model, OperatorT* op, uint64_t weights_min_num_elements, const CustomOpMap& custom_op_map, - absl::flat_hash_map* tensor_map, int subgraph_index) { + absl::flat_hash_map* tensor_map, + int subgraph_index) { SubGraphT* subgraph = model->subgraphs.at(subgraph_index).get(); const OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get(); @@ -222,7 +230,50 @@ TfLiteStatus InsertQuantizableInputTensorsFromOperator( continue; } - tensor_map->insert({tensor_idx, tensor}); + if (op_code->builtin_code == BuiltinOperator_DEPTHWISE_CONV_2D) { + tensor_map->insert( + {tensor_idx, {tensor, /*is_per_channel=*/true, /*dim=*/3}}); + } else if (op_code->builtin_code == BuiltinOperator_CONV_2D) { + tensor_map->insert( + {tensor_idx, {tensor, /*is_per_channel=*/true, /*dim=*/0}}); + } else { + switch (op_code->builtin_code) { + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: + op->builtin_options.AsBidirectionalSequenceLSTMOptions() + ->asymmetric_quantize_inputs = true; + break; + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: + op->builtin_options.AsBidirectionalSequenceRNNOptions() + ->asymmetric_quantize_inputs = true; + break; + case BuiltinOperator_FULLY_CONNECTED: + op->builtin_options.AsFullyConnectedOptions() + ->asymmetric_quantize_inputs = true; + break; + case BuiltinOperator_LSTM: + op->builtin_options.AsLSTMOptions()->asymmetric_quantize_inputs = + true; + break; + case BuiltinOperator_RNN: + op->builtin_options.AsRNNOptions()->asymmetric_quantize_inputs = true; + break; + case BuiltinOperator_SVDF: + op->builtin_options.AsSVDFOptions()->asymmetric_quantize_inputs = + true; + break; + case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: + op->builtin_options.AsUnidirectionalSequenceLSTMOptions() + ->asymmetric_quantize_inputs = true; + break; + case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: + op->builtin_options.AsSequenceRNNOptions() + ->asymmetric_quantize_inputs = true; + break; + default: + break; + } + tensor_map->insert({tensor_idx, {tensor, /*is_per_channel=*/false}}); + } } return kTfLiteOk; @@ -275,17 +326,22 @@ void MakeTensor(const string& name, const std::vector& shape, void UpdateInt8OperatorVersions(ModelT* model) { for (int i = 0; i < model->operator_codes.size(); ++i) { const BuiltinOperator& op_code = model->operator_codes[i]->builtin_code; - if (op_code == BuiltinOperator_CONV_2D || op_code == BuiltinOperator_SVDF || - op_code == BuiltinOperator_RNN || + if (op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM || op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN || + op_code == BuiltinOperator_EMBEDDING_LOOKUP || + op_code == BuiltinOperator_RNN || op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM || op_code == 
BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN) { - model->operator_codes[i]->version = 2; - } else if (op_code == BuiltinOperator_FULLY_CONNECTED || - op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM || - op_code == BuiltinOperator_EMBEDDING_LOOKUP || - op_code == BuiltinOperator_LSTM) { model->operator_codes[i]->version = 3; + } else if (op_code == BuiltinOperator_LSTM || + op_code == BuiltinOperator_SVDF) { + model->operator_codes[i]->version = 4; + } else if (op_code == BuiltinOperator_CONV_2D) { + model->operator_codes[i]->version = 5; + } else if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) { + model->operator_codes[i]->version = 6; + } else if (op_code == BuiltinOperator_FULLY_CONNECTED) { + model->operator_codes[i]->version = 9; } } } @@ -354,7 +410,7 @@ TfLiteStatus QuantizeWeightsInt8(flatbuffers::FlatBufferBuilder* builder, ++subgraph_index) { SubGraphT* subgraph = model->subgraphs.at(subgraph_index).get(); - absl::flat_hash_map tensor_map; + absl::flat_hash_map tensor_map; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); TF_LITE_ENSURE_STATUS(InsertQuantizableInputTensorsFromOperator( @@ -362,16 +418,22 @@ TfLiteStatus QuantizeWeightsInt8(flatbuffers::FlatBufferBuilder* builder, subgraph_index)); } - for (std::pair tensor_pair : tensor_map) { + for (std::pair tensor_pair : tensor_map) { // Quantize the tensor. - TF_LITE_ENSURE_STATUS( - utils::SymmetricQuantizeTensor(model.get(), tensor_pair.second)); + if (tensor_pair.second.is_per_channel) { + TF_LITE_ENSURE_STATUS(utils::SymmetricQuantizeTensorPerChannel( + model.get(), tensor_pair.second.t, tensor_pair.second.channel_dim, + nullptr)); + } else { + TF_LITE_ENSURE_STATUS( + utils::SymmetricQuantizeTensor(model.get(), tensor_pair.second.t)); + } } // Examine the tensor consumers to determine which require dequantize ops. for (const auto& tensor_pair : tensor_map) { int32_t tensor_idx = tensor_pair.first; - TensorT* tensor = tensor_pair.second; + TensorT* tensor = tensor_pair.second.t; std::vector consumer_op_infos = GetTensorConsumers(model.get(), subgraph, tensor_idx); if (IsQuantizationPassThroughOps(model.get(), consumer_op_infos)) { diff --git a/tensorflow/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/lite/tools/optimize/quantize_weights_test.cc index 76f2815ef0b..2f92a9ad71c 100644 --- a/tensorflow/lite/tools/optimize/quantize_weights_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_weights_test.cc @@ -215,6 +215,8 @@ TEST_F(QuantizeWeightsTest, HybridConv) { } else if (quant_tensor->buffer() != 0) { EXPECT_EQ(quant_tensor->type(), TensorType_INT8) << quant_tensor->name()->str(); + auto shape = GetAsVector(quant_tensor->shape()); + EXPECT_EQ(quant_tensor->quantization()->scale()->size(), shape[0]); } else { EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32); } diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index a97b9da47f1..a339976739b 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -70,10 +70,13 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 3; } // If the op is a signed int8 hybrid operation, we need to return - // version 2. + // version 2 or 5 if per channel. 
if (op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(1) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.conv_2d.is_per_channel_quantized) { + return 5; + } return 2; } return 1; @@ -87,10 +90,13 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } // If the op is a signed int8 hybrid operation, we need to return - // version 4. + // version 4 or 6 if per-channel. if (op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(1) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.depthwise_conv_2d.is_per_channel_quantized) { + return 6; + } return 4; } // If the op has signed int8 op_sig.inputs and op_sig.outputs, its @@ -154,6 +160,10 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { if (op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(1) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.fully_connected.asymmetric_quantize_inputs) { + // This is to use the updated quantization scheme. + return 9; + } return 3; } // For float and uint8 fixed point kernels, if the weight is @@ -185,6 +195,10 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { if (op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(1) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + // This is to use the updated quantization scheme + if (op_sig.options.input_quantization.asymmetric_quantize_inputs) { + return 4; + } return 2; } return 1; @@ -251,6 +265,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(2) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.lstm.asymmetric_quantize_inputs) { + return 4; + } return 3; } // KERNEL_BASIC was added in version 2. 
@@ -265,6 +282,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { if (op_sig.input_types.at(0) == TensorType_FLOAT32 && op_sig.input_types.at(2) == TensorType_INT8 && op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.lstm.asymmetric_quantize_inputs) { + return 3; + } return 2; } return 1; @@ -450,7 +470,6 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 2; } return 1; - case BuiltinOperator_TANH: case BuiltinOperator_LOGISTIC: if (op_sig.input_types.at(0) == TensorType_INT16 && @@ -500,6 +519,19 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; + case BuiltinOperator_RNN: + case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: + if (op_sig.input_types.at(1) == TensorType_INT8 && + op_sig.output_types.at(0) == TensorType_FLOAT32) { + if (op_sig.options.input_quantization.asymmetric_quantize_inputs) { + return 3; + } else { + return 2; + } + } + return 1; case BuiltinOperator_ADD: case BuiltinOperator_PAD: case BuiltinOperator_PADV2: @@ -566,6 +598,16 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, op_sig.options.depthwise_conv_2d.dilation_h_factor = conv_option->dilation_h_factor(); } + const Tensor* filter_tensor = + subgraph->tensors()->Get(op->inputs()->Get(1)); + const QuantizationParameters* filter_quant = + filter_tensor->quantization(); + int num_channels = filter_tensor->shape()->Get(3); + if (filter_quant && filter_quant->scale() && + filter_quant->scale()->Length() && + filter_quant->scale()->Length() == num_channels) { + op_sig.options.depthwise_conv_2d.is_per_channel_quantized = true; + } } break; case BuiltinOperator_FAKE_QUANT: { @@ -584,6 +626,8 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, fully_connected_option->keep_num_dims(); op_sig.options.fully_connected.weights_format = fully_connected_option->weights_format(); + op_sig.options.fully_connected.asymmetric_quantize_inputs = + fully_connected_option->asymmetric_quantize_inputs(); } const Tensor* weight_tensor = @@ -644,6 +688,18 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, op_sig.options.resize.align_corners = resize_nn_option->align_corners(); } } break; + case BuiltinOperator_CONV_2D: { + const Tensor* filter_tensor = + subgraph->tensors()->Get(op->inputs()->Get(1)); + const QuantizationParameters* filter_quant = + filter_tensor->quantization(); + int num_channels = filter_tensor->shape()->Get(0); + if (filter_quant && filter_quant->scale() && + filter_quant->scale()->Length() && + filter_quant->scale()->Length() == num_channels) { + op_sig.options.conv_2d.is_per_channel_quantized = true; + } + } break; // TODO(b/150176627): Add tests for GetOpSignature. 
case BuiltinOperator_STRIDED_SLICE: case BuiltinOperator_SPACE_TO_BATCH_ND: @@ -651,7 +707,6 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, case BuiltinOperator_TRANSPOSE: { op_sig.options.single_input_op.num_dims = GetNumDims(subgraph, op, 0); } break; - case BuiltinOperator_SUB: case BuiltinOperator_DIV: case BuiltinOperator_MAXIMUM: diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h index df74ffaf6dd..71362001387 100644 --- a/tensorflow/lite/tools/versioning/op_version.h +++ b/tensorflow/lite/tools/versioning/op_version.h @@ -30,6 +30,7 @@ typedef struct { struct { int32_t dilation_w_factor; int32_t dilation_h_factor; + bool is_per_channel_quantized; } depthwise_conv_2d; struct { bool narrow_range; @@ -40,6 +41,7 @@ typedef struct { // TODO(b/156530611): Make this global when more ops support sparse // computation. bool sparse_weight; + bool asymmetric_quantize_inputs; } fully_connected; struct { float input1_scale; @@ -48,6 +50,7 @@ typedef struct { } mul; struct { LSTMKernelType kernel_type; + bool asymmetric_quantize_inputs; } lstm; struct { bool half_pixel_centers; @@ -60,6 +63,12 @@ typedef struct { int32_t num_dims; bool need_broadcast; } broadcast; + struct { + bool is_per_channel_quantized; + } conv_2d; + struct { + bool asymmetric_quantize_inputs; + } input_quantization; } options; } OpSignature; diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index 4017fc3bff0..e9fd857a3f5 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -361,6 +361,19 @@ TEST(OpVersionTest, VersioningFullyConnectedTest) { fake_op_sig.options.fully_connected = { false, FullyConnectedOptionsWeightsFormat_DEFAULT, true}; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 8); + + fake_op_sig = { + .op = BuiltinOperator_FULLY_CONNECTED, + .input_types = + std::vector<TensorType>{TensorType_FLOAT32, TensorType_INT8, + TensorType_FLOAT32}, + .output_types = std::vector<TensorType>{TensorType_FLOAT32}, + }; + fake_op_sig.options.fully_connected = { + false, FullyConnectedOptionsWeightsFormat_DEFAULT, false, false}; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + fake_op_sig.options.fully_connected.asymmetric_quantize_inputs = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 9); } TEST(OpVersionTest, VersioningDequantizeTest) { @@ -412,6 +425,15 @@ TEST(OpVersionTest, VersioningConv2DTest) { .output_types = std::vector<TensorType>{TensorType_FLOAT32}, }; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig = { + .op = BuiltinOperator_CONV_2D, + .input_types = + std::vector<TensorType>{TensorType_FLOAT32, TensorType_INT8}, + .output_types = std::vector<TensorType>{TensorType_FLOAT32}, + }; + fake_op_sig.options.conv_2d.is_per_channel_quantized = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); } TEST(OpVersionTest, VersioningFloorDivOperatorTest) { @@ -479,6 +501,8 @@ TEST(OpVersionTest, VersioningSVDFOperatorTest) { .output_types = std::vector<TensorType>{TensorType_FLOAT32}, }; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + fake_op_sig.options.input_quantization.asymmetric_quantize_inputs = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 4); fake_op_sig = { .op = BuiltinOperator_SVDF, .input_types = std::vector<TensorType>{TensorType_FLOAT32, TensorType_INT8, TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8}, .output_types = std::vector<TensorType>{TensorType_FLOAT32}, }; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); } + TEST(OpVersionTest, VersioningDepthwiseConv2DTest) { OpSignature fake_op_sig =
{ .op = BuiltinOperator_DEPTHWISE_CONV_2D, @@ -497,6 +522,8 @@ TEST(OpVersionTest, VersioningDepthwiseConv2DTest) { .output_types = std::vector{TensorType_FLOAT32}, }; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 4); + fake_op_sig.options.depthwise_conv_2d.is_per_channel_quantized = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 6); fake_op_sig = { .op = BuiltinOperator_DEPTHWISE_CONV_2D, diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc index 36976354685..efec5a7da18 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/lite/tools/versioning/runtime_version.cc @@ -63,11 +63,13 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_CONV_2D, 2}, "1.14.0"}, {{BuiltinOperator_CONV_2D, 3}, "1.14.0"}, {{BuiltinOperator_CONV_2D, 4}, kPendingReleaseVersion}, + {{BuiltinOperator_CONV_2D, 5}, kPendingReleaseVersion}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 1}, "1.5.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 2}, "1.12.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 3}, "1.14.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 4}, "2.2.0"}, {{BuiltinOperator_DEPTHWISE_CONV_2D, 5}, kPendingReleaseVersion}, + {{BuiltinOperator_DEPTHWISE_CONV_2D, 6}, kPendingReleaseVersion}, {{BuiltinOperator_ADD, 1}, "1.5.0"}, {{BuiltinOperator_ADD, 2}, "1.14.0"}, {{BuiltinOperator_ADD_N, 1}, "1.14.0"}, @@ -102,6 +104,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_FULLY_CONNECTED, 6}, "2.1.0"}, {{BuiltinOperator_FULLY_CONNECTED, 7}, kPendingReleaseVersion}, {{BuiltinOperator_FULLY_CONNECTED, 8}, kPendingReleaseVersion}, + {{BuiltinOperator_FULLY_CONNECTED, 9}, kPendingReleaseVersion}, {{BuiltinOperator_GATHER, 1}, "1.6.0"}, {{BuiltinOperator_GATHER, 2}, "1.14.0"}, {{BuiltinOperator_GATHER, 3}, "1.15.0"}, @@ -111,6 +114,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_SVDF, 1}, "1.5.0"}, {{BuiltinOperator_SVDF, 2}, "1.14.0"}, {{BuiltinOperator_SVDF, 3}, "2.2.0"}, + {{BuiltinOperator_SVDF, 4}, kPendingReleaseVersion}, {{BuiltinOperator_L2_NORMALIZATION, 1}, "1.5.0"}, {{BuiltinOperator_L2_NORMALIZATION, 2}, "1.14.0"}, {{BuiltinOperator_L2_POOL_2D, 1}, "1.5.0"}, @@ -151,13 +155,18 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_LSTM, 1}, "1.7.0"}, {{BuiltinOperator_LSTM, 2}, "1.10.0"}, {{BuiltinOperator_LSTM, 3}, "1.14.0"}, + {{BuiltinOperator_LSTM, 4}, kPendingReleaseVersion}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 1}, "1.13.1"}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 2}, "1.14.0"}, + {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, 3}, + kPendingReleaseVersion}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 1}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 2}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, 3}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, 1}, "1.14.0"}, {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, 2}, "1.14.0"}, + {{BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, 3}, + kPendingReleaseVersion}, {{BuiltinOperator_MEAN, 1}, "1.6.0"}, {{BuiltinOperator_MEAN, 2}, "1.14.0"}, {{BuiltinOperator_SUM, 1}, "1.10.0"}, @@ -179,6 +188,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, kPendingReleaseVersion}, {{BuiltinOperator_RNN, 1}, "1.5.0"}, {{BuiltinOperator_RNN, 2}, "1.14.0"}, + {{BuiltinOperator_RNN, 3}, kPendingReleaseVersion}, 
{{BuiltinOperator_SKIP_GRAM, 1}, "1.5.0"}, {{BuiltinOperator_SQUEEZE, 1}, "1.6.0"}, {{BuiltinOperator_SPLIT, 1}, "1.5.0"}, @@ -233,6 +243,8 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_UNIQUE, 1}, "1.14.0"}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, 1}, "1.14.0"}, {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, 2}, "1.14.0"}, + {{BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, 3}, + kPendingReleaseVersion}, {{BuiltinOperator_WHERE, 1}, "1.14.0"}, {{BuiltinOperator_DEQUANTIZE, 1}, "1.13.1"}, {{BuiltinOperator_DEQUANTIZE, 2}, "1.14.0"}, From 27d1a803165af2cab5025ebf2387aa4785349c5a Mon Sep 17 00:00:00 2001 From: Tare Gaskin Date: Tue, 23 Jun 2020 02:34:18 +0000 Subject: [PATCH 0853/1390] [-Wsign-compare] warning fixes batch 3 --- indexed_warning_files.json | 1 + tensorflow/core/data/compression_utils.cc | 2 +- .../optimizers/arithmetic_optimizer.cc | 24 +++++++++-------- .../grappler/optimizers/constant_folding.cc | 27 ++++++++++--------- .../generic_layout_optimizer_transposer.cc | 24 ++++++++++------- .../generic_layout_optimizer_transposer.h | 10 ++++--- .../optimizers/graph_optimizer_stage.cc | 2 +- .../optimizers/implementation_selector.cc | 2 +- .../grappler/optimizers/memory_optimizer.cc | 2 +- .../optimizers/scoped_allocator_optimizer.cc | 11 ++++---- .../grappler/optimizers/shape_optimizer.cc | 3 ++- .../kernels/data/single_threaded_executor.cc | 4 +-- .../kernels/initializable_lookup_table.cc | 2 +- .../remote_fused_graph_execute_utils.cc | 15 ++++++----- tensorflow/core/kernels/stack.cc | 3 ++- ...embedding_optimization_parameters_utils.cc | 4 +-- tensorflow/core/util/bcast.h | 4 +-- 17 files changed, 79 insertions(+), 61 deletions(-) create mode 100644 indexed_warning_files.json diff --git a/indexed_warning_files.json b/indexed_warning_files.json new file mode 100644 index 00000000000..cbe0560ba18 --- /dev/null +++ b/indexed_warning_files.json @@ -0,0 +1 @@ +{"0": "tensorflow/lite/arena_planner.cc", "1": "tensorflow/core/platform/protobuf.cc", "2": "tensorflow/core/platform/protobuf.cc", "3": "tensorflow/core/platform/default/logging.cc", "4": "tensorflow/core/platform/default/logging.cc", "5": "tensorflow/core/lib/strings/proto_serialization.cc", "6": "tensorflow/core/lib/strings/proto_serialization.cc", "7": "tensorflow/core/platform/default/stacktrace_handler.cc", "8": "tensorflow/core/platform/default/stacktrace_handler.cc", "9": "tensorflow/core/framework/cpu_allocator_impl.cc", "10": "tensorflow/core/framework/allocator_registry.cc", "11": "tensorflow/core/framework/cpu_allocator_impl.cc", "12": "tensorflow/core/framework/allocator_registry.cc", "13": "tensorflow/lite/experimental/microfrontend/lib/fft.cc", "14": "tensorflow/python/util/tf_stack.cc", "15": "external/com_github_grpc_grpc/src/core/tsi/ssl_transport_security.cc", "16": "tensorflow/core/profiler/internal/parse_annotation.cc", "17": "tensorflow/compiler/mlir/xla/ir/chlo_ops.cc", "18": "tensorflow/core/platform/status.cc", "19": "tensorflow/core/platform/file_system_helper.cc", "20": "tensorflow/core/platform/file_system.cc", "21": "tensorflow/core/platform/env.cc", "22": "tensorflow/core/lib/io/random_inputstream.cc", "23": "tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc", "24": "tensorflow/core/lib/io/inputbuffer.cc", "25": "tensorflow/core/lib/io/zlib_outputbuffer.cc", "26": "tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc", "27": "tensorflow/core/framework/tensor_shape.cc", "28": "tensorflow/compiler/mlir/xla/ir/hlo_ops.cc", "29": 
"tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc", "30": "tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc", "31": "tensorflow/compiler/mlir/lite/quantization/quantization_config.cc", "32": "tensorflow/core/kernels/data/prefetch_autotuner.cc", "33": "tensorflow/core/kernels/quantization_utils.cc", "34": "tensorflow/core/profiler/utils/derived_timeline.cc", "35": "tensorflow/core/profiler/utils/xplane_utils.cc", "36": "tensorflow/core/profiler/lib/profiler_session.cc", "37": "tensorflow/core/platform/s3/s3_file_system.cc", "38": "tensorflow/lite/toco/model_cmdline_flags.cc", "39": "tensorflow/lite/toco/toco_cmdline_flags.cc", "40": "tensorflow/lite/toco/toco_cmdline_flags.cc", "41": "tensorflow/compiler/xla/window_util.cc", "42": "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc", "43": "tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc", "44": "tensorflow/core/kernels/batch_kernels.cc", "45": "tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc", "46": "tensorflow/core/kernels/range_sampler.cc", "47": "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc", "48": "tensorflow/core/grappler/utils.cc", "49": "tensorflow/core/grappler/costs/op_level_cost_estimator.cc", "50": "tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc", "51": "tensorflow/core/grappler/utils/topological_sort.cc", "52": "tensorflow/core/grappler/utils/frame.cc", "53": "tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc", "54": "tensorflow/core/grappler/optimizers/model_pruner.cc", "55": "tensorflow/python/grappler/model_analyzer.cc", "56": "tensorflow/core/grappler/optimizers/debug_stripper.cc", "57": "tensorflow/core/grappler/utils/graph_view.cc", "58": "tensorflow/core/grappler/utils/functions.cc", "59": "tensorflow/core/grappler/costs/graph_memory.cc", "60": "tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc", "61": "tensorflow/core/grappler/optimizers/function_optimizer.cc", "62": "tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc", "63": "tensorflow/core/grappler/costs/virtual_scheduler.cc", "64": "tensorflow/core/grappler/optimizers/implementation_selector.cc", "65": "tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc", "66": "tensorflow/core/grappler/optimizers/shape_optimizer.cc", "67": "tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc", "68": "tensorflow/core/grappler/optimizers/memory_optimizer.cc", "69": "tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc", "70": "tensorflow/core/grappler/costs/graph_properties.cc", "71": "tensorflow/core/grappler/optimizers/constant_folding.cc", "72": "tensorflow/core/grappler/optimizers/loop_optimizer.cc", "73": "tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc", "74": "tensorflow/core/data/service/compression_utils.cc", "75": "tensorflow/core/kernels/stack.cc", "76": "tensorflow/core/kernels/remote_fused_graph_execute_utils.cc", "77": "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc", "78": "tensorflow/core/kernels/initializable_lookup_table.cc", "79": "tensorflow/core/kernels/data/single_threaded_executor.cc", "80": "tensorflow/core/kernels/lookup_util.cc", "81": "tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc", "82": "tensorflow/lite/toco/graph_transformations/identify_nearest_upsample.cc", "83": "tensorflow/lite/toco/graph_transformations/remove_successive_transpose.cc", "84": 
"tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc", "85": "tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc", "86": "tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc", "87": "tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc", "88": "tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc", "89": "tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc", "90": "tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc", "91": "tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc", "92": "tensorflow/lite/toco/graph_transformations/dequantize.cc", "93": "tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc", "94": "tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc", "95": "tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc", "96": "tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc", "97": "tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc", "98": "tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc", "99": "tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc", "100": "tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc", "101": "tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc", "102": "tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc", "103": "tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc", "104": "tensorflow/core/profiler/convert/op_metrics_to_record.cc", "105": "tensorflow/core/profiler/utils/event_span.cc", "106": "tensorflow/python/framework/python_op_gen.cc", "107": "tensorflow/python/framework/python_op_gen_internal.cc", "108": "tensorflow/compiler/tf2xla/ops/xla_ops.cc", "109": "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc", "110": "tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc", "111": "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc", "112": "tensorflow/core/kernels/data/captured_function.cc", "113": "tensorflow/core/profiler/convert/xplane_to_tf_functions.cc", "114": "tensorflow/core/common_runtime/bfc_allocator.cc", "115": "tensorflow/core/util/padding.cc", "116": "tensorflow/core/framework/op_def_util.cc", "117": "tensorflow/core/framework/node_def_util.cc", "118": "tensorflow/core/framework/shape_inference.cc", "119": "tensorflow/core/framework/common_shape_fns.cc", "120": "tensorflow/core/common_runtime/lower_case_op.cc", "121": "tensorflow/core/common_runtime/gradients.cc", "122": "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc", "123": "tensorflow/stream_executor/device_description.cc", "124": "tensorflow/compiler/jit/shape_inference.cc", "125": "tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc", "126": "tensorflow/compiler/xla/index_util.cc", "127": "tensorflow/compiler/xla/metric_table_report.cc", "128": "tensorflow/compiler/xla/layout.cc", "129": "tensorflow/stream_executor/stream_executor_pimpl.cc", "130": "tensorflow/compiler/xla/shape_util.cc", "131": "tensorflow/compiler/xla/service/hlo_lexer.cc", "132": "tensorflow/compiler/xla/service/cpu/shape_partition.cc", "133": "tensorflow/compiler/xla/util.cc", "134": "tensorflow/compiler/xla/service/name_uniquer.cc", "135": "tensorflow/compiler/xla/shape_layout.cc", "136": "tensorflow/compiler/xla/client/sharding_builder.cc", "137": 
"tensorflow/compiler/xla/service/computation_layout.cc", "138": "tensorflow/compiler/mlir/xla/type_to_shape.cc", "139": "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc", "140": "tensorflow/compiler/xla/service/shaped_buffer.cc", "141": "tensorflow/compiler/tf2xla/sharding_util.cc", "142": "tensorflow/stream_executor/stream.cc", "143": "tensorflow/compiler/xla/layout_util.cc", "144": "tensorflow/compiler/xla/shape.cc", "145": "tensorflow/core/profiler/convert/xplane_to_memory_profile.cc", "146": "tensorflow/core/profiler/convert/op_stats_to_overview_page.cc", "147": "tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc", "148": "tensorflow/compiler/xla/client/xla_computation.cc", "149": "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc", "150": "tensorflow/compiler/tf2xla/rearrange_function_argument.cc", "151": "tensorflow/compiler/jit/encapsulate_util.cc", "152": "tensorflow/compiler/tf2xla/tf2xla_util.cc", "153": "tensorflow/compiler/tf2xla/functionalize_while.cc", "154": "tensorflow/compiler/tf2xla/functionalize_cond.cc", "155": "tensorflow/core/kernels/boosted_trees/resources.cc", "156": "tensorflow/python/client/session_ref.cc", "157": "tensorflow/core/distributed_runtime/rpc/grpc_state.cc", "158": "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc", "159": "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc", "160": "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc", "161": "tensorflow/core/distributed_runtime/collective_rma_distributed.cc", "162": "tensorflow/core/platform/status.cc", "163": "tensorflow/core/framework/tensor_shape.cc", "164": "tensorflow/core/profiler/internal/tfprof_timeline.cc", "165": "tensorflow/core/platform/file_system.cc", "166": "tensorflow/core/platform/file_system_helper.cc", "167": "tensorflow/core/platform/env.cc", "168": "tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc", "169": "tensorflow/core/lib/io/zlib_outputbuffer.cc", "170": "tensorflow/core/lib/io/snappy/snappy_inputbuffer.cc", "171": "tensorflow/core/lib/io/random_inputstream.cc", "172": "tensorflow/core/lib/io/inputbuffer.cc", "173": "tensorflow/core/util/padding.cc", "174": "tensorflow/core/framework/op_def_util.cc", "175": "tensorflow/core/platform/s3/s3_file_system.cc", "176": "tensorflow/stream_executor/device_description.cc", "177": "tensorflow/core/framework/shape_inference.cc", "178": "tensorflow/core/framework/node_def_util.cc", "179": "tensorflow/core/common_runtime/bfc_allocator.cc", "180": "tensorflow/core/framework/common_shape_fns.cc", "181": "tensorflow/stream_executor/stream_executor_pimpl.cc", "182": "tensorflow/core/profiler/utils/xplane_utils.cc", "183": "tensorflow/core/grappler/utils.cc", "184": "tensorflow/core/grappler/costs/op_level_cost_estimator.cc", "185": "tensorflow/core/grappler/utils/symbolic_shapes.cc", "186": "tensorflow/core/grappler/utils/frame.cc", "187": "tensorflow/core/grappler/utils/topological_sort.cc", "188": "tensorflow/stream_executor/stream.cc", "189": "tensorflow/core/kernels/initializable_lookup_table.cc", "190": "tensorflow/core/grappler/optimizers/common_subgraph_elimination.cc", "191": "tensorflow/core/grappler/utils/graph_view.cc", "192": "tensorflow/core/grappler/optimizers/model_pruner.cc", "193": "tensorflow/core/kernels/lookup_util.cc", "194": "tensorflow/compiler/tf2xla/ops/xla_ops.cc", "195": "tensorflow/python/framework/python_op_gen_internal.cc", "196": "tensorflow/python/framework/python_op_gen.cc", "197": 
"tensorflow/core/profiler/utils/derived_timeline.cc", "198": "tensorflow/core/grappler/optimizers/debug_stripper.cc", "199": "tensorflow/core/grappler/optimizers/implementation_selector.cc", "200": "tensorflow/core/grappler/utils/functions.cc", "201": "tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc", "202": "tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc", "203": "tensorflow/core/grappler/optimizers/shape_optimizer.cc", "204": "tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc", "205": "tensorflow/core/grappler/optimizers/function_optimizer.cc", "206": "tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc", "207": "tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc", "208": "tensorflow/core/grappler/costs/virtual_scheduler.cc", "209": "tensorflow/core/grappler/optimizers/loop_optimizer.cc", "210": "tensorflow/core/grappler/optimizers/constant_folding.cc", "211": "tensorflow/core/profiler/lib/profiler_session.cc", "212": "tensorflow/core/grappler/costs/graph_properties.cc", "213": "tensorflow/core/grappler/costs/graph_memory.cc", "214": "tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc", "215": "tensorflow/core/grappler/optimizers/memory_optimizer.cc", "216": "tensorflow/core/common_runtime/gradients.cc", "217": "tensorflow/core/common_runtime/lower_case_op.cc", "218": "tensorflow/core/grappler/utils/symbolic_shapes.cc", "219": "tensorflow/compiler/jit/graphcycles/graphcycles.cc", "220": "tensorflow/compiler/jit/xla_cluster_util.cc", "221": "tensorflow/core/kernels/data/experimental/snapshot_util.cc", "222": "tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc", "223": "tensorflow/core/distributed_runtime/graph_mgr.cc", "224": "tensorflow/core/grappler/optimizers/data/vectorization_utils.cc", "225": "tensorflow/core/grappler/optimizers/data/map_vectorization.cc", "226": "tensorflow/cc/framework/while_gradients.cc", "227": "tensorflow/cc/framework/gradients.cc", "228": "tensorflow/core/grappler/graph_analyzer/subgraph.cc", "229": "tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc", "230": "tensorflow/core/grappler/graph_analyzer/sig_node.cc", "231": "tensorflow/compiler/tf2xla/shape_util.cc", "232": "tensorflow/compiler/xla/service/computation_placer.cc", "233": "tensorflow/compiler/xla/client/executable_build_options.cc", "234": "tensorflow/compiler/xla/service/hlo_module_config.cc", "235": "tensorflow/core/distributed_runtime/master_session.cc", "236": "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc", "237": "tensorflow/compiler/tf2xla/literal_util.cc", "238": "tensorflow/compiler/xla/service/cpu/cpu_options.cc", "239": "tensorflow/compiler/xla/client/client.cc", "240": "tensorflow/compiler/xla/literal_util.cc", "241": "tensorflow/core/distributed_runtime/rpc/grpc_session.cc", "242": "tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc", "243": "tensorflow/compiler/xla/service/call_graph.cc", "244": "tensorflow/compiler/xla/service/tuple_util.cc", "245": "tensorflow/compiler/xla/service/hlo_reachability.cc", "246": "tensorflow/compiler/xla/service/hlo_cost_analysis.cc", "247": "tensorflow/compiler/xla/service/hlo_execution_profile.cc", "248": "tensorflow/compiler/xla/service/hlo_module_group.cc", "249": "tensorflow/compiler/mlir/xla/hlo_utils.cc", "250": "tensorflow/compiler/xla/service/hlo_sharding_metadata.cc", "251": "tensorflow/compiler/xla/service/map_inliner.cc", "252": 
"tensorflow/compiler/xla/service/flatten_call_graph.cc", "253": "tensorflow/compiler/xla/service/hlo_domain_map.cc", "254": "tensorflow/compiler/xla/service/hlo_cse.cc", "255": "tensorflow/compiler/xla/service/batchnorm_expander.cc", "256": "tensorflow/compiler/xla/service/dynamic_index_splitter.cc", "257": "tensorflow/compiler/xla/service/dfs_hlo_visitor.cc", "258": "tensorflow/compiler/xla/service/hlo_subcomputation_unification.cc", "259": "tensorflow/compiler/xla/service/slice_sinker.cc", "260": "tensorflow/compiler/xla/service/dot_decomposer.cc", "261": "tensorflow/compiler/xla/service/sort_simplifier.cc", "262": "tensorflow/compiler/xla/service/reshape_mover.cc", "263": "tensorflow/compiler/xla/service/gpu/partition_assignment.cc", "264": "tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc", "265": "tensorflow/compiler/xla/service/dynamic_parameter_binding.cc", "266": "tensorflow/compiler/xla/literal.cc", "267": "tensorflow/compiler/xla/service/hlo_schedule.cc", "268": "tensorflow/compiler/xla/service/buffer_value.cc", "269": "tensorflow/compiler/xla/service/hlo_dce.cc", "270": "tensorflow/compiler/xla/service/hlo_module.cc", "271": "tensorflow/compiler/xla/service/hlo_sharding.cc", "272": "tensorflow/compiler/xla/service/transpose_folding.cc", "273": "tensorflow/compiler/xla/service/logical_buffer.cc", "274": "tensorflow/compiler/xla/service/call_inliner.cc", "275": "tensorflow/compiler/xla/service/hlo_buffer.cc", "276": "tensorflow/compiler/xla/service/hlo_instructions.cc", "277": "tensorflow/compiler/xla/service/hlo_value.cc", "278": "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc", "279": "tensorflow/compiler/xla/service/tuple_simplifier.cc", "280": "tensorflow/compiler/xla/service/hlo_computation.cc", "281": "tensorflow/compiler/xla/service/logical_buffer_analysis.cc", "282": "tensorflow/compiler/xla/service/hlo_phi_graph.cc", "283": "tensorflow/compiler/xla/service/conditional_simplifier.cc", "284": "tensorflow/compiler/xla/service/hlo_query.cc", "285": "tensorflow/compiler/xla/service/channel_tracker.cc", "286": "tensorflow/compiler/xla/client/lib/constants.cc", "287": "tensorflow/compiler/xla/service/collective_ops_utils.cc", "288": "tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc", "289": "tensorflow/compiler/tf2xla/lib/util.cc", "290": "tensorflow/compiler/xla/service/shape_inference.cc", "291": "tensorflow/compiler/tf2xla/lib/broadcast.cc", "292": "tensorflow/compiler/tf2xla/lib/data_format.cc", "293": "tensorflow/compiler/xla/service/hlo_parser.cc", "294": "tensorflow/compiler/xla/service/hlo_ordering.cc", "295": "tensorflow/compiler/xla/service/hlo_instruction.cc", "296": "tensorflow/compiler/xla/service/instruction_fusion.cc", "297": "tensorflow/compiler/xla/service/tuple_points_to_analysis.cc", "298": "tensorflow/compiler/xla/service/hlo_verifier.cc", "299": "tensorflow/compiler/xla/client/lib/comparators.cc", "300": "tensorflow/compiler/xla/client/lib/arithmetic.cc", "301": "tensorflow/compiler/xla/client/lib/sorting.cc", "302": "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc", "303": "tensorflow/compiler/xla/client/lib/loops.cc", "304": "tensorflow/compiler/tf2xla/lib/scatter.cc", "305": "tensorflow/compiler/xla/client/lib/slicing.cc", "306": "tensorflow/compiler/xla/client/lib/prng.cc", "307": "tensorflow/compiler/xla/client/xla_builder.cc", "308": "tensorflow/compiler/xla/service/hlo_graph_dumper.cc", "309": "tensorflow/compiler/xla/service/hlo_alias_analysis.cc", "310": 
"tensorflow/compiler/xla/service/hlo_live_range.cc", "311": "tensorflow/compiler/xla/client/lib/pooling.cc", "312": "tensorflow/compiler/xla/client/lib/matrix.cc", "313": "tensorflow/compiler/xla/client/lib/tridiagonal.cc", "314": "tensorflow/compiler/xla/service/conditional_to_select.cc", "315": "tensorflow/compiler/xla/service/batch_dot_simplification.cc", "316": "tensorflow/compiler/xla/service/rng_expander.cc", "317": "tensorflow/compiler/xla/service/cholesky_expander.cc", "318": "tensorflow/compiler/xla/client/lib/svd.cc", "319": "tensorflow/compiler/xla/service/hlo_memory_scheduler.cc", "320": "tensorflow/compiler/xla/client/lib/qr.cc", "321": "tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc", "322": "tensorflow/compiler/xla/service/convolution_group_converter.cc", "323": "tensorflow/compiler/xla/client/lib/math.cc", "324": "tensorflow/compiler/xla/service/hlo_creation_utils.cc", "325": "tensorflow/compiler/xla/service/layout_assignment.cc", "326": "tensorflow/compiler/xla/service/while_loop_constant_sinking.cc", "327": "tensorflow/compiler/xla/service/scatter_expander.cc", "328": "tensorflow/compiler/xla/service/heap_simulator.cc", "329": "tensorflow/compiler/xla/service/while_util.cc", "330": "tensorflow/compiler/xla/service/dynamic_dimension_inference.cc", "331": "tensorflow/compiler/xla/service/cpu/buffer_info_util.cc", "332": "tensorflow/compiler/xla/service/hlo_proto_util.cc", "333": "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc", "334": "tensorflow/compiler/xla/service/executable.cc", "335": "tensorflow/compiler/xla/service/hlo_pass_pipeline.cc", "336": "tensorflow/compiler/xla/service/triangular_solve_expander.cc", "337": "tensorflow/compiler/xla/service/compilation_cache.cc", "338": "tensorflow/compiler/xla/service/platform_util.cc", "339": "tensorflow/compiler/xla/service/copy_insertion.cc", "340": "tensorflow/compiler/xla/service/compiler.cc", "341": "tensorflow/compiler/xla/service/hlo_constant_folding.cc", "342": "tensorflow/compiler/xla/service/memory_space_assignment.cc", "343": "tensorflow/compiler/xla/service/transfer_manager.cc", "344": "tensorflow/compiler/xla/service/generic_transfer_manager.cc", "345": "tensorflow/compiler/xla/service/hlo_element_type_converter.cc", "346": "tensorflow/compiler/xla/service/execution_tracker.cc", "347": "tensorflow/compiler/xla/service/buffer_assignment.cc", "348": "tensorflow/compiler/xla/service/backend.cc", "349": "tensorflow/compiler/xla/service/op_expander_pass.cc", "350": "tensorflow/compiler/xla/service/indexed_array_analysis.cc", "351": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc", "352": "tensorflow/compiler/xla/service/tree_reduction_rewriter.cc", "353": "tensorflow/compiler/xla/service/rng_bit_generator_expander.cc", "354": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc", "355": "tensorflow/compiler/xla/service/while_loop_analysis.cc", "356": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc", "357": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc", "358": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc", "359": "tensorflow/compiler/xla/service/algebraic_simplifier.cc", "360": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc", "361": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc", "362": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc", "363": "tensorflow/compiler/xla/service/allocation_tracker.cc", "364": 
"tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc", "365": "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc", "366": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc", "367": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc", "368": "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc", "369": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc", "370": "tensorflow/compiler/xla/service/dynamic_padder.cc", "371": "tensorflow/compiler/xla/service/dump.cc", "372": "tensorflow/compiler/xla/service/while_loop_simplifier.cc", "373": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc", "374": "tensorflow/compiler/xla/service/compile_only_service.cc", "375": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc", "376": "tensorflow/compiler/xla/service/local_service.cc", "377": "tensorflow/compiler/xla/client/compile_only_client.cc", "378": "tensorflow/compiler/tf2xla/xla_expression.cc", "379": "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc", "380": "tensorflow/compiler/xla/client/client_library.cc", "381": "tensorflow/compiler/xla/client/local_client.cc", "382": "tensorflow/compiler/jit/xla_tensor.cc", "383": "tensorflow/compiler/xla/service/service.cc", "384": "tensorflow/compiler/tf2xla/const_analysis.cc", "385": "tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc", "386": "tensorflow/compiler/jit/device_util.cc", "387": "tensorflow/compiler/tf2xla/xla_context.cc", "388": "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc", "389": "tensorflow/compiler/tf2xla/xla_op_registry.cc", "390": "tensorflow/compiler/tf2xla/xla_helpers.cc", "391": "tensorflow/compiler/tf2xla/xla_compilation_device.cc", "392": "tensorflow/compiler/tf2xla/graph_compiler.cc", "393": "tensorflow/compiler/tf2xla/xla_resource.cc", "394": "tensorflow/compiler/tf2xla/xla_op_kernel.cc", "395": "tensorflow/compiler/tf2xla/lib/random.cc", "396": "tensorflow/compiler/jit/compilability_check_util.cc", "397": "tensorflow/compiler/xla/service/hlo_evaluator.cc", "398": "tensorflow/compiler/tf2xla/graph_compiler_util.cc", "399": "tensorflow/compiler/tf2xla/kernels/if_while_utils.cc", "400": "tensorflow/compiler/tf2xla/xla_compiler.cc", "401": "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc", "402": "tensorflow/compiler/jit/build_xla_ops_pass.cc", "403": "tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc", "404": "tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc", "405": "tensorflow/compiler/jit/extract_outside_compilation_pass.cc", "406": "tensorflow/compiler/aot/aot_only_var_handle_op.cc", "407": "tensorflow/compiler/tf2xla/tf2xla.cc", "408": "tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc", "409": "tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc", "410": "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc", "411": "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc", "412": "tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc", "413": "tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc", "414": "tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc", "415": "tensorflow/compiler/jit/xla_compilation_cache.cc", "416": "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc", "417": "tensorflow/compiler/mlir/tensorflow/translate/import_model.cc", "418": 
"tensorflow/compiler/jit/xla_device_context.cc", "419": "tensorflow/compiler/jit/xla_launch_util.cc", "420": "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc", "421": "tensorflow/compiler/jit/xla_compile_on_demand_op.cc", "422": "tensorflow/compiler/jit/xla_device_ops.cc", "423": "tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc", "424": "tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc", "425": "tensorflow/compiler/jit/xla_device.cc", "426": "tensorflow/compiler/jit/kernels/xla_ops.cc", "427": "tensorflow/compiler/jit/xla_kernel_creator_util.cc", "428": "tensorflow/compiler/xla/service/llvm_compiler.cc", "429": "tensorflow/core/distributed_runtime/eager/remote_mgr.cc", "430": "tensorflow/core/common_runtime/eager/execute.cc", "431": "tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc", "432": "tensorflow/core/distributed_runtime/eager/eager_service_impl.cc", "433": "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc", "434": "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc", "435": "tensorflow/compiler/xla/service/cpu/xfeed_manager.cc", "436": "tensorflow/compiler/xla/service/gpu/target_util.cc", "437": "tensorflow/compiler/aot/embedded_protocol_buffers.cc", "438": "tensorflow/compiler/xla/service/llvm_ir/math_ops.cc", "439": "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc", "440": "tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc", "441": "tensorflow/compiler/xla/service/llvm_ir/ir_array.cc", "442": "tensorflow/compiler/xla/service/cpu/vector_support_library.cc", "443": "tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc", "444": "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc", "445": "tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc", "446": "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc", "447": "tensorflow/python/tfcompile_wrapper.cc", "448": "tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc", "449": "tensorflow/compiler/xla/service/cpu/cpu_runtime.cc", "450": "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc", "451": "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc", "452": "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc", "453": "tensorflow/compiler/xla/service/cpu/ir_function.cc", "454": "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc", "455": "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc", "456": "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc", "457": "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc", "458": "tensorflow/compiler/xla/service/cpu/compiler_functor.cc", "459": "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc", "460": "tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc", "461": "tensorflow/compiler/xla/service/elemental_ir_emitter.cc", "462": "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc", "463": "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc", "464": "tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc", "465": "tensorflow/compiler/xla/service/cpu/cpu_executable.cc", "466": "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc", "467": "tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc", "468": "tensorflow/core/profiler/internal/tfprof_op.cc", "469": "tensorflow/compiler/jit/xla_cpu_device.cc", "470": "tensorflow/python/lib/core/ndarray_tensor.cc", "471": "tensorflow/compiler/xla/service/cpu/ir_emitter.cc", "472": 
"tensorflow/core/profiler/internal/tfprof_code.cc", "473": "tensorflow/compiler/xla/service/cpu/cpu_compiler.cc", "474": "tensorflow/core/profiler/internal/tfprof_stats.cc", "475": "tensorflow/core/profiler/internal/print_model_analysis.cc", "476": "tensorflow/compiler/aot/codegen.cc", "477": "tensorflow/compiler/aot/compile.cc", "478": "tensorflow/compiler/tf2xla/mlir_tf2xla.cc", "479": "tensorflow/python/eager/pywrap_tfe_src.cc", "480": "tensorflow/compiler/mlir/lite/utils/lstm_utils.cc", "481": "tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc", "482": "tensorflow/compiler/mlir/lite/transforms/dilated_conv.cc", "483": "tensorflow/lite/delegates/nnapi/quant_lstm_sup.cc", "484": "tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc", "485": "tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc", "486": "tensorflow/lite/delegates/nnapi/nnapi_delegate.cc", "487": "tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc", "488": "tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc", "489": "tensorflow/compiler/mlir/lite/transforms/optimize.cc", "490": "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc", "491": "tensorflow/lite/tools/verifier.cc", "492": "tensorflow/lite/tools/optimize/model_utils.cc", "493": "tensorflow/lite/tools/optimize/quantization_utils.cc", "494": "tensorflow/lite/tools/versioning/op_version.cc", "495": "tensorflow/lite/tools/versioning/runtime_version.cc", "496": "tensorflow/lite/tools/optimize/quantize_model.cc", "497": "tensorflow/lite/tools/optimize/quantize_weights.cc", "498": "tensorflow/lite/python/optimize/calibration_wrapper.cc", "499": "tensorflow/lite/toco/tflite/import.cc", "500": "tensorflow/compiler/mlir/lite/flatbuffer_import.cc", "501": "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc", "502": "tensorflow/compiler/mlir/lite/flatbuffer_export.cc", "503": "tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc", "504": "tensorflow/lite/experimental/microfrontend/lib/fft.cc", "505": "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc", "506": "tensorflow/core/platform/default/logging.cc", "507": "tensorflow/core/platform/default/logging.cc", "508": "tensorflow/core/platform/protobuf.cc", "509": "tensorflow/core/framework/allocator_registry.cc", "510": "tensorflow/core/framework/cpu_allocator_impl.cc", "511": "tensorflow/core/kernels/lookup_util.cc", "512": "tensorflow/lite/delegates/nnapi/nnapi_delegate.cc", "513": "tensorflow/core/framework/allocator_registry.cc", "514": "tensorflow/core/common_runtime/lower_case_op.cc", "515": "tensorflow/core/data/service/compression_utils.cc", "516": "tensorflow/core/kernels/data/prefetch_autotuner.cc", "517": "tensorflow/lite/tools/versioning/runtime_version.cc", "518": "tensorflow/compiler/xla/service/hlo_evaluator.cc", "519": "tensorflow/compiler/tf2xla/xla_resource.cc", "520": "tensorflow/core/kernels/stack.cc", "521": "tensorflow/core/profiler/internal/parse_annotation.cc", "522": "tensorflow/core/framework/cpu_allocator_impl.cc", "523": "tensorflow/lite/experimental/microfrontend/lib/fft.cc", "524": "tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc", "525": "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc", "526": "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc"} \ No newline at end of file diff --git a/tensorflow/core/data/compression_utils.cc b/tensorflow/core/data/compression_utils.cc index d132bdca8da..2ab51712580 100644 --- 
a/tensorflow/core/data/compression_utils.cc +++ b/tensorflow/core/data/compression_utils.cc @@ -116,7 +116,7 @@ Status UncompressElement(const CompressedElement& compressed, compressed_data.data(), compressed_data.size(), &uncompressed_size)) { return errors::Internal("Could not get snappy uncompressed length"); } - if (uncompressed_size != total_size) { + if (uncompressed_size != static_cast(total_size)) { return errors::Internal( "Uncompressed size mismatch. Snappy expects ", uncompressed_size, " whereas the tensor metadata suggests ", total_size); diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc index 520346b0166..c8015a6e50c 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc @@ -598,7 +598,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage { std::deque add_ops; // Prepare leaf AddN nodes for inputs of equal shape - for (int i = 0; i < shapes.size(); ++i) { + for (int i = 0, iter_limit = shapes.size(); i < iter_limit; ++i) { const auto node_name = leaf_node_name(i); const auto& inputs = shape_sig_to_inputs[ShapeSignature(shapes[i])]; add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node, @@ -750,7 +750,7 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage { ctx().node_map->AddOutput(new_add_node->name(), new_outer_node->name()); // Hoist non-shared factors up into the new AddN node. - for (int i = 0; i < unique_factors.size(); ++i) { + for (int i = 0, iter_limit = unique_factors.size(); i < iter_limit; ++i) { const string& unique_factor_i = unique_factors[i]; new_add_node->set_input(i, unique_factor_i); ctx().node_map->AddOutput(unique_factor_i, new_add_node->name()); @@ -1190,7 +1190,7 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage { if (a.size() != b.size()) { return false; } - for (int i = 0; i < a.size(); ++i) { + for (int i = 0, iter_limit = a.size(); i < iter_limit; ++i) { if (a[b[i]] != i) { return false; } @@ -1199,7 +1199,7 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage { } bool IsIdentityPermutation(const std::vector& perm) { - for (int64 i = 0; i < perm.size(); ++i) { + for (int64 i = 0, iter_limit = perm.size(); i < iter_limit; ++i) { if (i != perm[i]) { return false; } @@ -1500,7 +1500,8 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage { for (int i = start; i < end; ++i) { unique_inputs.insert(node.input(i)); } - return unique_inputs.size() == n; + int unique_input_size = unique_inputs.size(); + return unique_input_size == n; } // Returns the length of the common unary chain of ops that can be @@ -3248,14 +3249,15 @@ class RemoveStackSliceSameAxis : public ArithmeticOptimizerStage { slice_begin_vec.size(), ") and size (", slice_size_vec.size(), ") vectors."); } + int slice_begin_vec_size = slice_begin_vec.size(); if (!pack_output_shape.unknown_rank() && - slice_begin_vec.size() != pack_output_shape.dims()) { + slice_begin_vec_size != pack_output_shape.dims()) { return Status::OK(); } - if (pack_axis >= slice_begin_vec.size()) { + if (pack_axis >= slice_begin_vec_size) { return errors::InvalidArgument( "Input to node ", node->name(), " had pack_axis ", pack_axis, - " but rank was ", slice_begin_vec.size(), "."); + " but rank was ", slice_begin_vec_size, "."); } *slice_start_value = slice_begin_vec[pack_axis]; @@ -3264,7 +3266,7 @@ class RemoveStackSliceSameAxis : public 
ArithmeticOptimizerStage { return Status::OK(); } - for (size_t i = 0; i < slice_begin_vec.size(); ++i) { + for (int i = 0; i < slice_begin_vec_size; ++i) { if (i != pack_axis) { if (slice_begin_vec[i] != 0 || !(slice_size_vec[i] == -1 || @@ -3352,7 +3354,7 @@ class RemoveStackSliceSameAxis : public ArithmeticOptimizerStage { int begin_index = -1; int64 begin_value = 0; - for (int i = 0; i < slice_begin_vec.size(); ++i) { + for (int i = 0, iter_limit = slice_begin_vec.size(); i < iter_limit; ++i) { const int64 v = slice_begin_vec[i]; if (v != 0) { if (begin_index != -1) { @@ -3366,7 +3368,7 @@ class RemoveStackSliceSameAxis : public ArithmeticOptimizerStage { int end_index = -1; int64 end_value = 0; - for (int i = 0; i < slice_end_vec.size(); ++i) { + for (int i = 0, iter_limit = slice_begin_vec.size(); i < iter_limit; ++i) { const int64 v = slice_end_vec[i]; if (v != pack_output_shape.dim_size(i)) { if (end_index != -1) { diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index d912eb7857b..f42340f9d09 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -479,7 +479,7 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) { CHECK_EQ(op, "ShapeN"); CHECK_EQ(input.size(), output.size()); const NodeDef* const shape_n_node = node; - for (int port_idx = 0; port_idx < output.size(); ++port_idx) { + for (int port_idx = 0, idx_limit = output.size(); port_idx < idx_limit; ++port_idx) { const DataType type = output[port_idx].dtype(); CHECK(type == DT_INT32 || type == DT_INT64); const PartialTensorShape shape(input[port_idx].shape()); @@ -641,12 +641,12 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs( // These extra dims could be equal to 1, in which case there is no // broadcasting. It could also be greater than 1, in which case there would // be broadcasting. Since we don't know, we'll just punt. - for (int i = common_dims; i < shape1.size(); ++i) { + for (int i = common_dims, iter_limit = shape1.size(); i < iter_limit; ++i) { if (shape1[i] < 0) { return Status::OK(); } } - for (int i = common_dims; i < shape2.size(); ++i) { + for (int i = common_dims, iter_limit = shape2.size(); i < iter_limit; ++i) { if (shape2[i] < 0) { return Status::OK(); } @@ -1165,7 +1165,7 @@ bool IsValidConstShapeForMulConvPushDown( // If the const is a scalar, or it has fewer or same number of dimensions // than the filter and it only has single element, the optimization should // work. 
- if (mul_const_input_shape.dim_size() <= data_format.size() && + if (mul_const_input_shape.dim_size() <= static_cast<int>(data_format.size()) && TensorShape(mul_const_input_shape).num_elements() == 1) { return true; } @@ -1461,7 +1461,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph, VLOG(2) << "Folded node: " << SummarizeNodeDef(*node); NodeDef* constant_output = nullptr; - for (int i = 0; i < const_nodes.size(); i++) { + for (int i = 0, iter_limit = const_nodes.size(); i < iter_limit; i++) { NodeDef* const_node = &const_nodes[i]; VLOG(3) << "Generated constant node: " << SummarizeNodeDef(*const_node); if (const_node->name().empty()) { @@ -1549,7 +1549,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph, constant_output->name()); *output->mutable_input(i) = AsControlDependency(*constant_output); } - } else if (port < const_nodes.size() && + } else if (port < static_cast<int>(const_nodes.size()) && !const_nodes[port].name().empty()) { // Replace alive outputs with the corresponding constant. node_map_->UpdateInput(output->name(), NodeName(output->input(i)), @@ -2068,7 +2068,8 @@ Status ConstantFolding::RemoveShuffleOrTranspose( permutation.push_back(permutation_tensor.vec<int>()(j)); } } - if (permutation.size() != shape.dim_size()) { + int permutation_size = permutation.size(); + if (permutation_size != shape.dim_size()) { // Number of elements in perm should be same as dim_size. Skip if not. return Status::OK(); } @@ -2245,9 +2246,10 @@ Status ConstantFolding::SimplifyStridedSlice(const GraphProperties& properties, // as many as expanded_ellipsis_indices.size() axes during computation. // We need to subtract this number from j. int i = j; + int expanded_ellipsis_indices_size = expanded_ellipsis_indices.size(); if (ellipsis_index != -1 && - j >= ellipsis_index + expanded_ellipsis_indices.size()) { - i = j - expanded_ellipsis_indices.size(); + j >= ellipsis_index + expanded_ellipsis_indices_size) { + i = j - expanded_ellipsis_indices_size; } int b = begin.dtype() == DT_INT32 ? begin.vec<int>()(i) : begin.vec<int64>()(i); @@ -3479,15 +3481,16 @@ bool ConstantFolding::PartialAssocOpConstFolding(GraphDef* optimized_graph, } // Promote AccumulateNV2 with all constant inputs to AddN, since it is // a fake node that cannot be constant folded by itself.
- if (const_inputs.size() == num_non_control_inputs && + int const_inputs_size = const_inputs.size(); + if (const_inputs_size == num_non_control_inputs && node->op() == "AccumulateNV2") { node->set_op("AddN"); node->mutable_attr()->erase("shape"); return true; } const string new_node_name = OptimizedNodeName( - *node, strings::StrCat("_partial_split_", const_inputs.size())); - if (const_inputs.size() > 1 && const_inputs.size() < num_non_control_inputs && + *node, strings::StrCat("_partial_split_", const_inputs_size)); + if (const_inputs_size > 1 && const_inputs_size < num_non_control_inputs && !node_map_->NodeExists(new_node_name)) { NodeDef* added_node = optimized_graph->add_node(); *added_node = *node; diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc index ab7d8fcd6cf..63239082134 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc @@ -242,7 +242,7 @@ Status Transposer::CreateConstPermNode(TransposeContext* context, AttrValue attr_tensor; Tensor tensor(DT_INT32, TensorShape({4})); - for (int i = 0; i < permutation.size(); i++) { + for (int i = 0, iter_limit = permutation.size(); i < iter_limit; i++) { tensor.flat()(i) = permutation[i]; } tensor.AsProtoTensorContent(attr_tensor.mutable_tensor()); @@ -538,10 +538,11 @@ bool Transposer::IsFaninPortDimsNIfConst(const utils::MutableNodeView& node, if (!tensor.FromProto(value_attr->tensor())) { return false; } - if (tensor.dims() != dims.size()) { + int dims_size = dims.size(); + if (tensor.dims() != dims_size) { return false; } - for (int i = 0; i < dims.size(); ++i) { + for (int i = 0; i < dims_size; ++i) { if (tensor.dim_size(i) != dims[i]) { return false; } @@ -863,12 +864,13 @@ inline bool IsValidConstPermTransposeNode(const utils::MutableNodeView& node, if (!GetValueAttrFromConstInputNode(node, IsTranspose, 1, &tensor)) { return false; } - if (tensor.NumElements() != permutation.size()) { + int permutation_size = permutation.size(); + if (tensor.NumElements() != permutation_size) { return false; } const auto& tensor_data = tensor.unaligned_flat(); - for (int i = 0; i < permutation.size(); i++) { + for (int i = 0; i < permutation_size; i++) { if (permutation[i] != tensor_data(i)) { return false; } @@ -1229,10 +1231,11 @@ bool ReduceTransposer::KeepDims(const utils::MutableNodeView& node) { bool ReduceTransposer::IsAlongAxis(const Tensor& tensor, absl::Span axis, int rank) { - if (tensor.dims() != 1 || tensor.dim_size(0) != axis.size()) { + int axis_size = axis.size(); + if (tensor.dims() != 1 || tensor.dim_size(0) != axis_size) { return false; } - for (int i = 0; i < axis.size(); ++i) { + for (int i = 0; i < axis_size; ++i) { int local_axis = tensor.flat()(i); if (local_axis < 0) { local_axis += rank; @@ -1444,12 +1447,13 @@ bool SqueezeTransposer::IsAlongAxis(const AttrValue& attr, int rank) const { const auto& list = attr.list(); // If list is empty, Squeeze op will squeeze all dimensions of size 1. 
+ int axis_size = axis.size(); if (list.i_size() == 0) { return true; - } else if (list.i_size() != axis.size()) { + } else if (list.i_size() != axis_size) { return false; } - for (int i = 0; i < axis.size(); ++i) { + for (int i = 0; i < axis_size; ++i) { int local_axis = list.i(i); if (local_axis < 0) { local_axis += rank; @@ -1563,7 +1567,7 @@ Status StridedSliceTransposer::PermuteMask(TransposeContext* context, return errors::InvalidArgument("invalid mask value: ", mask_i); } int result = 0; - for (int i = 0; i < context->src_to_dst.size(); i++) { + for (int i = 0, iter_limit = context->src_to_dst.size(); i < iter_limit; i++) { const int final_pos = context->src_to_dst[i]; const int position_mask = 1 << final_pos; const int bit_i = (mask_i & position_mask) >> final_pos; diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h index b518c32d8ec..bb00e965872 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h @@ -528,11 +528,12 @@ template <typename T> Status PermuteSingle(absl::string_view location, absl::Span<const int> permutation, T* values) { DCHECK(values != nullptr); - if (values->size() != permutation.size()) { + int permutation_size = permutation.size(); + if (values->size() != permutation_size) { return Status(tensorflow::error::Code::INVALID_ARGUMENT, absl::StrCat("Size of values ", values->size(), " does not match size of permutation ", - permutation.size(), " @ ", location)); + permutation_size, " @ ", location)); } typedef typename T::value_type V; std::vector<V> elements(values->begin(), values->end()); @@ -549,11 +550,12 @@ template <typename T> Status PermuteDouble(absl::string_view location, absl::Span<const int> permutation, T* values) { DCHECK(values != nullptr); - if (values->size() != permutation.size() * 2) { + int permutation_size = permutation.size(); + if (values->size() != permutation_size * 2) { return Status(tensorflow::error::Code::INVALID_ARGUMENT, absl::StrCat("Size of values ", values->size(), " does not match twice the size of permutation ", - permutation.size(), " @ ", location)); + permutation_size, " @ ", location)); } typedef typename T::value_type V; std::vector<V> elements(values->begin(), values->end()); diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc index 97033a180a6..4e955db2f5a 100644 --- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc +++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc @@ -58,7 +58,7 @@ Status GetTensorProperties(const GraphOptimizerContext& ctx, const auto& output_properties = ctx.graph_properties->GetOutputProperties(tensor_id.node()); - auto num_outputs = output_properties.size(); + int num_outputs = output_properties.size(); if (num_outputs == 0 || tensor_id.index() > num_outputs - 1) { return errors::InvalidArgument( diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc index 2b0a27aaa2d..51d61cfef2e 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc @@ -130,7 +130,7 @@ string FindForwardNode(utils::MutableNodeView* backward_node) { void UpdateForwardIdentityNodeDtype(utils::MutableNodeView* forward_node, const DataTypeVector& dtypes) { const auto& fanouts_vector
= forward_node->GetRegularFanouts(); - for (int pos = 0; pos < fanouts_vector.size(); ++pos) { + for (int pos = 0, pos_limit = fanouts_vector.size(); pos < pos_limit; ++pos) { const auto& fanouts_at_pos = fanouts_vector[pos]; for (const auto& fanout : fanouts_at_pos) { if ("Identity" == fanout.node_view()->GetOp()) { diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc index 867433dcff5..c6fd2b48ac3 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc @@ -468,7 +468,7 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level, // with "gradients/" or contains "/gradients/". return absl::StartsWith(node.name(), recomputation_targets_name_scope) || - node.name().find("/" + recomputation_targets_name_scope) != -1; + static_cast(node.name().find("/" + recomputation_targets_name_scope)) != -1; }; if (optimization_level == RewriterConfig::RECOMPUTATION_HEURISTICS || diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc index 464a2c17197..7a0079d2b4c 100644 --- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc @@ -73,7 +73,8 @@ bool HasOpName(const string& node_name, const string& op_name) { Status GetOutputDataType( const std::vector& output_props, int output_index, DataType* dtype) { - if (output_index >= output_props.size()) { + int output_props_size = output_props.size(); + if (output_index >= output_props_size) { return errors::Internal("Invalid output index ", output_index, " size of output_props ", output_props.size()); } @@ -520,7 +521,7 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter { // Add control edges from the ScopedAllocatorOp to all of the // input nodes and mark them for allocation from backing tensor. 
- for (int i = 0; i < inputs.size(); ++i) { + for (int i = 0, iter_limit = inputs.size(); i < iter_limit; ++i) { auto& nd = inputs[i]; if (IsArg(*nd.from_node_def)) { return errors::Internal( @@ -547,7 +548,7 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter { std::vector inputs_to_first; LOG_WARNING_AND_RETURN_IF_ERROR(GetDataInputs( graph, sa_opti->node_map(), nd.from_node_def, &inputs_to_first)); - for (int i = 0; i < inputs_to_first.size(); ++i) { + for (int i = 0, iter_limit = inputs_to_first.size(); i < iter_limit; ++i) { if (fanout.find(inputs_to_first[i].from_node_def) != fanout.end()) { VLOG(2) << "Found node " << inputs_to_first[i].from_node_def->name() << " in the fanout of " << sa_name; @@ -587,7 +588,7 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter { VLOG(2) << "BuildSAConcatNode " << sac_name; // control input: edge name -> source node name absl::flat_hash_map sac_ctl_inputs; - for (int i = 0; i < ops.size(); ++i) { + for (int i = 0, iter_limit = ops.size(); i < iter_limit; ++i) { NodeDef* old_op = ops[i]; for (const string& old_op_input : old_op->input()) { int position = 0; @@ -708,7 +709,7 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter { const std::set& op_instance_names, const string& op_name, const string& sas_name) { VLOG(2) << "RewireSubgraph"; - for (int op_idx = 0; op_idx < ops.size(); ++op_idx) { + for (int op_idx = 0, idx_limit = ops.size(); op_idx < idx_limit; ++op_idx) { NodeDef* old_op = ops[op_idx]; // Copy the output node set since we'll be modifying the version // maintained by NodeMap in the loop. diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc index 69de1cde4ca..656c1a1db1c 100644 --- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc @@ -99,7 +99,8 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } const auto& prop = properties.GetOutputProperties(reduce_indices.node->name()); - if (prop.size() <= reduce_indices.port_id) { + int prop_size = prop.size(); + if (prop_size <= reduce_indices.port_id) { continue; } const TensorShapeProto& reduction_indices_shape = diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc index 3a16f1018dd..eeb1ffd5ad0 100644 --- a/tensorflow/core/kernels/data/single_threaded_executor.cc +++ b/tensorflow/core/kernels/data/single_threaded_executor.cc @@ -51,8 +51,8 @@ class SingleThreadedExecutorImpl : public Executor { std::vector ordered_nodes; ordered_nodes.reserve(graph.num_nodes()); GetReversePostOrder(graph, &ordered_nodes); - - if (ordered_nodes.size() != graph.num_nodes()) { + int ordered_nodes_size = ordered_nodes.size(); + if (ordered_nodes_size != graph.num_nodes()) { return errors::InvalidArgument("Graph had ", graph.num_nodes(), " but reverse post-order had ", ordered_nodes.size()); diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc index 196c2fe95a3..48041526022 100644 --- a/tensorflow/core/kernels/initializable_lookup_table.cc +++ b/tensorflow/core/kernels/initializable_lookup_table.cc @@ -74,7 +74,7 @@ Status InitializableLookupTable::Initialize(InitTableIterator& iter) { Status InitializableLookupTable::AreEntriesSame(const InitTableIterator& iter, bool* result) { - *result = iter.total_size() == size(); + *result 
= static_cast(iter.total_size()) == size(); return Status::OK(); } diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc index b57d6163cac..e3f220b9ff4 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc @@ -431,7 +431,8 @@ RemoteFusedGraphExecuteUtils::AddOutputTensorShapeTypeByTensorShapeMap( if (data_types.empty()) { return false; } - CHECK(data_types.size() > port); + int data_types_size = data_types.size(); + CHECK(data_types_size > port); *data_type = data_types.at(port); *shape = shapes.at(port); return true; @@ -788,7 +789,8 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode( ++input_count; } } - CHECK(input_count == 0 || input_count == node->in_edges().size()) + int node_in_edges_size = node->in_edges().size(); + CHECK(input_count == 0 || input_count == node_in_edges_size) << "Invalid input_count(" << input_count << ", " << node->in_edges().size() << ") " << node_name; @@ -968,10 +970,10 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode( border_inputs, border_outputs, require_shape_type, &graph, &fused_node)); for (const Node* node : graph.nodes()) { - for (int i = 0; i < node->num_inputs(); ++i) { + for (int i = 0, iter_limit = node->num_inputs(); i < iter_limit; ++i) { const Edge* edge = nullptr; TF_RETURN_IF_ERROR(node->input_edge(i, &edge)); - for (int j = 0; j < border_outputs.size(); ++j) { + for (int j = 0, second_iter_limit = border_outputs.size(); j < second_iter_limit; ++j) { const string& output = border_outputs.at(j); const TensorId tid = ParseTensorName(output); const string output_name(tid.first); @@ -1333,8 +1335,9 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments( /* static */ Status RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor( const void* src_ptr, const int src_size, Tensor* tensor) { - CHECK(tensor->TotalBytes() >= src_size) - << tensor->TotalBytes() << ", " << src_size; + int tensor_TotalBytes = tensor->TotalBytes(); + CHECK(tensor_TotalBytes >= src_size) + << tensor_TotalBytes << ", " << src_size; void* dst_ptr; switch (tensor->dtype()) { case DT_FLOAT: diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc index a30729c3b2f..dd20902a26d 100644 --- a/tensorflow/core/kernels/stack.cc +++ b/tensorflow/core/kernels/stack.cc @@ -57,7 +57,8 @@ class Stack : public ResourceBase { Status Push(const TensorAndAllocation& value) { mutex_lock l(mu_); TF_RETURN_IF_ERROR(CheckNotClosed()); - if (max_size_ >= 0 && stack_.size() >= max_size_) { + int stack_size = stack_.size(); + if (max_size_ >= 0 && stack_size >= max_size_) { return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ", "its max_size (", max_size_, ")"); } diff --git a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc index 9daf9a2cef7..c40fea7c61c 100644 --- a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc +++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc @@ -336,7 +336,7 @@ Status LoadOpShapeFunction::operator()( }); std::vector inputs(user_param_count); int input_index = 0; - for (int i = 0; i < state_variable_specs.size(); ++i) { + for (int i = 0, iter_limit = state_variable_specs.size(); i < iter_limit; ++i) { if (state_variable_specs[i].has_user_defined() || is_debug_op_) { std::vector input_temp; 
TF_RETURN_IF_ERROR(c->input(state_variable_specs[i].name(), &input_temp)); @@ -388,7 +388,7 @@ Status RetrieveOpShapeFunction::operator()( TF_RETURN_IF_ERROR(c->GetAttr("num_shards", &num_shards)); int shard_id; TF_RETURN_IF_ERROR(c->GetAttr("shard_id", &shard_id)); - for (int j = 0; j < state_variable_specs.size(); ++j) { + for (int j = 0, iter_limit = state_variable_specs.size(); j < iter_limit; ++j) { if (state_variable_specs[j].has_user_defined() || is_debug_op_) { auto shape = c->MakeShape( std::vector(2, c->UnknownDim())); diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h index 7bb8ea18ad3..a79a3f08622 100644 --- a/tensorflow/core/util/bcast.h +++ b/tensorflow/core/util/bcast.h @@ -139,7 +139,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], if (x[i] != x[0]) { all_equal = false; } - if (x[i].size() > largest_rank) { + if ( static_cast(x[i].size()) > largest_rank) { largest_rank = x[i].size(); } } @@ -176,7 +176,7 @@ BCastList::BCastList(const BCastList::Vec (&x)[N], // 1-extend and align all vectors. for (int i = 0; i < N; ++i) { - if (copy[i].size() < largest_rank) { + if (static_cast(copy[i].size()) < largest_rank) { copy[i].resize(largest_rank, 1); } } From 2399d37889c044fb6fb52fb3abc2bb5d41e64a28 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 22 Jun 2020 19:18:54 -0700 Subject: [PATCH 0854/1390] Create `RELEASE.md` stub. Now, every PR and commit can add to the release notes before the branch is cut for a new release. PiperOrigin-RevId: 317783211 Change-Id: If65da4b956f83940c7bd539d40f915cfd9af4db0 --- RELEASE.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index f93626cc876..68d9399676a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,57 @@ +# Release 2.4.0 + + + +## Breaking Changes + +* +* + +## Known Caveats + +* + +## Major Features and Improvements + +* +* + +## Bug Fixes and Other Changes + +* +* +* +* TF Core: + * +* `tf.data`: + * +* `tf.distribute`: + * +* `tf.keras`: + * +* `tf.function`/AutoGraph: + * +* `tf.lite`: + * +* `tf.random`: + * +* Math and Linear Algebra: + * +* TPU Enhancements: + * +* XLA Support: + * +* Tracing and Debugging: + * +* Other: + * + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +, , , , , + # Release 2.3.0 ## Breaking Changes From 821d1b087be221aa3c58be8e077d4553d6851ef2 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Mon, 22 Jun 2020 19:30:34 -0700 Subject: [PATCH 0855/1390] Add element tracing for parallel_map to track exactly which upstream event produces an element for each parallel_map's GetNext call. 
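The producer and the consumer of each buffered element annotate their work with `profiler::TraceMe` events that share the same `element_id`, so a trace viewer can tell which background invocation produced the element returned by a given `GetNext` call. A simplified sketch of the pattern, with the surrounding iterator state elided (the actual change is in `parallel_map_dataset_op.cc` below):

```c++
#include <cstdint>

#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/profiler/lib/traceme_encode.h"

// Background thread: runs the map function for element `id`.
void ProduceElement(int64_t id) {
  tensorflow::profiler::TraceMe traceme([&] {
    return tensorflow::profiler::TraceMeEncode("ParallelMapProduce",
                                               {{"element_id", id}});
  });
  // ... invoke the captured map function and buffer the result ...
}

// GetNext(): returns the buffered element with the same `id`; the matching
// "element_id" argument lets the profiler link this event to the produce
// event recorded above.
void ConsumeElement(int64_t id) {
  tensorflow::profiler::TraceMe traceme([&] {
    return tensorflow::profiler::TraceMeEncode("ParallelMapConsume",
                                               {{"element_id", id}});
  });
  // ... hand the buffered tensors back to the caller ...
}
```

(`ProduceElement`/`ConsumeElement` are illustrative names only; in the real code the two TraceMe annotations live in `CallFunction` and `GetNextInternal`.)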
PiperOrigin-RevId: 317784362 Change-Id: I981692f8d152ab8b6c6c343d01563800a8f94600 --- tensorflow/core/kernels/data/BUILD | 2 ++ .../kernels/data/parallel_map_dataset_op.cc | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 0972dc83ccf..74283b63b67 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -470,6 +470,8 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", ], ) diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc index bae90549841..1dc27ce6635 100644 --- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc @@ -29,6 +29,8 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/stringprintf.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/core/protobuf/error_codes.pb.h" namespace tensorflow { @@ -241,6 +243,10 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { RecordStop(ctx); result->notification.WaitForNotification(); RecordStart(ctx); + profiler::TraceMe traceme([&] { + return profiler::TraceMeEncode("ParallelMapConsume", + {{"element_id", result->id}}); + }); return ProcessResult(ctx, result, out_tensors, end_of_sequence); } @@ -358,10 +364,14 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { private: struct InvocationResult { + InvocationResult() {} + explicit InvocationResult(int64 id) : id(id) {} + Notification notification; Status status; std::vector return_values; bool end_of_input; + int64 id; }; void CancelThreads(bool wait) TF_LOCKS_EXCLUDED(mu_) { @@ -402,6 +412,10 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { void CallFunction(const std::shared_ptr& ctx, const std::shared_ptr& result) TF_LOCKS_EXCLUDED(*mu_) { + profiler::TraceMe traceme([&] { + return profiler::TraceMeEncode("ParallelMapProduce", + {{"element_id", result->id}}); + }); // Get the next input element. std::vector input_element; result->status = input_impl_->GetNext(ctx.get(), &input_element, @@ -490,6 +504,8 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { return num_calls_ >= num_parallel_calls || invocation_results_.size() >= num_parallel_calls; }; + // Counts the total number of calls to use as an id of InvocationResult. 
+ int64 num_total_calls = 0; while (true) { { mutex_lock l(*mu_); @@ -502,7 +518,8 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { return; } while (!busy()) { - invocation_results_.push_back(std::make_shared()); + invocation_results_.push_back( + std::make_shared(num_total_calls++)); new_calls.push_back(invocation_results_.back()); num_calls_++; } From 01fdbb866b4350acd74370d0a4b182feb4a056d8 Mon Sep 17 00:00:00 2001 From: rahul-kamat Date: Tue, 23 Jun 2020 02:46:05 +0000 Subject: [PATCH 0856/1390] Remove imports from python_op_gen_test --- tensorflow/python/framework/python_op_gen_test.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen_test.cc b/tensorflow/python/framework/python_op_gen_test.cc index 2561195c407..cf6566ea7ae 100644 --- a/tensorflow/python/framework/python_op_gen_test.cc +++ b/tensorflow/python/framework/python_op_gen_test.cc @@ -20,9 +20,6 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" - namespace tensorflow { namespace { From b16994e2d646df982c2c8614a513742ac06c3cf0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jun 2020 20:27:46 -0700 Subject: [PATCH 0857/1390] Add element tracing for parallel_map to track exactly which upstream event produces an element for each parallel_map's GetNext call. PiperOrigin-RevId: 317789815 Change-Id: Ic8a50a3a311051069ed25973b33cd2b0b9bbece5 --- tensorflow/core/kernels/data/BUILD | 2 -- .../kernels/data/parallel_map_dataset_op.cc | 19 +------------------ 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 74283b63b67..0972dc83ccf 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -470,8 +470,6 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core/profiler/lib:traceme_encode", ], ) diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc index 1dc27ce6635..bae90549841 100644 --- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc @@ -29,8 +29,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/stringprintf.h" -#include "tensorflow/core/profiler/lib/traceme.h" -#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/core/protobuf/error_codes.pb.h" namespace tensorflow { @@ -243,10 +241,6 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { RecordStop(ctx); result->notification.WaitForNotification(); RecordStart(ctx); - profiler::TraceMe traceme([&] { - return profiler::TraceMeEncode("ParallelMapConsume", - {{"element_id", result->id}}); - }); return ProcessResult(ctx, result, out_tensors, end_of_sequence); } @@ -364,14 +358,10 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { private: struct InvocationResult { - InvocationResult() {} - explicit InvocationResult(int64 id) : id(id) {} - Notification notification; Status status; std::vector return_values; bool end_of_input; - int64 id; }; void CancelThreads(bool wait) TF_LOCKS_EXCLUDED(mu_) { @@ -412,10 +402,6 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { void CallFunction(const std::shared_ptr& ctx, const std::shared_ptr& result) TF_LOCKS_EXCLUDED(*mu_) { - profiler::TraceMe traceme([&] { - return profiler::TraceMeEncode("ParallelMapProduce", - {{"element_id", result->id}}); - }); // Get the next input element. std::vector input_element; result->status = input_impl_->GetNext(ctx.get(), &input_element, @@ -504,8 +490,6 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { return num_calls_ >= num_parallel_calls || invocation_results_.size() >= num_parallel_calls; }; - // Counts the total number of calls to use as an id of InvocationResult. - int64 num_total_calls = 0; while (true) { { mutex_lock l(*mu_); @@ -518,8 +502,7 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase { return; } while (!busy()) { - invocation_results_.push_back( - std::make_shared(num_total_calls++)); + invocation_results_.push_back(std::make_shared()); new_calls.push_back(invocation_results_.back()); num_calls_++; } From 3ed1e3029e68ca8cb6306c8f31182306741dcf0c Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Mon, 22 Jun 2020 20:37:38 -0700 Subject: [PATCH 0858/1390] Remove static_assert for type checking in FlatBufferVectorToTfLiteTypeArray. It turns out that std::is_same() has dropped the non-string argument in c++17. This breaks internal users that are building against qualcomm. PiperOrigin-RevId: 317790812 Change-Id: If56a61d20426670251b55f370a6b5fa886a49e21 --- tensorflow/lite/micro/micro_allocator.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index bf9e38d1050..239a23335a6 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -401,12 +401,9 @@ TfLiteStatus FlatBufferVectorToTfLiteTypeArray( kTfLiteArrayType** result) { TFLITE_DCHECK(error_reporter != nullptr); TFLITE_DCHECK(flatbuffer_array != nullptr); - // Only two conversions are supported - float and int32 - ensure that these - // match at compile time instead of duplicating functions here: - static_assert((std::is_same() && - std::is_same()) || - (std::is_same() && - std::is_same())); + // TODO(b/159668691): Consider adding type assertion or breaking this function + // into multiple functions for each type. std::is_same is c++11 and has a + // special updated constructor in c++17 that requires a string argument. 
if (FLATBUFFERS_LITTLEENDIAN) { // On little-endian machines, TfLite*Array happens to have the same memory // layout as flatbuffers:Vector, so we can From 0453c02f43b64e54caf1a3f001f0c01def38ba37 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Mon, 22 Jun 2020 21:36:57 -0700 Subject: [PATCH 0859/1390] Update tf.DeviceIndex description in TensorFlow MLIR ODS to match description in TensorFlow op registry (NFC). PiperOrigin-RevId: 317796799 Change-Id: Ic4d6ca197cce7c56e491f59814cf913dcd64713c --- tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 7f250392cb2..65ca3ea4dbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2378,6 +2378,13 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. def TF_DeviceIndexOp : TF_Op<"DeviceIndex", [NoSideEffect]> { let summary = "Return the index of device the op runs."; + let description = [{ +Given a list of device names, this operation returns the index of the device +this op runs. The length of the list is returned in two cases: +(1) Device does not exist in the given device list. +(2) It is in XLA compilation. + }]; + let arguments = (ins StrArrayAttr:$device_names ); From 79fc3fd8fcc4b2936ca630cec6898a83ee92f59b Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Mon, 22 Jun 2020 21:41:24 -0700 Subject: [PATCH 0860/1390] Skip data loading error in multi_worker_tutorial_test (the test does not aim to cover this). PiperOrigin-RevId: 317797271 Change-Id: I8336d7ffeda0836beef0a2d04e633614a44e7fa4 --- .../python/keras/distribute/multi_worker_tutorial_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py index 3f9ab18f89c..aafa00bcc9f 100644 --- a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import contextlib import os import re +import zipfile from absl.testing import parameterized import numpy as np from tensorflow.python import keras @@ -43,6 +44,8 @@ class MultiWorkerTutorialTest(parameterized.TestCase, test.TestCase): def skip_fetch_failure_exception(self): try: yield + except zipfile.BadZipfile as e: + self.skipTest('Data loading error: Bad magic number for file header.') except Exception as e: # pylint: disable=broad-except if 'URL fetch failure' in str(e): self.skipTest('URL fetch error not considered failure of the test.') From e3904ac3aef005326e126e8240eaf5bf23cb3b04 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Mon, 22 Jun 2020 22:01:28 -0700 Subject: [PATCH 0861/1390] Internal TPU library change. 
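The change lets guaranteed constants be passed to the compilation cache key as a raw `const Tensor*` plus an explicit count rather than only as an `OpInputList` (the corresponding C API field also widens to `size_t`). A condensed, illustrative sketch of the forwarding idea; the parameter lists are abbreviated here, and the full signatures are in `tpu_op_util.h` below:

```c++
// Illustrative only: most arguments are elided with /* ... */ markers.
TpuCompilationCacheKey CreateCompilationCacheKey(
    /* ..., */ const OpInputList& guaranteed_constants /*, ... */) {
  // Forward to the pointer + size overload so the key computation has a
  // single implementation regardless of how the constants are supplied.
  const Tensor* constants_ptr =
      guaranteed_constants.size() > 0 ? &guaranteed_constants[0] : nullptr;
  return CreateCompilationCacheKey(
      /* ..., */ constants_ptr, guaranteed_constants.size() /*, ... */);
}
```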
PiperOrigin-RevId: 317799072 Change-Id: Ia9b4fba3aa8292e5b089d5741f359eef9763df09 --- .../core/tpu/kernels/tpu_compile_c_api.h | 2 +- tensorflow/core/tpu/kernels/tpu_op_util.cc | 37 ++++++++++++++----- tensorflow/core/tpu/kernels/tpu_op_util.h | 7 ++++ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h index c101e489d56..eab53fe9da4 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h @@ -35,7 +35,7 @@ struct CompilationCacheKeyProperty { const char* mlir_module; const int32_t* device_ids; size_t device_ids_size; - int32_t guaranteed_constants_size; + size_t guaranteed_constants_size; uint64_t function_library_fingerprint; int32_t num_cores_per_replica; int32_t num_replicas; diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.cc b/tensorflow/core/tpu/kernels/tpu_op_util.cc index e2f717fea8b..31b0cc6c72d 100644 --- a/tensorflow/core/tpu/kernels/tpu_op_util.cc +++ b/tensorflow/core/tpu/kernels/tpu_op_util.cc @@ -24,12 +24,13 @@ namespace tpu { namespace { // Return fingerprint_in_metadata if it's not empty; otherwise read input tensor // data to compute the fingerprint. -std::string GuaranteedConstFingerprint( - const string& fingerprint_in_metadata, - const OpInputList& guaranteed_constants) { +std::string GuaranteedConstFingerprint(const string& fingerprint_in_metadata, + const Tensor* guaranteed_constants, + size_t guaranteed_constants_size) { if (fingerprint_in_metadata.empty()) { uint64_t fingerprint = 0; - for (const auto& constant : guaranteed_constants) { + for (size_t i = 0; i < guaranteed_constants_size; ++i) { + const Tensor& constant = guaranteed_constants[i]; fingerprint = TpuCompile_CreateGuaranteedConstFingerprint( fingerprint, constant.tensor_data().data(), constant.tensor_data().size()); @@ -86,9 +87,12 @@ std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { } } // namespace +// The `guaranteed_constants` must be passed as reference due to the lazy +// evaluation of `guaranteed_const_fingerprint()` callback. TpuCompilationCacheKey CreateCompilationCacheKey( absl::string_view function_name, uint64 function_library_fingerprint, - absl::string_view mlir_module, const OpInputList& guaranteed_constants, + absl::string_view mlir_module, const Tensor* guaranteed_constants, + size_t guaranteed_constants_size, const std::vector& dynamic_shapes, const TPUCompileMetadataProto& metadata, const TpuMeshStateInterface& mesh_state) { @@ -114,7 +118,7 @@ TpuCompilationCacheKey CreateCompilationCacheKey( mlir_module.data(), flattened_device_ids.data(), flattened_device_ids.size(), - guaranteed_constants.size(), + guaranteed_constants_size, function_library_fingerprint, metadata.num_cores_per_replica(), metadata.num_replicas(), @@ -128,7 +132,7 @@ TpuCompilationCacheKey CreateCompilationCacheKey( // Guaranteed constants can be different across sessions. Use session_handle // and guaranteed_const fingerprint to guarantee no collision. - if (guaranteed_constants.size() > 0) { + if (guaranteed_constants != nullptr && guaranteed_constants_size > 0) { key.has_guaranteed_const = true; key.session_handle = metadata.session_handle(); // Both `metadata` and `guaranteed_constants` lifetime are captured by @@ -136,16 +140,31 @@ TpuCompilationCacheKey CreateCompilationCacheKey( // managed through the `TPUCompileOpKernelImpl` that outlives the // lifetime of the compilation cache lookups. 
string fingerprint; - key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, + key.guaranteed_const_fingerprint = [&metadata, guaranteed_constants, + guaranteed_constants_size, fingerprint]() mutable { if (fingerprint.empty()) { fingerprint = GuaranteedConstFingerprint( - metadata.guaranteed_const_fingerprint(), guaranteed_constants); + metadata.guaranteed_const_fingerprint(), guaranteed_constants, + guaranteed_constants_size); } return fingerprint; }; } return key; } + +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const OpInputList& guaranteed_constants, + size_t guaranteed_constants_size, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state) { + return CreateCompilationCacheKey( + function_name, function_library_fingerprint, mlir_module, + (guaranteed_constants.size() > 0 ? &guaranteed_constants[0] : nullptr), + guaranteed_constants.size(), dynamic_shapes, metadata, mesh_state); +} } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.h b/tensorflow/core/tpu/kernels/tpu_op_util.h index 0a9657ca05e..bbaa05682e6 100644 --- a/tensorflow/core/tpu/kernels/tpu_op_util.h +++ b/tensorflow/core/tpu/kernels/tpu_op_util.h @@ -34,6 +34,13 @@ TpuCompilationCacheKey CreateCompilationCacheKey( const std::vector& dynamic_shapes, const TPUCompileMetadataProto& metadata, const TpuMeshStateInterface& mesh_state); +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const Tensor* guaranteed_constants, + size_t guaranteed_constants_size, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state); } // namespace tpu } // namespace tensorflow From ff4490f636c87bf9efacfca06bd1f70a61bcfdc9 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 22 Jun 2020 22:05:00 -0700 Subject: [PATCH 0862/1390] Config Core ML delegate provider correctly PiperOrigin-RevId: 317799599 Change-Id: I6eef7b9de910b2401cf24909d1e245dc972103f9 --- tensorflow/lite/tools/delegates/coreml_delegate_provider.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc index c6509618aee..7d88f04c8b4 100644 --- a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/lite/tools/delegates/delegate_provider.h" #include "tensorflow/lite/tools/evaluation/utils.h" #if defined(__APPLE__) +#include "TargetConditionals.h" #if TARGET_OS_IPHONE && !TARGET_IPHONE_SIMULATOR // Only enable metal delegate when using a real iPhone device. 
#define REAL_IPHONE_DEVICE From 40e860b7a46cf3768bd177b25b1e35750860f73f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 22 Jun 2020 22:21:55 -0700 Subject: [PATCH 0863/1390] Add a doc on the current short-term plan for XLA GPU CodeGen with MLIR PiperOrigin-RevId: 317801702 Change-Id: Idabeb7717b357622e9cbcc2d4352df6b4ce25373 --- .../compiler/mlir/g3doc/xla_gpu_codegen.md | 265 ++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md diff --git a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md new file mode 100644 index 00000000000..06c55abf1fa --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md @@ -0,0 +1,265 @@ +# MLIR CodeGen for XLA + + + +XLA operates on `HloInstruction` and performs many optimizations on this +representation, sharing a lot of these between targeted devices. As some point a +linear schedule is computed and the memory buffer is assigned to each value +statically. The device specific codegen operates by traversing this sequence and +calling "emitters" to generate a representation suitable for the device (for +example a single LLVM function per XLA computation on CPU, or a sequence of +"thunks" encapsulating GPU operations and possibly generated PTX when targeting +GPU). + +As a staging step, we're currently in the process of intercepting the process +right after XLA completes the buffer-assignment phase and emit instead an MLIR +module in the `lhlo` dialect. From there we perform the codegen using MLIR +components (Linalg, affine, and GPU dialect mainly) depending on the device. + +Below is the plan of record to incrementally migrate XLA/GPU by using `lhlo` as +the codegen input. + +## Tasks + + | Host | Device +------------- | ------------------------ | ------------------------ +Input format | HloInstruction* (Task 1) | HloInstruction* (Task 1) +Output format | xla::Thunk (Task 2) | LLVM IR (Task 3) + +* **Task 1** changes both host and device input format from HloInstruction* to + LHLO. +* **Task 2** changes output format of host from thunks to "some landing pad + for host" (see below). +* **Task 3** migrates device output from LLVM IR to some form of MLIR. It's + optional to this project, and see the section "Migrating Device LLVM IR" for + details. + +This project prioritizes having end-to-end runnable models with LHLO-emitters +enabled as much as possible. This implies that the following order list of +objectives by priority: + +* Make XLA/GPU runnable with LHLO emitters, with existing Thunks and emitters + unmodified. +* Eliminate the references to HloInstruction\* in LHLO, case by case: + * Switch a legacy emitter to an MLIR-based emitter (e.g. Linalg), or + * Mechanically translate the existing emitter to take MLIR representation + (migrate to Standard with GPU Dialect). + +## Migrating Thunks (Task 2) + +xla::gpu::Thunk is a data structure that: + +* Can be called into from the host (xla::gpu::Thunk::ExecuteOnStream()). +* Carries various data in its subclasses. +* Interacts with BufferAllocation::Slice and StreamExecutor. +* Launches kernels +* Calls into all runtime libraries. + +The cost of that includes: + +* Representing op-specific configuration data (e.g. convolution configs). +* Migrating op shape and operand shapes. +* Representing a tree of thunks (while, condition, etc). + +The migration work is independent from LHLO / emitter migration. 
Under limited +resources, it's prioritized behind LHLO / emitter migration. + +We have several choices on how to lower the host-side part from LHLO: + +* TFRT + * (Pro) great CUDA and HIP wrappers for use. + * (Pro) easy to implement library calls (cuDNN, cuBLAS, cuFFT, etc), as + TFRT ops are interpreted by C++ code. + * (Con) host side is under development and not tested. + * (Con) the JAX integration isn’t clear from a runtime point of view +* Jitted CPU code + * (Pro) great lower-ability. Create a few loops and conditions and it's + done. + * (Con) GPUDialect doesn't yet model chains/streams/asynchronicity/device + allocation. + * (Con) CUDA / HIP runtime support is minimal (toolkit path, version, + dynamic loading, etc). +* Existing (interpreting) XLA runtime + +Tentative conclusion: Use jitted CPU code during the transition, and optionally +adopt TFRT in the end. + +## Migrating Device LLVM IR (Task 3) + +An elemental emitter generates target op by filling it element by element. Each +output element depends on a set of elements from the operands. All elements are +described by combining the buffer with dynamic indices. It's sufficient to +describe almost all "math" ops, but for performance reasons only a large subset +of "math" ops are implemented directly in (Cpu|Gpu)ElementalIrEmitter. + +ElementalIrEmitter is unique in that: + +* A large portion of the code is shared between XLA/GPU and CPU. +* It represents a large portion of ops seen in models, including all + element-wise ops. +* Most fusions solely depend on ElementalIrEmitter. +* It's structurally simple, as it describes a data dependency DAG between op + elements and operand elements. +* It's mostly portable and high-level (e.g. unlike GPU kReduce and GPU kCopy). +* Dynamic shape support is easy for at least element-wise ops. + +Now, for all ops, elementally-emitted or not, there are several flavors of the +end state of each XLA op: + +1. Device code stays as LLVM IR. +1. Refactor the old emitter to be like LHLO -> MLIR LLVM Dialect: + * (Cost) Will be throw-away work if we want to ultimately migrate to + Standard. + * (Benefit) It is easy and mechanical. Can be done in a short period. + * (Benefit) It doesn't benefit more compared to a). +1. Refactor old emitters to be like LHLO -> MLIR GPU + Standard + Loops: + * (Cost) Lifting existing emitters to Standard introduces some challenges. + Pointers and GEPs need to be converted to MemRefs and SubViews. Ensuring + amdgpu completeness is another one. + * (Cost) XLA/GPU heavily relies on LLVM metadata: + * `range` for block/thread indices. + * `align`, `dereferenceable`, `invariant.load`, `alias.scope`, + `noalias` for load/stores. + * `llvm.loop.unroll.disable`, `llvm.loop.unroll.full`, + `llvm.loop.vectorize.enable` for sequential loops. + * (Benefit) Can be long-term. More portable. +1. Refactor old emitters to be LHLO -> Linalg, and write new Linalg emitters + * (Cost) This is case by case. Compared to previous options, a new + implementation that matches XLA's performance needs to go through the + benchmark <-> optimize workflow, which can be a significant cost for + some ops. + * (Benefit) unified stack; community support; portability; more + optimization potentials. + +## Prioritization + +While all three tasks mentioned above are parallelizable, under limited +resources they have to be serialized. The prioritization focuses on visible +results for completion of each task. 
+ +The prioritization is: Task1 (LHLO for legacy emitters) > Task 2 (Thunks) > Task +3 (MLIR emitters). + +By the end of Task 1, users of XLA can generate an LHLO (e.g. kernel generator) +and execute them. The compilation format will not be serializable MLIR. + +By the end of Task 2, LHLO lowers to proper, serializable MLIR. This enables +offline compilation. + +By the end of Task 3, all XLA emitters are MLIR-based in its implementation. + +## Detailed Design + +### Step 1: (Task 1) Complete LHLO and Make Legacy Emitters Take LHLO + +This step makes all existing XLA/GPU emitters interact with MLIR ops. This step +is pure refactoring and NFC. + +This step is mostly mechanical, but it's worth noticing the following +discrepancies between an unnested HloComputation and LHLO: + +* Each HloInstruction has direct access to its operands (a data-flow DAG). On + contrary, each LHLO op only has access to its operand buffers (a bipartite + between ops and buffers). LHLO ops have to go through use-def chains to + access their operand ops. +* Unnested legacy emitters empirically almost never access their operands. The + only exception is kReduce. +* Unnested legacy emitters access BufferAssignment only for getting slices, + not for accessing aux data structures like dataflow\_analysis() or + alias\_analysis(). llvm\_ir builds its own alias\_analysis() based on slice + information. + +The conclusion is that LHLO should fit right-in without major hassle. + +### Step 2: (Optional) Profiling Support + +**This step is only needed if we start to discard some of the XLA Thunk logic +(see the next step).** + +Before actually turning on any MLIR-based emitters, we need profiling for +MLIR-based emitters. + +Currently XLA performs its own profiling by calling into StreamExecutor's timer. +The timer under the hood inserts two events before and after a kernel launch, +and measures the sync time between these two events. + +There are roughly three approaches to support profiling in MLIR: + +* Run a profiler end-to-end +* Add a profile op for each op in LHLO, using an injected profiler. + +The "end-to-end" approach is transparent to MLIR, but suffers the same problem +that makes XLA not use it in the first place: library calls collected by a +profiler (nvprof/...) can't easily relate to HLO ops. For example, cuDNN +launches multiple kernels for each HLO, and it's hard to tell which kernels +correspond to which HLO. + +The "injected profiler" approach requires: + +* LHLO to take a profiler as a parameter. +* inserting profile.start / profile.end before and after each op. +* a pass from that lowers profile.{start,end} to a C++ implementation. + +The exact profiling can't be easily done for MLIR-generated ops, since: + +* MLIR doesn't have a timer, nor it depends on TFRT / StreamExecutor. +* MLIR doesn't easily call into C functions with complicated parameters. + +### Step 3: (Task 2) Migrating Thunks + +This step migrates all host ops and library calls. This step will eliminate most +of the thunks and produce serializable MLIR instead. + +There are roughly three kinds of thunks: + +* KernelThunk, which launches a kernel. +* Control flow thunks, which has host control flow logic (conditional, while, + for, sequence) and launch body kernels. +* Library thunks: cuDNN, cuBLAS, cuFFT, NCCL, etc. + +The **bottom line** is to: + +* Create a Thunk dialect that provides (de)serialize logic for all existing + C++-based Thunks. +* Change emitters to emit a graph of Thunk dialect. 
+ +**Optionally**, we can relieve some thunks from C++ implementation. KernelThunk +can lower to the GPU LaunchKernelOp. Control flow thunks can leverage the CFG +Dialect for loops and conditions, combined with LaunchKernelOp. This optional +step requires profiling and stream support. + +### Step 4: (Task 3) Migrated ElementalIrEmitter + +Once profiling is ready, we can complete and tune all ElementalIrEmitter-based +emitters in MLIR. Then we turn them on by default, assuming that all of these +MLIR-based emitters use a single stream. + +Notice that it's beneficial to migrate XLA/CPU's ElementalIrEmitter as well, +since they share a large portion of the code. + +With all benchmarking and performance hunting done (TODO: define performance +parity), we turn on the new MLIR-based elemental emitter, and delete the legacy +ElementalIrEmitter. + +This step also provides easy fusion transitions (nested ops) for the later +migration. + +### Step 5: Multi-Stream Support or Drop + +We can't delete +[some of the emitters](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/service/gpu/stream_assignment.cc#L140) +until we support it in MLIR, or we drop the feature. It's a relatively large +amount of work in MLIR and a small amount of gain for XLA. We should investigate +current users of multi-stream XLA/GPU users, and try to delete this feature if +reasonable. + +### Step 6: (Task 3) Migrated Device Ops + +This step migrates all unnested ops, then we can delete all unnested emitters. + +This calls on a rewrite/refactor for kCopy and kReduce. kReduce is already +worked on for plenty, so the actual amount of work that needs to be done remains +to be seen. From 934df8dcea0c176314a52d0062dcca08638bb52d Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Mon, 22 Jun 2020 22:29:15 -0700 Subject: [PATCH 0864/1390] Remove portable_optimized depthwise_conv implementation. PR #34999 surfaced inconsistencies between the reference and portable_optimized implementations and we are also slowly transitioning to CMSIS-NN for the optimized implementations for ARM Cortex-M targets. There may still be an issue with the reference depthwise conv + dilation > 1 but that would be a separate bug. PiperOrigin-RevId: 317802639 Change-Id: Ic8c9acffb060bb3ec5f802a2045ac6ac8b9f2233 --- tensorflow/lite/micro/BUILD | 24 - tensorflow/lite/micro/kernels/BUILD | 81 --- .../portable_optimized/depthwise_conv.cc | 515 ------------------ .../lite/micro/tools/ci_build/test_mbed.sh | 2 +- .../make/targets/apollo3evb_makefile.inc | 3 - 5 files changed, 1 insertion(+), 624 deletions(-) delete mode 100644 tensorflow/lite/micro/kernels/portable_optimized/depthwise_conv.cc diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index f63d9778634..bdfa0c909db 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -102,30 +102,6 @@ cc_library( ], ) -# TODO(b/144176795): This target should really be handled differently so that we -# do not have a fork in the build graph. The bug has some initial ideas. 
-cc_library( - name = "portable_optimized_op_resolver", - srcs = [ - "all_ops_resolver.cc", - "micro_mutable_op_resolver.h", - "micro_op_resolver.h", - ], - hdrs = [ - "all_ops_resolver.h", - ], - copts = micro_copts(), - deps = [ - ":micro_compatibility", - "//tensorflow/lite/c:common", - "//tensorflow/lite/core/api", - "//tensorflow/lite/kernels:op_macros", - "//tensorflow/lite/kernels/internal:compatibility", - "//tensorflow/lite/micro/kernels:portable_optimized_micro_ops", - "//tensorflow/lite/schema:schema_fbs", - ], -) - cc_library( name = "debug_log", srcs = [ diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index c7fa19b8cea..0fd0be4e3a4 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -20,7 +20,6 @@ package_group( packages = ["//tensorflow/lite/micro"], ) -# LINT.IfChange(micro_ops) cc_library( name = "micro_ops", srcs = [ @@ -106,73 +105,6 @@ cc_library( ], }), ) -# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:portable_optimized_micro_ops) - -# LINT.IfChange(portable_optimized_micro_ops) -cc_library( - name = "portable_optimized_micro_ops", - srcs = [ - "activations.cc", - "add.cc", - "arg_min_max.cc", - "ceil.cc", - "circular_buffer.cc", - "comparisons.cc", - "concatenation.cc", - "conv.cc", - "dequantize.cc", - "elementwise.cc", - "ethosu.cc", - "floor.cc", - "fully_connected.cc", - "l2norm.cc", - "logical.cc", - "logistic.cc", - "maximum_minimum.cc", - "mul.cc", - "neg.cc", - "pack.cc", - "pad.cc", - "pooling.cc", - "portable_optimized/depthwise_conv.cc", - "prelu.cc", - "quantize.cc", - "reduce.cc", - "reshape.cc", - "resize_nearest_neighbor.cc", - "round.cc", - "softmax.cc", - "split.cc", - "strided_slice.cc", - "sub.cc", - "svdf.cc", - "tanh.cc", - "unpack.cc", - ], - hdrs = ["micro_ops.h"], - copts = micro_copts(), - visibility = [ - # Needed for micro:portable_optimized_ops_resolver but visibility can not be - # finer-grained than a package. - ":micro_top_level", - ], - deps = [ - ":activation_utils", - ":micro_utils", - "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels:kernel_util", - "//tensorflow/lite/kernels:op_macros", - "//tensorflow/lite/kernels:padding", - "//tensorflow/lite/kernels/internal:common", - "//tensorflow/lite/kernels/internal:compatibility", - "//tensorflow/lite/kernels/internal:quantization_util", - "//tensorflow/lite/kernels/internal:reference_base", - "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/kernels/internal:types", - "//tensorflow/lite/micro:micro_utils", - ], -) -# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:micro_ops) test_suite( name = "all_tests", @@ -214,19 +146,6 @@ tflite_micro_cc_test( ], ) -tflite_micro_cc_test( - name = "portable_optimized_depthwise_conv_test", - srcs = [ - "depthwise_conv_test.cc", - ], - deps = [ - "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/micro:portable_optimized_op_resolver", - "//tensorflow/lite/micro/testing:micro_test", - ], -) - tflite_micro_cc_test( name = "fully_connected_test", srcs = [ diff --git a/tensorflow/lite/micro/kernels/portable_optimized/depthwise_conv.cc b/tensorflow/lite/micro/kernels/portable_optimized/depthwise_conv.cc deleted file mode 100644 index 9fb8f2e32cc..00000000000 --- a/tensorflow/lite/micro/kernels/portable_optimized/depthwise_conv.cc +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" - -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/common.h" -#include "tensorflow/lite/kernels/internal/quantization_util.h" -#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" -#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" -#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/padding.h" - -namespace tflite { -namespace ops { -namespace micro { -namespace depthwise_conv { -namespace { - -constexpr int kInputTensor = 0; -constexpr int kFilterTensor = 1; -constexpr int kBiasTensor = 2; -constexpr int kOutputTensor = 0; -constexpr int kMaxChannels = 256; - -// Depthwise conv is quantized along dimension 3: -// https://www.tensorflow.org/lite/performance/quantization_spec -constexpr int kDepthwiseConvQuantizedDimension = 3; - -// Size of the cached buffer we'll be using to hold reordered weights. -constexpr int kReshapedFilterDataSize = 1 * 1024; - -struct OpData { - TfLitePaddingValues padding; - // The scaling factor from input to output (aka the 'real multiplier') can - // be represented as a fixed point multiplier plus a left shift. - int32_t output_multiplier; - int output_shift; - - // Per channel output multiplier and shift. - int32_t per_channel_output_multiplier[kMaxChannels]; - int32_t per_channel_output_shift[kMaxChannels]; - - // The range of the fused activation layer. For example for kNone and - // uint8_t these would be 0 and 255. - int32_t output_activation_min; - int32_t output_activation_max; -}; - -TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, int width, - int height, int filter_width, int filter_height, - int out_width, int out_height, - const TfLiteType data_type, OpData* data) { - bool has_bias = node->inputs->size == 3; - // Check number of inputs/outputs - TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - - // Matching GetWindowedOutputSize in TensorFlow. - auto padding = params->padding; - data->padding = ComputePaddingHeightWidth( - params->stride_height, params->stride_width, - params->dilation_height_factor, params->dilation_width_factor, height, - width, filter_height, filter_width, padding, &out_height, &out_width); - - // Note that quantized inference requires that all tensors have their - // parameters set. This is usually done during quantized training. 
- if (data_type != kTfLiteFloat32) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; - - TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( - context, input, filter, bias, output, params->activation, - &data->output_multiplier, &data->output_shift, - &data->output_activation_min, &data->output_activation_max, - data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift), num_channels)); - } - return kTfLiteOk; -} - -// Specialized implementation of the depthwise convolution operation designed to -// work with the particular filter width of eight used by the default micro -// speech sample code. It uses 1KB of RAM to hold reordered weight parameters, -// converted from TFLite's NHWC format to NCHW format, and expressed as signed -// eight bit integers, rather than unsigned. Care must be taken when calling -// this not to use it for more than one node since there's only a single static -// buffer holding the weights. You should use this implementation if depthwise -// convolutions are a performance bottleneck, you have a layer that meets the -// parameter requirements, and the extra RAM usage and additional code size are -// not an issue. -static inline void DepthwiseConvOptimizedForFilterWidthEight( - TfLiteContext* context, const DepthwiseParams& params, - const RuntimeShape& input_shape, const uint8* input_data, - const RuntimeShape& filter_shape, const uint8* filter_data, - const RuntimeShape& bias_shape, const int32* bias_data, - const RuntimeShape& output_shape, uint8* output_data) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; - const int output_shift = params.output_shift; - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - - static int16_t reshaped_filter_data[kReshapedFilterDataSize]; - const int needed_size = - output_depth * filter_width * filter_height * input_depth; - if (needed_size > 
kReshapedFilterDataSize) { - TF_LITE_KERNEL_LOG( - context, - "Size too large for reshaped weight buffer (%d needed, %d available)", - needed_size, kReshapedFilterDataSize); - return; - } - - RuntimeShape reshaped_filter_shape; - reshaped_filter_shape.BuildFrom( - {1, output_depth, filter_height, filter_width}); - - // If this is the first time through, repack the weights into a cached buffer - // so that they can be accessed sequentially. - static bool is_reshaped_filter_initialized = false; - if (!is_reshaped_filter_initialized) { - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int oc = 0; oc < output_depth; ++oc) { - const uint8* current_filter = - filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc); - int16_t* reshaped_filter = - reshaped_filter_data + - Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x); - *reshaped_filter = - static_cast(*current_filter) + filter_offset; - } - } - } - is_reshaped_filter_initialized = true; - } - - for (int b = 0; b < batches; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - for (int ic = 0; ic < input_depth; ++ic) { - for (int m = 0; m < depth_multiplier; m++) { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; - int in_y_start = in_y_origin; - int filter_y_start = 0; - if (in_y_origin < 0) { - in_y_start = 0; - filter_y_start = 0 - in_y_origin; - } - int filter_y_end = filter_height; - if ((in_y_origin + filter_height) >= input_height) { - filter_y_end -= (in_y_origin + filter_height) - input_height; - } - int in_y = in_y_start; - int in_x_start = in_x_origin; - int filter_x_start = 0; - bool is_out_of_x_bounds = false; - if (in_x_origin < 0) { - in_x_start = 0; - filter_x_start = 0 - in_x_origin; - is_out_of_x_bounds = true; - } - int filter_x_end = filter_width; - if ((in_x_origin + filter_width) >= input_width) { - filter_x_end -= (in_x_origin + filter_width) - input_width; - is_out_of_x_bounds = true; - } - for (int filter_y = filter_y_start; filter_y < filter_y_end; - ++filter_y, ++in_y) { - const uint8* current_input = - input_data + Offset(input_shape, b, in_y, in_x_start, ic); - if ((filter_width == 8) && !is_out_of_x_bounds) { - int16* current_filter = - reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc, - filter_y, filter_x_start); - const uint32_t input_vals0 = - *reinterpret_cast(current_input); - current_input += 4; - const int32_t filter_vals0 = - *reinterpret_cast(current_filter); - current_filter += 2; - const uint8 input_val0 = input_vals0 & 0xff; - const int16 filter_val0 = filter_vals0 & 0xffff; - acc += filter_val0 * input_val0; - const uint8 input_val1 = (input_vals0 >> 8) & 0xff; - const int16 filter_val1 = (filter_vals0 >> 16) & 0xffff; - acc += filter_val1 * input_val1; - - const int32_t filter_vals1 = - *reinterpret_cast(current_filter); - current_filter += 2; - const uint8 input_val2 = (input_vals0 >> 16) & 0xff; - const int16 filter_val2 = filter_vals1 & 0xffff; - acc += filter_val2 * input_val2; - const uint8 input_val3 = (input_vals0 >> 24) & 0xff; - const int16 filter_val3 = (filter_vals1 >> 16) & 0xffff; - acc += filter_val3 * input_val3; - - const uint32_t input_vals1 = - *reinterpret_cast(current_input); - const int32_t filter_vals2 = - *reinterpret_cast(current_filter); - current_filter += 2; - const 
uint8 input_val4 = input_vals1 & 0xff; - const int16 filter_val4 = filter_vals2 & 0xffff; - acc += filter_val4 * input_val4; - const uint8 input_val5 = (input_vals1 >> 8) & 0xff; - const int16 filter_val5 = (filter_vals2 >> 16) & 0xffff; - acc += filter_val5 * input_val5; - - const int32_t filter_vals3 = - *reinterpret_cast(current_filter); - const uint8 input_val6 = (input_vals1 >> 16) & 0xff; - const int16 filter_val6 = filter_vals3 & 0xffff; - acc += filter_val6 * input_val6; - const uint8 input_val7 = (input_vals1 >> 24) & 0xff; - const int16 filter_val7 = (filter_vals3 >> 16) & 0xffff; - acc += filter_val7 * input_val7; - } else { - const uint8* current_filter = - filter_data + - Offset(filter_shape, 0, filter_y, filter_x_start, oc); - for (int filter_x = filter_x_start; filter_x < filter_x_end; - ++filter_x) { - int32 input_val = *current_input; - current_input += input_depth; - int32 filter_val = *current_filter; - current_filter += output_depth; - acc += - (filter_val + filter_offset) * (input_val + input_offset); - } - } - } - if (bias_data) { - acc += bias_data[oc]; - } - acc = reference_ops::depthwise_conv::DepthwiseConvRound< - DepthwiseConvOutputRounding::kAwayFromZero>( - acc, output_multiplier, output_shift); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[Offset(output_shape, b, out_y, out_x, oc)] = - static_cast(acc); - } - } - } - } - } -} // namespace - -} // namespace - -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { - float output_activation_min, output_activation_max; - CalculateActivationRange(params->activation, &output_activation_min, - &output_activation_max); - - tflite::DepthwiseParams op_params; - // Padding type is ignored, but still set. - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = 1; - op_params.dilation_height_factor = 1; - op_params.depth_multiplier = params->depth_multiplier; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; - - tflite::reference_ops::DepthwiseConv( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); -} - -// TODO(njeff): Optimize for int8 like we do for uint8. 
- -void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { - DepthwiseParams op_params; - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.depth_multiplier = params->depth_multiplier; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = 0; - op_params.output_offset = output->params.zero_point; - // TODO(b/130439627): Use calculated value for clamping. - op_params.quantized_activation_min = std::numeric_limits::min(); - op_params.quantized_activation_max = std::numeric_limits::max(); - - reference_integer_ops::DepthwiseConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); -} - -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { - const int32_t input_offset = -input->params.zero_point; - const int32_t filter_offset = -filter->params.zero_point; - const int32_t output_offset = output->params.zero_point; - - tflite::DepthwiseParams op_params; - // Padding type is ignored, but still set. - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = 1; - op_params.dilation_height_factor = 1; - op_params.depth_multiplier = params->depth_multiplier; - op_params.quantized_activation_min = data->output_activation_min; - op_params.quantized_activation_max = data->output_activation_max; - op_params.input_offset = input_offset; - op_params.weights_offset = filter_offset; - op_params.output_offset = output_offset; - op_params.output_multiplier = data->output_multiplier; - // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. - op_params.output_shift = -data->output_shift; - - // Figure out if we can use the optimized path for this set of parameters. - const int filter_width = GetTensorShape(filter).Dims(2); - const int input_depth = GetTensorShape(input).Dims(3); - const int output_depth = GetTensorShape(filter).Dims(3); - const int filter_height = GetTensorShape(filter).Dims(1); - const int needed_size = - output_depth * filter_width * filter_height * input_depth; - bool use_optimized_path = false; - if ((filter_width == 8) && (input_offset == 0) && (input_depth == 1) && - (needed_size <= kReshapedFilterDataSize)) { - // FIXME(petewarden) - We need a more robust way of handling this, ideally - // with an allocation mechanism available through the context API. 
- // Use the address of the node as a proxy for its identity, since we need - // to ensure the weight values are consistent between calls, and there's - // no easy way to do that quickly other than relying on the identity of - // the owning node. - static TfLiteNode* initialized_node_address = node; - if (initialized_node_address == node) { - use_optimized_path = true; - } else { - static bool has_warned = false; - if (!has_warned) { - TF_LITE_KERNEL_LOG( - context, - "Multiple depthwise conv ops match optimization parameters, but " - "only the first will use the fast path, because there's only one " - "RAM cache available"); - has_warned = true; - } - } - } - if (use_optimized_path) { - DepthwiseConvOptimizedForFilterWidthEight( - context, op_params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); - } else { - tflite::reference_ops::DepthwiseConv( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } -} - -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = - reinterpret_cast(node->builtin_data); - - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); - const TfLiteTensor* bias = - (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr; - - const TfLiteType data_type = input->type; - int width = SizeOfDimension(input, 2); - int height = SizeOfDimension(input, 1); - int filter_width = SizeOfDimension(filter, 2); - int filter_height = SizeOfDimension(filter, 1); - int out_width = ComputeOutSize(params->padding, width, filter_width, - params->stride_width); - int out_height = ComputeOutSize(params->padding, height, filter_height, - params->stride_height); - OpData data; - - // All per-channel quantized tensors need valid zero point and scale arrays. - if (input->type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, filter->quantization.type, - kTfLiteAffineQuantization); - - const auto* affine_quantization = - reinterpret_cast( - filter->quantization.params); - TF_LITE_ENSURE(context, affine_quantization); - TF_LITE_ENSURE(context, affine_quantization->scale); - TF_LITE_ENSURE(context, affine_quantization->zero_point); - TF_LITE_ENSURE( - context, affine_quantization->scale->size == 1 || - affine_quantization->scale->size == - filter->dims->data[kDepthwiseConvQuantizedDimension]); - TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, - affine_quantization->zero_point->size); - } - - TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, - filter_width, filter_height, out_width, - out_height, data_type, &data)); - - // TODO(aselle): Consider whether float conv and quantized conv should be - // separate ops to avoid dispatch overhead here. - switch (input->type) { // Already know in/out types are same. 
- case kTfLiteFloat32: - EvalFloat(context, node, params, &data, input, filter, bias, output); - break; - case kTfLiteInt8: - EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, - output); - break; - case kTfLiteUInt8: - EvalQuantized(context, node, params, &data, input, filter, bias, output); - break; - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(input->type), input->type); - return kTfLiteError; - } - return kTfLiteOk; -} - -} // namespace depthwise_conv - -TfLiteRegistration* Register_DEPTHWISE_CONV_2D() { - static TfLiteRegistration r = {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/depthwise_conv::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; - return &r; -} - -} // namespace micro -} // namespace ops -} // namespace tflite diff --git a/tensorflow/lite/micro/tools/ci_build/test_mbed.sh b/tensorflow/lite/micro/tools/ci_build/test_mbed.sh index a4d47009c93..fa4506fa6b8 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_mbed.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_mbed.sh @@ -49,7 +49,7 @@ fi make -f tensorflow/lite/micro/tools/make/Makefile \ TARGET=${TARGET} \ - TAGS="portable_optimized disco_f746ng" \ + TAGS="disco_f746ng" \ ${PROJECTS} readable_run tensorflow/lite/micro/tools/ci_build/install_mbed_cli.sh diff --git a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc index 7d2a0e65b97..dc7a689daed 100644 --- a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc @@ -24,9 +24,6 @@ ifeq ($(TARGET),$(filter $(TARGET),\ $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST) endif - # Use the faster depthwise conv implementation. - ALL_TAGS += portable_optimized - PLATFORM_FLAGS = \ -DPART_apollo3 \ -DAM_PACKAGE_BGA \ From 088d4376945c0ee6f3def04284ff1a835ea57097 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Mon, 22 Jun 2020 22:38:05 -0700 Subject: [PATCH 0865/1390] Fix BMP header bytes. 
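The fix below replaces the second byte of the test's encoded BMP data: every valid BMP file begins with the two ASCII bytes 'B' 'M' (0x42 0x4D), so the previous pair 0x42, 0x40 was not a real BMP signature. A minimal sketch of such a signature check, using an illustrative helper name rather than anything from the decode_bmp kernel:

    #include <cstddef>
    #include <cstdint>

    // Returns true if `data` begins with the BMP magic bytes "BM".
    // Illustrative helper only; the actual decode_bmp op does not yet
    // validate these bytes (see the TODO added in this test).
    bool LooksLikeBmp(const uint8_t* data, size_t size) {
      return size >= 2 && data[0] == 0x42 && data[1] == 0x4d;  // 'B', 'M'
    }
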
PiperOrigin-RevId: 317803857 Change-Id: Iafa8e4e44400ec3c119af8897ba3fa693138501f --- tensorflow/python/kernel_tests/decode_bmp_op_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py index 5e7991382ed..b0a8f9ffbbb 100644 --- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py +++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py @@ -25,14 +25,14 @@ from tensorflow.python.ops import image_ops from tensorflow.python.platform import test - class DecodeBmpOpTest(test.TestCase): def testex1(self): img_bytes = [[[0, 0, 255], [0, 255, 0]], [[255, 0, 0], [255, 255, 255]]] # Encoded BMP bytes from Wikipedia + # BMP header bytes: https://en.wikipedia.org/wiki/List_of_file_signatures encoded_bytes = [ - 0x42, 0x40, + 0x42, 0x4d, 0x46, 0, 0, 0, 0, 0, 0, 0, @@ -66,9 +66,10 @@ class DecodeBmpOpTest(test.TestCase): def testGrayscale(self): img_bytes = [[[255], [0]], [[255], [0]]] + # BMP header bytes: https://en.wikipedia.org/wiki/List_of_file_signatures encoded_bytes = [ 0x42, - 0x40, + 0x4d, 0x3d, 0, 0, @@ -133,6 +134,8 @@ class DecodeBmpOpTest(test.TestCase): byte_string = bytes(bytearray(encoded_bytes)) img_in = constant_op.constant(byte_string, dtype=dtypes.string) + # TODO(b/159600494): Currently, `decode_bmp` op does not validate input + # magic bytes. decode = image_ops.decode_bmp(img_in) with self.cached_session(): From dce31a431df80aa55f2cfd87d60952c413d6f690 Mon Sep 17 00:00:00 2001 From: chuanqiw Date: Tue, 23 Jun 2020 14:08:27 +0800 Subject: [PATCH 0866/1390] Update sqlite version to 3.32.1 to fix the CVE-2020-13630 and CVE-2020-11656 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0f591ba8b90..29c6ef99397 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -409,12 +409,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "org_sqlite", build_file = clean_dep("//third_party:sqlite.BUILD"), - sha256 = "f3c79bc9f4162d0b06fa9fe09ee6ccd23bb99ce310b792c5145f87fbcc30efca", - strip_prefix = "sqlite-amalgamation-3310100", + sha256 = "8d46ef69b96628bedb781bd8309210f2a1f4a353792097302f6b754044e6540f", + strip_prefix = "sqlite-amalgamation-3320100", system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"), urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2020/sqlite-amalgamation-3310100.zip", - "https://www.sqlite.org/2020/sqlite-amalgamation-3310100.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2020/sqlite-amalgamation-3320100.zip", + "https://www.sqlite.org/2020/sqlite-amalgamation-3320100.zip", ], ) From 51ee63247d1d4e4c6db089875b605963f31de2d9 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 22 Jun 2020 23:04:46 -0700 Subject: [PATCH 0867/1390] Rollforward: Add uint32 & uint64 to TF_CALL_INTEGRAL_TYPES PiperOrigin-RevId: 317806755 Change-Id: I3752198bf98d5659bf163455a5dfe1fbe39ffd1c --- tensorflow/core/framework/register_types.h | 21 +++++--------- tensorflow/core/framework/types.cc | 5 ---- tensorflow/core/kernels/BUILD | 2 ++ tensorflow/core/kernels/concat_lib_cpu.cc | 2 -- tensorflow/core/kernels/concat_op.cc | 2 -- tensorflow/core/kernels/constant_op.cc | 1 - tensorflow/core/kernels/control_flow_ops.cc | 5 ---- .../core/kernels/data/dataset_test_base.cc | 2 -- tensorflow/core/kernels/dense_update_ops.cc | 1 
- .../core/kernels/dynamic_partition_op.cc | 2 -- tensorflow/core/kernels/fill_functor.cc | 5 +++- tensorflow/core/kernels/gather_op.cc | 2 -- tensorflow/core/kernels/identity_op.cc | 1 - tensorflow/core/kernels/ragged_gather_op.cc | 2 -- .../kernels/ragged_tensor_from_variant_op.cc | 2 -- .../kernels/ragged_tensor_to_tensor_op.cc | 2 -- .../kernels/ragged_tensor_to_variant_op.cc | 2 -- .../core/kernels/resource_variable_ops.cc | 1 - tensorflow/core/kernels/split_lib_cpu.cc | 1 - tensorflow/core/kernels/split_op.cc | 1 - tensorflow/core/kernels/strided_slice_op.cc | 2 -- .../core/kernels/strided_slice_op_impl.h | 2 -- tensorflow/core/kernels/topk_op.cc | 2 -- .../core/kernels/topk_op_gpu_uint32.cu.cc | 28 +++++++++++++++++++ .../core/kernels/topk_op_gpu_uint64.cu.cc | 28 +++++++++++++++++++ tensorflow/core/util/batch_util.cc | 8 ------ .../core/util/saved_tensor_slice_util.h | 2 ++ 27 files changed, 71 insertions(+), 63 deletions(-) create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index bc3e5e1743b..0cf6536e8c2 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -153,16 +153,9 @@ limitations under the License. #endif // defined(IS_MOBILE_PLATFORM) - end of TF_CALL_type defines // Defines for sets of types. - -// TODO(b/111604096): Add uint32 and uint64 to TF_CALL_INTEGRAL_TYPES. -// -// The uint32 and uint64 types were introduced in 10/2017 to be used via XLA and -// thus were not included in TF_CALL_INTEGRAL_TYPES. Including them in -// TF_CALL_INTEGRAL_TYPES should only happen after evaluating the effect on the -// TF binary size and performance. -#define TF_CALL_INTEGRAL_TYPES(m) \ - TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \ - TF_CALL_uint8(m) TF_CALL_int8(m) +#define TF_CALL_INTEGRAL_TYPES(m) \ + TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_int32(m) \ + TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m) #define TF_CALL_FLOAT_TYPES(m) \ TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) @@ -174,10 +167,10 @@ limitations under the License. #define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \ TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) -#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ - TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) \ - TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \ - TF_CALL_int8(m) +#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ + TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m) \ + TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_uint16(m) \ + TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m) #define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m) diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc index 97eaec98ffe..d6455e012d0 100644 --- a/tensorflow/core/framework/types.cc +++ b/tensorflow/core/framework/types.cc @@ -238,11 +238,6 @@ int DataTypeSize(DataType dt) { TF_CALL_qint16(CASE); TF_CALL_quint16(CASE); - // uint32 and uint64 aren't included in TF_CALL_POD_TYPES because we - // don't want to define kernels for them at this stage to avoid binary - // bloat. 
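The comment being removed above refers to the cost of the TF_CALL_* X-macro pattern: each per-type macro applies a caller-supplied macro `m` to one concrete type, and a set macro such as TF_CALL_INTEGRAL_TYPES chains them, so every use of the set instantiates code for every listed type. Adding uint32 and uint64 to the set therefore generates (and registers) kernels for two more types at each expansion site, which is where the binary-size concern came from. A simplified sketch of the pattern, with illustrative names rather than the real TensorFlow macros:

    #include <cstdint>
    #include <cstdio>

    // Per-type macros: apply the caller's macro `m` to one concrete type.
    #define CALL_int32(m)  m(int32_t)
    #define CALL_uint32(m) m(uint32_t)
    #define CALL_uint64(m) m(uint64_t)

    // Set macro: chains the per-type macros, so `m` expands once per type.
    #define CALL_INTEGRAL_TYPES(m) CALL_int32(m) CALL_uint32(m) CALL_uint64(m)

    // Example client macro: every type in the set produces one function,
    // i.e. more types in the set means more generated code.
    #define DEFINE_PRINT_SIZE(T) \
      void PrintSizeOf_##T() { std::printf(#T ": %zu\n", sizeof(T)); }

    CALL_INTEGRAL_TYPES(DEFINE_PRINT_SIZE)
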
- TF_CALL_uint32(CASE); - TF_CALL_uint64(CASE); default: return 0; } diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 7da864a6027..e2ff5aed283 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4919,7 +4919,9 @@ tf_kernel_library( "topk_op_gpu_double.cu.cc", "topk_op_gpu_float.cu.cc", "topk_op_gpu_half.cu.cc", + "topk_op_gpu_uint64.cu.cc", "topk_op_gpu_int64.cu.cc", + "topk_op_gpu_uint32.cu.cc", "topk_op_gpu_int32.cu.cc", "topk_op_gpu_int16.cu.cc", "topk_op_gpu_uint16.cu.cc", diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index da73d3d2c56..1dec589d3ff 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -116,8 +116,6 @@ REGISTER(qint8) REGISTER(quint16) REGISTER(qint16) REGISTER(qint32) -REGISTER(uint32) -REGISTER(uint64) #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \ !defined(__ANDROID_TYPES_FULL__) diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index be3e9a67c5f..d3f3a04f33b 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -208,8 +208,6 @@ REGISTER_CONCAT(qint8); REGISTER_CONCAT(quint16); REGISTER_CONCAT(qint16); REGISTER_CONCAT(qint32); -REGISTER_CONCAT(uint32); -REGISTER_CONCAT(uint64); #undef REGISTER_CONCAT diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 4bcbc076446..dc178d17d49 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -211,7 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL); // the conversion from uint8 to quint8. REGISTER_KERNEL(CPU, quint8); REGISTER_KERNEL(CPU, quint16); -REGISTER_KERNEL(CPU, uint32); #undef REGISTER_CPU_KERNEL #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 435de3c5954..1a0082c6a3b 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -101,16 +101,12 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH); TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH); TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH); TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH); -REGISTER_CPU_SWITCH(uint64); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH); TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH); TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH); -REGISTER_GPU_SWITCH(uint64); TF_CALL_variant(REGISTER_GPU_SWITCH); -TF_CALL_uint32(REGISTER_GPU_SWITCH); -TF_CALL_uint32(REGISTER_GPU_REF_SWITCH); TF_CALL_bool(REGISTER_GPU_SWITCH); TF_CALL_bool(REGISTER_GPU_REF_SWITCH); @@ -311,7 +307,6 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL); TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL); REGISTER_GPU_KERNEL(bool); REGISTER_GPU_REF_KERNEL(bool); -REGISTER_GPU_KERNEL(uint64); TF_CALL_variant(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc index b91ab9b733c..e41e35be1e9 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.cc +++ b/tensorflow/core/kernels/data/dataset_test_base.cc @@ -220,8 +220,6 @@ Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) { break; TF_CALL_NUMBER_TYPES(CASE); TF_CALL_tstring(CASE); - TF_CALL_uint32(CASE); - TF_CALL_uint64(CASE); // TODO(feihugis): figure out 
how to support variant tensors. #undef CASE default: diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 55e4cd7606a..71235fca143 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -98,7 +98,6 @@ typedef Eigen::SyclDevice SYCLDevice; TF_CALL_ALL_TYPES(REGISTER_KERNELS); // uint32 not included in ALL_TYPES -TF_CALL_uint32(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); // quint16 not included in QUANTIZIED_TYPES TF_CALL_quint16(REGISTER_KERNELS); diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc index 90ed71dccce..95af19c4c48 100644 --- a/tensorflow/core/kernels/dynamic_partition_op.cc +++ b/tensorflow/core/kernels/dynamic_partition_op.cc @@ -164,8 +164,6 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { DynamicPartitionOp) TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION); -// For partitioning fingerprints. -TF_CALL_uint64(REGISTER_DYNAMIC_PARTITION); #undef REGISTER_DYNAMIC_PARTITION } // namespace tensorflow diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 10dd3df1915..174a4e45a79 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -45,6 +45,8 @@ DEFINE_SETZERO_CPU(Eigen::half); DEFINE_SETZERO_CPU(bfloat16); DEFINE_SETZERO_CPU(float); DEFINE_SETZERO_CPU(double); +DEFINE_SETZERO_CPU(uint32); +DEFINE_SETZERO_CPU(uint64); DEFINE_SETZERO_CPU(uint8); DEFINE_SETZERO_CPU(int8); DEFINE_SETZERO_CPU(uint16); @@ -96,6 +98,8 @@ DEFINE_SETONE_CPU(Eigen::half); DEFINE_SETONE_CPU(bfloat16); DEFINE_SETONE_CPU(float); DEFINE_SETONE_CPU(double); +DEFINE_SETONE_CPU(uint32); +DEFINE_SETONE_CPU(uint64); DEFINE_SETONE_CPU(uint8); DEFINE_SETONE_CPU(int8); DEFINE_SETONE_CPU(uint16); @@ -137,7 +141,6 @@ struct FillFunctor { TF_CALL_ALL_TYPES(DEFINE_FILL_CPU); DEFINE_FILL_CPU(quint8); DEFINE_FILL_CPU(quint16); -DEFINE_FILL_CPU(uint32); #undef DEFINE_FILL_CPU #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 6d493a5f2ea..948567e019a 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -211,8 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU); TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU); TF_CALL_quint16(REGISTER_GATHER_CPU); TF_CALL_qint16(REGISTER_GATHER_CPU); -TF_CALL_uint32(REGISTER_GATHER_CPU); -TF_CALL_uint64(REGISTER_GATHER_CPU); #undef REGISTER_GATHER_CPU diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc index d15b64597f5..4b226dd72d4 100644 --- a/tensorflow/core/kernels/identity_op.cc +++ b/tensorflow/core/kernels/identity_op.cc @@ -122,7 +122,6 @@ REGISTER_SYCL_HOST_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); REGISTER_GPU_KERNEL(Variant); -TF_CALL_uint32(REGISTER_GPU_KERNEL); REGISTER_GPU_KERNEL(bool); #undef REGISTER_GPU_KERNEL diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc index 88c0d1ebd69..3bf82cba050 100644 --- a/tensorflow/core/kernels/ragged_gather_op.cc +++ b/tensorflow/core/kernels/ragged_gather_op.cc @@ -296,8 +296,6 @@ TF_CALL_tstring(REGISTER_CPU_KERNEL); TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL); TF_CALL_quint16(REGISTER_CPU_KERNEL); TF_CALL_qint16(REGISTER_CPU_KERNEL); -TF_CALL_uint32(REGISTER_CPU_KERNEL); -TF_CALL_uint64(REGISTER_CPU_KERNEL); #undef 
REGISTER_CPU_KERNEL #undef REGISTER_CPU_KERNEL_WITH_INDEX_TYPE diff --git a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc index f83bcb38c6c..ad0712e6fd0 100644 --- a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc @@ -308,8 +308,6 @@ TF_CALL_tstring(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); TF_CALL_quint16(REGISTER_KERNELS); TF_CALL_qint16(REGISTER_KERNELS); -TF_CALL_uint32(REGISTER_KERNELS); -TF_CALL_uint64(REGISTER_KERNELS); #undef REGISTER_KERNELS #undef REGISTER_KERNELS_WITH_SPLIT_TYPE } // namespace tensorflow diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc index d729c43f25a..9ae5d7ffbdc 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc @@ -561,8 +561,6 @@ TF_CALL_string(REGISTER_CPU_KERNEL); TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL); TF_CALL_quint16(REGISTER_CPU_KERNEL); TF_CALL_qint16(REGISTER_CPU_KERNEL); -TF_CALL_uint32(REGISTER_CPU_KERNEL); -TF_CALL_uint64(REGISTER_CPU_KERNEL); #undef REGISTER_CPU_KERNEL diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc index 7a5ae1c6240..64c372b005e 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc @@ -213,8 +213,6 @@ TF_CALL_tstring(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); TF_CALL_quint16(REGISTER_KERNELS); TF_CALL_qint16(REGISTER_KERNELS); -TF_CALL_uint32(REGISTER_KERNELS); -TF_CALL_uint64(REGISTER_KERNELS); #undef REGISTER_KERNELS #undef REGISTER_KERNELS_WITH_SPLIT_TYPE } // namespace tensorflow diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index b9c883c7e2f..510e95ca606 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -508,7 +508,6 @@ class AssignVariableOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); -TF_CALL_uint32(REGISTER_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc index 0cb0a94d498..a3060e4e90d 100644 --- a/tensorflow/core/kernels/split_lib_cpu.cc +++ b/tensorflow/core/kernels/split_lib_cpu.cc @@ -43,7 +43,6 @@ void Split::operator()( TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS) DEFINE_CPU_KERNELS(quint8) -DEFINE_CPU_KERNELS(uint64) #ifdef TENSORFLOW_USE_SYCL template diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index f09740c6198..08575f01f67 100644 --- a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -404,7 +404,6 @@ class SplitOpSYCL : public SplitOpBase { TF_CALL_ALL_TYPES(REGISTER_SPLIT); REGISTER_SPLIT(quint8); -REGISTER_SPLIT(uint64); #undef REGISTER_SPLIT diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index ccc1984bb98..b4099213303 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -440,8 +440,6 @@ class StridedSliceAssignOp : public OpKernel { StridedSliceAssignOp) TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE); -TF_CALL_uint32(REGISTER_STRIDED_SLICE); 
-TF_CALL_uint64(REGISTER_STRIDED_SLICE); #undef REGISTER_STRIDED_SLICE diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index 1ae959b7b3f..5ce1d773e33 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -287,8 +287,6 @@ TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N_GPU); #endif // END GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU); -TF_CALL_uint32(DECLARE_FOR_N_CPU); -TF_CALL_uint64(DECLARE_FOR_N_CPU); #ifdef TENSORFLOW_USE_SYCL #define PREVENT_FOR_N_SYCL(T) \ diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc index c555b42f005..50325b7bcfe 100644 --- a/tensorflow/core/kernels/topk_op.cc +++ b/tensorflow/core/kernels/topk_op.cc @@ -258,7 +258,6 @@ namespace functor { TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); TF_CALL_INTEGRAL_TYPES(DECLARE_GPU_SPEC); -TF_CALL_uint32(DECLARE_GPU_SPEC); #undef DECLARE_GPU_SPEC @@ -276,7 +275,6 @@ TF_CALL_uint32(DECLARE_GPU_SPEC); TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS); TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS); -TF_CALL_uint32(REGISTER_KERNELS) #undef REGISTER_KERNELS #endif // end GOOGLE_CUDA diff --git a/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc new file mode 100644 index 00000000000..16e2e0e9420 --- /dev/null +++ b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/topk_op.h" +#include "tensorflow/core/kernels/topk_op_gpu.h" + +namespace tensorflow { +using Eigen::GpuDevice; + +template struct functor::TopKFunctor; +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc new file mode 100644 index 00000000000..895247a63a2 --- /dev/null +++ b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/topk_op.h" +#include "tensorflow/core/kernels/topk_op_gpu.h" + +namespace tensorflow { +using Eigen::GpuDevice; + +template struct functor::TopKFunctor; +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/util/batch_util.cc b/tensorflow/core/util/batch_util.cc index b88c365ced0..e03188b04da 100644 --- a/tensorflow/core/util/batch_util.cc +++ b/tensorflow/core/util/batch_util.cc @@ -182,8 +182,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) { switch (element.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_uint32(HANDLE_TYPE); - TF_CALL_uint64(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopyElementToSlice Unhandled data type: ", @@ -207,8 +205,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { switch (parent.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_uint32(HANDLE_TYPE); - TF_CALL_uint64(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopySliceToElement Unhandled data type: ", @@ -280,8 +276,6 @@ Status CopyContiguousSlices(const Tensor& src, int64 src_offset, switch (src.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_uint32(HANDLE_TYPE); - TF_CALL_uint64(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopyContiguousSlices unhandled data type: ", @@ -308,8 +302,6 @@ Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) { switch (parent->dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_uint32(HANDLE_TYPE); - TF_CALL_uint64(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented( diff --git a/tensorflow/core/util/saved_tensor_slice_util.h b/tensorflow/core/util/saved_tensor_slice_util.h index 09b9235b711..1f9768f5163 100644 --- a/tensorflow/core/util/saved_tensor_slice_util.h +++ b/tensorflow/core/util/saved_tensor_slice_util.h @@ -116,7 +116,9 @@ TENSOR_PROTO_EXTRACT_TYPE(double, double, double); TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex64, scomplex, float); TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex128, dcomplex, double); TENSOR_PROTO_EXTRACT_TYPE(int32, int, int32); +TENSOR_PROTO_EXTRACT_TYPE(uint32, uint32, uint32); TENSOR_PROTO_EXTRACT_TYPE(int64, int64, protobuf_int64); +TENSOR_PROTO_EXTRACT_TYPE(uint64, uint64, protobuf_uint64); TENSOR_PROTO_EXTRACT_TYPE(uint16, int, int32); TENSOR_PROTO_EXTRACT_TYPE(uint8, int, int32); TENSOR_PROTO_EXTRACT_TYPE(int8, int, int32); From e5023a1738cce7efcdf9d87863b85c80ab2f8c9e Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 22 Jun 2020 23:10:17 -0700 Subject: [PATCH 0868/1390] - Fix type checking in elementwise.cc - Update error messages for some Abs Cast Ceil Cos Sin Not Square Sqrt RSqrt Log PiperOrigin-RevId: 317807251 Change-Id: I2a4f359f04346551eda5a382b25f34bab2c73dc7 --- tensorflow/lite/kernels/cast.cc | 24 ++++++++------- tensorflow/lite/kernels/ceil.cc | 4 +++ tensorflow/lite/kernels/elementwise.cc | 42 ++++++++++++++++++-------- tensorflow/lite/kernels/op_macros.h | 9 ++++++ 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc index 415f1270328..ab95afa979f 100644 --- a/tensorflow/lite/kernels/cast.cc +++ 
b/tensorflow/lite/kernels/cast.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { @@ -67,8 +68,8 @@ void copyCast(const std::complex* in, std::complex* out, } template -TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out, - int num_elements) { +TfLiteStatus copyToTensor(TfLiteContext* context, const FromT* in, + TfLiteTensor* out, int num_elements) { switch (out->type) { case kTfLiteInt64: copyCast(in, out->data.i64, num_elements); @@ -91,7 +92,7 @@ TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out, break; default: // Unsupported type. - return kTfLiteError; + TF_LITE_UNSUPPORTED_TYPE(context, out->type, "Cast"); } return kTfLiteOk; } @@ -103,22 +104,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, num_elements, NumElements(output)); switch (input->type) { case kTfLiteInt64: - return copyToTensor(input->data.i64, output, num_elements); + return copyToTensor(context, input->data.i64, output, num_elements); case kTfLiteInt32: - return copyToTensor(input->data.i32, output, num_elements); + return copyToTensor(context, input->data.i32, output, num_elements); case kTfLiteUInt8: - return copyToTensor(input->data.uint8, output, num_elements); + return copyToTensor(context, input->data.uint8, output, num_elements); case kTfLiteFloat32: - return copyToTensor(GetTensorData(input), output, num_elements); + return copyToTensor(context, GetTensorData(input), output, + num_elements); case kTfLiteBool: - return copyToTensor(input->data.b, output, num_elements); + return copyToTensor(context, input->data.b, output, num_elements); case kTfLiteComplex64: return copyToTensor( - reinterpret_cast*>(input->data.c64), output, - num_elements); + context, reinterpret_cast*>(input->data.c64), + output, num_elements); default: // Unsupported type. - return kTfLiteError; + TF_LITE_UNSUPPORTED_TYPE(context, input->type, "Cast"); } return kTfLiteOk; } diff --git a/tensorflow/lite/kernels/ceil.cc b/tensorflow/lite/kernels/ceil.cc index d8c6eaad7a4..95c660f3376 100644 --- a/tensorflow/lite/kernels/ceil.cc +++ b/tensorflow/lite/kernels/ceil.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { @@ -41,6 +42,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + if (input->type != kTfLiteFloat32) { + TF_LITE_UNSUPPORTED_TYPE(context, input->type, "Ceil"); + } optimized_ops::Ceil(GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc index 1b91244af33..61c6aeaa811 100644 --- a/tensorflow/lite/kernels/elementwise.cc +++ b/tensorflow/lite/kernels/elementwise.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace ops { @@ -39,17 +40,15 @@ bool IsLogicalSupportedType(const TfLiteType type) { } typedef bool (*IsSupportedType)(TfLiteType); -template +template TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); - if (!IsSupportedType(input->type)) { - context->ReportError(context, "Current data type %d is not supported.", - input->type); - return kTfLiteError; + if (!is_supported_type(input->type)) { + TF_LITE_UNSUPPORTED_TYPE(context, input->type, op_name); } return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); @@ -112,13 +111,23 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) { return EvalLogical(context, node, [](bool v) { return !v; }); } +constexpr char kAbsName[] = "Abs"; +constexpr char kSinName[] = "Sin"; +constexpr char kCosName[] = "Cos"; +constexpr char kLogName[] = "Log"; +constexpr char kSqrtName[] = "Sqrt"; +constexpr char kRsqrtName[] = "Rsqrt"; +constexpr char kSquareName[] = "Square"; +constexpr char kNotName[] = "Not"; + } // namespace } // namespace elementwise TfLiteRegistration* Register_ABS() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::AbsEval}; return &r; } @@ -126,7 +135,8 @@ TfLiteRegistration* Register_ABS() { TfLiteRegistration* Register_SIN() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::SinEval}; return &r; } @@ -134,7 +144,8 @@ TfLiteRegistration* Register_SIN() { TfLiteRegistration* Register_COS() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::CosEval}; return &r; } @@ -142,7 +153,8 @@ TfLiteRegistration* Register_COS() { TfLiteRegistration* Register_LOG() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::LogEval}; return &r; } @@ -150,7 +162,8 @@ TfLiteRegistration* Register_LOG() { TfLiteRegistration* Register_SQRT() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::SqrtEval}; return &r; } @@ -158,7 +171,8 @@ TfLiteRegistration* Register_SQRT() { TfLiteRegistration* Register_RSQRT() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::RsqrtEval}; return &r; } @@ -166,7 +180,8 @@ TfLiteRegistration* Register_RSQRT() { TfLiteRegistration* Register_SQUARE() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + elementwise::GenericPrepare, elementwise::SquareEval}; return &r; } @@ -174,7 +189,8 @@ TfLiteRegistration* Register_SQUARE() { TfLiteRegistration* Register_LOGICAL_NOT() { static TfLiteRegistration r = { /*init=*/nullptr, /*free=*/nullptr, - elementwise::GenericPrepare, + 
elementwise::GenericPrepare, elementwise::LogicalNotEval}; return &r; } diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h index 5c190f1c595..5786756f408 100644 --- a/tensorflow/lite/kernels/op_macros.h +++ b/tensorflow/lite/kernels/op_macros.h @@ -44,6 +44,15 @@ inline void InfiniteLoop() { fprintf(stderr, "%s", (x)); \ } while (0) +// Report Error for unsupported type by op 'op_name' and returns kTfLiteError. +#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name) \ + do { \ + TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \ + __FILE__, __LINE__, TfLiteTypeGetName(type), \ + (op_name)); \ + return kTfLiteError; \ + } while (0) + #define TFLITE_ABORT abort() #endif // TF_LITE_MCU_DEBUG_LOG From daa3c52aa1c301fc698574a2943f2a6eed2de08f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 13 Mar 2020 17:05:54 +0100 Subject: [PATCH 0869/1390] TFL: Port HARD_SWISH operator from TFLite to TFLu Change-Id: I18092fa30dee33df4577a4abaf5321e7ba96a04c --- tensorflow/lite/kernels/activations.cc | 16 - tensorflow/lite/kernels/internal/common.h | 47 +++ .../internal/reference/reference_ops.h | 19 - tensorflow/lite/micro/all_ops_resolver.cc | 1 + tensorflow/lite/micro/kernels/activations.cc | 193 +++++++++- .../lite/micro/kernels/activations_test.cc | 354 ++++++++++++++++++ tensorflow/lite/micro/kernels/micro_ops.h | 1 + .../lite/micro/micro_mutable_op_resolver.h | 8 + tensorflow/lite/micro/micro_utils.h | 10 + 9 files changed, 612 insertions(+), 37 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 7ad33973b38..b6181973c7d 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -298,22 +298,6 @@ void HardSwishFree(TfLiteContext* context, void* buffer) { delete static_cast(buffer); } -void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32, - int16_t* multiplier_int16) { - TFLITE_DCHECK_GE(multiplier_int32, 0); - static constexpr int32_t kRoundingOffset = 1 << 15; - if (multiplier_int32 >= - std::numeric_limits::max() - kRoundingOffset) { - *multiplier_int16 = std::numeric_limits::max(); - return; - } - const int32_t result = (multiplier_int32 + kRoundingOffset) >> 16; - TFLITE_DCHECK_LE(result << 16, multiplier_int32 + kRoundingOffset); - TFLITE_DCHECK_GT(result << 16, multiplier_int32 - kRoundingOffset); - *multiplier_int16 = result; - TFLITE_DCHECK_EQ(*multiplier_int16, result); -} - TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS(GenericPrepare(context, node)); TfLiteTensor* output = GetOutput(context, node, 0); diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index c1db3587415..9bc65029f34 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -932,6 +932,53 @@ void optimized_ops_prefetch_write_l1_keep(const T* ptr) { #endif } +// Similar to ARM instruction SQDMULH. +// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except +// rounding to zero instead of to nearest (SQRDMULH). +inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) { + bool overflow = a == b && a == std::numeric_limits::min(); + std::int32_t a_32(a); + std::int32_t b_32(b); + std::int32_t ab_32 = a_32 * b_32; + std::int16_t ab_x2_high16 = static_cast((ab_32) / (1 << 15)); + return overflow ? 
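For orientation while reading the fixed-point helpers added below: hard-swish is defined as hswish(x) = x * min(6, max(0, x + 3)) / 6, which the float path in this patch computes directly; the quantized path approximates the same curve using the 16-bit saturating multiply and shift helpers that follow. A small float reference with a few spot values, for illustration only:

    #include <algorithm>
    #include <cstdio>

    // Reference float hard-swish: x * relu6(x + 3) / 6.
    float HardSwishRef(float x) {
      return x * std::min(6.0f, std::max(0.0f, x + 3.0f)) / 6.0f;
    }

    int main() {
      // Inputs at or beyond +/-3 hit the clamped branches of relu6.
      std::printf("%f\n", HardSwishRef(-4.0f));  // 0.000000
      std::printf("%f\n", HardSwishRef(0.0f));   // 0.000000
      std::printf("%f\n", HardSwishRef(1.0f));   // 0.666667
      std::printf("%f\n", HardSwishRef(3.0f));   // 3.000000
      return 0;
    }
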
std::numeric_limits::max() : ab_x2_high16; +} + +// Similar to gemmlowp::SaturatingRoundingDoublingHighMul +inline std::int16_t SaturatingRoundingDoublingHighMul(std::int16_t a, std::int16_t b) { + bool overflow = a == b && a == std::numeric_limits::min(); + std::int32_t a_32(a); + std::int32_t b_32(b); + std::int32_t ab_32 = a_32 * b_32; + std::int16_t nudge = ab_32 >= 0 ? (1 << 14) : (1 - (1 << 14)); + std::int16_t ab_x2_high16 = + static_cast((ab_32 + nudge) / (1 << 15)); + return overflow ? std::numeric_limits::max() : ab_x2_high16; +} + +inline int16_t SaturatingLeftShift(int16_t value, int amount) { + int32_t result = static_cast(value) * (1 << amount); + result = std::min(result, std::numeric_limits::max()); + result = std::max(result, std::numeric_limits::min()); + return result; +} + +inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32, + int16_t* multiplier_int16) { + TFLITE_DCHECK_GE(multiplier_int32, 0); + static constexpr int32_t kRoundingOffset = 1 << 15; + if (multiplier_int32 >= + std::numeric_limits::max() - kRoundingOffset) { + *multiplier_int16 = std::numeric_limits::max(); + return; + } + const int32_t result = (multiplier_int32 + kRoundingOffset) >> 16; + TFLITE_DCHECK_LE(result << 16, multiplier_int32 + kRoundingOffset); + TFLITE_DCHECK_GT(result << 16, multiplier_int32 - kRoundingOffset); + *multiplier_int16 = result; + TFLITE_DCHECK_EQ(*multiplier_int16, result); +} + } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_ diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index 5208b21eb4d..b94c2060b2d 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -2615,25 +2615,6 @@ inline void HardSwish(const RuntimeShape& input_shape, const T* input_data, } } -inline int16_t SaturatingLeftShift(int16_t value, int amount) { - int32_t result = static_cast(value) * (1 << amount); - result = std::min(result, std::numeric_limits::max()); - result = std::max(result, std::numeric_limits::min()); - return result; -} - -// Similar to ARM instruction SQDMULH. -// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except -// rounding to zero instead of to nearest (SQRDMULH). -inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) { - bool overflow = a == b && a == std::numeric_limits::min(); - std::int32_t a_32(a); - std::int32_t b_32(b); - std::int32_t ab_32 = a_32 * b_32; - std::int16_t ab_x2_high16 = static_cast((ab_32) / (1 << 15)); - return overflow ? std::numeric_limits::max() : ab_x2_high16; -} - template inline void HardSwish(const HardSwishParams& params, const RuntimeShape& input_shape, const T* input_data, diff --git a/tensorflow/lite/micro/all_ops_resolver.cc b/tensorflow/lite/micro/all_ops_resolver.cc index e728a95360a..ff461cb947e 100644 --- a/tensorflow/lite/micro/all_ops_resolver.cc +++ b/tensorflow/lite/micro/all_ops_resolver.cc @@ -42,6 +42,7 @@ AllOpsResolver::AllOpsResolver() { AddFullyConnected(); AddGreater(); AddGreaterEqual(); + AddHardSwish(); AddL2Normalization(); AddLess(); AddLessEqual(); diff --git a/tensorflow/lite/micro/kernels/activations.cc b/tensorflow/lite/micro/kernels/activations.cc index 4a9b8ce5d8e..75b0339002d 100644 --- a/tensorflow/lite/micro/kernels/activations.cc +++ b/tensorflow/lite/micro/kernels/activations.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/micro_utils.h" @@ -77,6 +78,157 @@ inline void Relu6Quantized(Q lower, Q upper, const RuntimeShape& input_shape, } } +inline std::int32_t RoundingDivideByPOT(std::int32_t numerator, int exponent) { + std::int32_t sign = numerator >= 0 ? 1 : -1; + std::int32_t abs_numerator = std::abs(numerator); + std::int32_t mask = (1LL << exponent) - 1; + std::int32_t remainder = abs_numerator & mask; + std::int32_t threshold = mask >> 1; + std::int32_t abs_result = + (abs_numerator >> exponent) + (remainder > threshold ? 1 : 0); + return sign * abs_result; +} + +inline void HardSwishFloatOp(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + auto matching_size = MatchingFlatSize(input_shape, output_shape); + const float* in_end = input_data + matching_size; + for (; input_data < in_end; input_data++, output_data++) { + const float in = *input_data; + *output_data = + in * std::min(static_cast(6), std::max(static_cast(0), in + 3)) / + 6; + } +} + +template +void HardSwishOp(HardSwishParams& params, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) { + const int16_t input_value = input_data[i] - params.input_zero_point; + // Left-shift as much as we can without overflow/saturation to put + // significant bits in the high bits of our 16-bit fixedpoint values, so + // that fixed-point approximate computations below are as accurate as + // possible. + const int16_t input_value_on_hires_input_scale = input_value << 7; + // Compute the input value on essentially the output scale, just not + // right-shifted yet. This is the value that we'll use in the (x >= +3) + // case, and that in the general case we'll multiply against the "relu-ish" + // fixed-point multiplier in [0, 1]. + const int16_t input_value_on_preshift_output_scale = + SaturatingRoundingDoublingHighMul(input_value_on_hires_input_scale, + params.output_multiplier_fixedpoint_int16); + // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that + // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general + // case, it is just that plus saturation at the boundaries of [-3, 3]. + // First, we rescale from [-3, 3] to [-1, 1], saturating. + // That is done by rescaling the input value with a fixed-point multiplier + // (reluish_multiplier_fixedpoint) and bit-shift such that we represent + // that input value on the scale where the real value 3.0f is represented + // by the quantized value 32768. (+32768 is actually not representable as + // int16, so this saturates at +32767, and that is seen empirically to be + // a negligible contribution to numerical error/bias). + // + // This code is careful to correctly implement any magnitude of multiplier, + // involving either a right shift or a left shift, with correct saturation + // behavior in the left-shift case. 
This forces this code to be more + // complicated, but is necessary for real applications: a partially + // trained quantized MobileNet v3-small model that motivated this code + // exhibits some large [min, max] range boundaries, of the order of + // magnitude of 10 or 100 depending on layers. + // + // The next few lines are basically just an ordinary + // MultiplyByQuantizedMultiplier, except that we are more careful here + // about the fine details of saturation when left-shifting, because here + // overflow in left-shift is a common case, not an anomaly as + // MultiplyByQuantizedMultiplier assumes. + int16_t reluish_value = input_value_on_hires_input_scale; + // Shift left, saturating, as much as we can while ensuring that this + // saturation will not contribute to the result. That is, left shift amount + // reduced by 1. + if (params.reluish_multiplier_exponent > 0) { + reluish_value = SaturatingLeftShift( + reluish_value, params.reluish_multiplier_exponent - 1); + } + // Apply the fixed-point multiplier, dividing the value by a divisor + // ranging in [1, 2]. + reluish_value = SaturatingRoundingDoublingHighMul(reluish_value, params.reluish_multiplier_fixedpoint_int16); + // Apply the last bit of left-shift. Thus, in the left-shifting case, if + // any saturation affects the result, it is happening here --- any + // saturation having occurred above is overwritten here, not affecting the + // result. + if (params.reluish_multiplier_exponent > 0) { + reluish_value = SaturatingLeftShift(reluish_value, 1); + } + // Shift right, in the right-shifting case. + if (params.reluish_multiplier_exponent < 0) { + reluish_value = RoundingDivideByPOT( + reluish_value, -params.reluish_multiplier_exponent); + } + // At this point we have rescaled the value into a 16bit fixedpoint + // reluish_value in [-1, 1]. + // We now convert that to a 16bit fixedpoint value in [0, 1]. + reluish_value = (reluish_value + (1 << 15)) >> 1; + // Use of SaturatingDoublingHighMul here is important to cancel the biases + // from the above SaturatingRoundingDoublingHighMul. + // + const int16_t preshift_output_value = SaturatingDoublingHighMul( + reluish_value, input_value_on_preshift_output_scale); + // We were so far operating on the pre-shift output scale. Now we finally + // apply that output shift, arriving at the final output scale. 
+ int16_t output_value = RoundingDivideByPOT( + preshift_output_value, -params.output_multiplier_exponent); + output_value += params.output_zero_point; + output_value = + std::min(output_value, std::numeric_limits::max()); + output_value = + std::max(output_value, std::numeric_limits::min()); + output_data[i] = output_value; + } +} + + +template +TfLiteStatus HardSwishQuantized(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + HardSwishParams params; + + params.input_zero_point = input->params.zero_point; + params.output_zero_point = output->params.zero_point; + + const float input_scale = input->params.scale; + const float hires_input_scale = (1.0f / 128.0f) * input_scale; + const float reluish_scale = 3.0f / 32768.0f; + const float output_scale = output->params.scale; + + const double output_multiplier = static_cast(hires_input_scale / output_scale); + int32_t output_multiplier_fixedpoint_int32; + QuantizeMultiplier(output_multiplier, &output_multiplier_fixedpoint_int32, + ¶ms.output_multiplier_exponent); + DownScaleInt32ToInt16Multiplier( + output_multiplier_fixedpoint_int32, + ¶ms.output_multiplier_fixedpoint_int16); + + TF_LITE_ENSURE(context, params.output_multiplier_exponent <= 0); + + const double reluish_multiplier = static_cast(hires_input_scale / reluish_scale); + int32_t reluish_multiplier_fixedpoint_int32; + QuantizeMultiplier(reluish_multiplier, &reluish_multiplier_fixedpoint_int32, + ¶ms.reluish_multiplier_exponent); + DownScaleInt32ToInt16Multiplier( + reluish_multiplier_fixedpoint_int32, + ¶ms.reluish_multiplier_fixedpoint_int16); + + HardSwishOp(params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +} + TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } @@ -107,7 +259,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } default: { - TF_LITE_KERNEL_LOG(context, "Only float32 is supported currently, got %s", + TF_LITE_KERNEL_LOG(context, "Only float32/int8/uint8 are supported currently, got %s", TfLiteTypeGetName(input->type)); return kTfLiteError; } @@ -148,13 +300,43 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } default: { - TF_LITE_KERNEL_LOG(context, "Only float32 is supported currently, got %s", + TF_LITE_KERNEL_LOG(context, "Only float32/int8/uint8 are supported currently, got %s", TfLiteTypeGetName(input->type)); return kTfLiteError; } } } +TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + +TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (input->type) { + case kTfLiteFloat32: { + HardSwishFloatOp( + GetTensorShape(input), + GetTensorData(input), + GetTensorShape(output), + GetTensorData(output)); + return kTfLiteOk; + } break; + case kTfLiteUInt8: { + return HardSwishQuantized(context, node); + } break; + case kTfLiteInt8: { + return HardSwishQuantized(context, node); + } break; + default: + TF_LITE_KERNEL_LOG(context, "Only float32/int8/uint8 are supported currently, got %s", + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } +} + } // namespace activations TfLiteRegistration* Register_RELU() { @@ -181,6 +363,13 @@ 
TfLiteRegistration* Register_RELU6() { return &r; } +TfLiteRegistration* Register_HARD_SWISH() { + static TfLiteRegistration r = {}; + r.prepare = activations::HardSwishPrepare; + r.invoke = activations::HardSwishEval; + return &r; +} + } // namespace micro } // namespace ops } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/activations_test.cc b/tensorflow/lite/micro/kernels/activations_test.cc index 221f8f66d58..0a7db1e6589 100644 --- a/tensorflow/lite/micro/kernels/activations_test.cc +++ b/tensorflow/lite/micro/kernels/activations_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/all_ops_resolver.h" @@ -135,6 +137,318 @@ void TestRelu6Float(const int* input_dims_data, const float* input_data, } } +void GenerateUniformRandomVector(int size, float min, float max, + std::minstd_rand* random_engine, + std::vector* result) { + // Never use std::uniform_*_distribution in tests, it's + // implementation-defined. Likewise, don't use std::default_random_engine, + // implementation-defined. Implementation-defined is bad because it means that + // any toolchain update or new platform may run into test failures. + // std::minstd_rand is a standard instantiation of + // std::linear_congruential_engine, the cheapest generator in c++11 stdlib, + // it's good enough here. + result->resize(size); + for (int i = 0; i < size; i++) { + // We don't care whether the `max` value may ever be produced exactly. + // It may actually be thanks to rounding, as std::minstd_rand::modulus + // is 2^31 - 1 is greater than the inverse float epsilon. + float random_value_scaled_0_1 = + (*random_engine)() * + (1.0f / static_cast(std::minstd_rand::modulus)); + (*result)[i] = min + (max - min) * random_value_scaled_0_1; + } +} + +void EvalTestReferenceHardSwish(int size, const std::vector& input, + std::vector* result) { + result->resize(size); + for (int i = 0; i < size; i++) { + const float in = input[i]; + (*result)[i] = in * std::min(6.0f, std::max(0.0f, in + 3)) * (1.0f / 6.0f); + } +} + +template +void TestHardSwishQuantized(int size, float input_min, + float input_max, float output_min, + float output_max, std::minstd_rand* random_engine) { + T output_data[size]; + T input_data_quantized[size]; + const int input_dims_data[] = {2, 1, size}; + const int output_dims_data[] = {2, 1, size}; + const float input_scale = ScaleFromMinMax(input_min, input_max); + const int input_zero_point = ZeroPointFromMinMax(input_min, input_max); + const float output_scale = ScaleFromMinMax(output_min, output_max); + const int output_zero_point = ZeroPointFromMinMax(output_min, output_max); + + // The numerical error for any 8bit quantized function is at least one half + // times the quantization step: 0.5 * (kOutMax - kOutMin) / 256. + // To that we add again the quantization step (kOutMax - kOutMin) / 256 + // to allow for an off-by-one rounding error. 
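+  // Worked example (ranges taken from the minmax pairs exercised below): with
+  // an input range of [0, 1] and an output range of [-5, 10], the larger
+  // range is 15, the quantization step is 15 / 256 ~= 0.059, and the
+  // tolerance works out to 1.5 * 0.059 ~= 0.088 in the dequantized domain.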
+ const float kTolerance = std::max(input_max - input_min, output_max - output_min) * (1.5f / 256.f); + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_elements_count = ElementCount(*output_dims); + + TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); + + float dequantized_output[output_elements_count]; + + std::vector float_input_values; + std::vector float_ref_output_values; + GenerateUniformRandomVector(size, input_min, input_max, random_engine, + &float_input_values); + EvalTestReferenceHardSwish(size, float_input_values, + &float_ref_output_values); + for (float& val : float_ref_output_values) { + val = std::min(output_max, std::max(output_min, val)); + } + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(float_input_values.data(), input_data_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"), + CreateQuantizedTensor(output_data, output_dims, output_scale, + output_zero_point, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_HARD_SWISH); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = nullptr; + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = nullptr; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + if (registration->free) { + registration->free(&context, user_data); + } + + AsymmetricDequantize(output_data, output_elements_count, output_scale, output_zero_point, dequantized_output); + + for (int i = 0; i < output_elements_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values.data()[i], dequantized_output[i], kTolerance); + } +} + +template +void TestHardSwishQuantizedBias(float input_min, float input_max, float output_min, + float output_max, float tolerated_bias) { + const float quantized_type_range = + static_cast(std::numeric_limits::max()) - + static_cast(std::numeric_limits::min()); + + + const float input_scale = ScaleFromMinMax(input_min, input_max); + const float output_scale = ScaleFromMinMax(output_min, output_max); + + const int input_zero_point = ZeroPointFromMinMax(input_min, input_max); + const int output_zero_point = ZeroPointFromMinMax(output_min, output_max); + + const float max_scale = std::max(output_scale, input_scale); + + // In this bias-focused test case, no need for randomly generated input + // values. 
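+  // Instead, the loop below enumerates every representable quantized input
+  // whose dequantized value lies in [-3, 3], the interval where hard swish is
+  // neither identically zero (x <= -3) nor the identity (x >= 3). The test
+  // then checks that the average error over that interval, expressed in units
+  // of the larger of the two quantization scales, stays within tolerated_bias.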
+ TF_LITE_MICRO_EXPECT_LE(input_min, -3.0f); + TF_LITE_MICRO_EXPECT_GE(input_max, 3.0f); + const int quantized_input_negative_three = + std::round(std::numeric_limits::min() + + (-3.0f - input_min) / input_scale); + const int quantized_input_positive_three = + std::round(std::numeric_limits::min() + + (3.0f - input_min) / input_scale); + std::vector float_input_values; + for (int i = quantized_input_negative_three; + i <= quantized_input_positive_three; i++) { + float_input_values.push_back( + input_min + + (i - std::numeric_limits::min()) * input_scale); + } + const int size = float_input_values.size(); + std::vector float_ref_output_values; + EvalTestReferenceHardSwish(size, float_input_values, + &float_ref_output_values); + for (float& val : float_ref_output_values) { + val = std::min(output_max, std::max(output_min, val)); + } + + T output_data[size]; + T input_data_quantized[size]; + const int input_dims_data[] = {2, 1, size}; + const int output_dims_data[] = {2, 1, size}; + + // The numerical error for any 8bit quantized function is at least one half + // times the quantization step: 0.5 * (kOutMax - kOutMin) / 256. + // To that we add again the quantization step (kOutMax - kOutMin) / 256 + // to allow for an off-by-one rounding error. + const float kTolerance = std::max(input_max - input_min, output_max - output_min) * (1.5f / 256.f); + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_elements_count = ElementCount(*output_dims); + + TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); + + float dequantized_output[output_elements_count]; + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(float_input_values.data(), input_data_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"), + CreateQuantizedTensor(output_data, output_dims, output_scale, + output_zero_point, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_HARD_SWISH); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = nullptr; + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = nullptr; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + if (registration->free) { + registration->free(&context, user_data); + } + + AsymmetricDequantize(output_data, output_elements_count, output_scale, output_zero_point, dequantized_output); + + float sum_diff = 0; + for (int i = 0; i < 
size; i++) { + sum_diff += dequantized_output[i] - float_ref_output_values[i]; + } + const float bias = sum_diff / (size * max_scale); + TF_LITE_MICRO_EXPECT_LE(std::abs(bias), tolerated_bias); +} + +void TestHardSwishFloat(int size, std::minstd_rand* random_engine) { + std::vector float_input_values; + const float kMin = -10.0f; + const float kMax = 10.0f; + GenerateUniformRandomVector(size, kMin, kMax, random_engine, + &float_input_values); + std::vector float_ref_output_values; + EvalTestReferenceHardSwish(size, float_input_values, + &float_ref_output_values); + + float output_data[size]; + const int input_dims_data[] = {1, size}; + const int output_dims_data[] = {1, size}; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_elements_count = ElementCount(*output_dims); + + TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(float_input_values.data(), input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_HARD_SWISH); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = nullptr; + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = nullptr; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_elements_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values.data()[i], output_data[i], 1e-5f); + } +} + void TestReluUint8(const int* input_dims_data, const float* input_data, uint8_t* input_data_quantized, const float input_scale, const int input_zero_point, const float* golden, @@ -431,6 +745,46 @@ TF_LITE_MICRO_TEST(SimpleRelu6TestFloat) { output_data); } +TF_LITE_MICRO_TEST(SimpleHardSwishTestFloat) { + std::minstd_rand random_engine; + for (int size : {1, 2, 3, 4, 10, 20, 30, 40, 100}) { + tflite::testing::TestHardSwishFloat(size, &random_engine); + } +} + +TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantized) { + std::minstd_rand random_engine; + std::vector> minmax_pairs{ + {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; + for (const auto& input_minmax : minmax_pairs) { + for (const auto& output_minmax : minmax_pairs) { + float input_min = input_minmax.first; + float input_max = input_minmax.second; + float output_min = 
output_minmax.first; + float output_max = output_minmax.second; + for (int size : {1, 3, 10, 100}) { + tflite::testing::TestHardSwishQuantized(size, input_min, input_max, + output_min, output_max, + &random_engine); + tflite::testing::TestHardSwishQuantized(size, input_min, input_max, + output_min, output_max, + &random_engine); + } + } + } +} + +// See the comment in the reference implementation of quantized HardSwish: +// A numerical issue significantly affecting ImageNet classification accuracy +// with MobileNet v3 is only observable at the scale of HardSwish unit tests +// if we monitor specifically bias. This testcase is extracted from one of the +// HardSwish nodes in that MobileNet v3 that exhibited this issue. +TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantizedBias) { + tflite::testing::TestHardSwishQuantizedBias(-11.654928f, 25.036512f, + -0.3905796f, 24.50887f, 0.035); +} + + TF_LITE_MICRO_TEST(SimpleReluTestUint8) { const int elements_count = 10; diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 24180aab8c5..25f9bd41d63 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -46,6 +46,7 @@ TfLiteRegistration* Register_FLOOR(); TfLiteRegistration* Register_FULLY_CONNECTED(); TfLiteRegistration* Register_GREATER(); TfLiteRegistration* Register_GREATER_EQUAL(); +TfLiteRegistration* Register_HARD_SWISH(); TfLiteRegistration* Register_LESS(); TfLiteRegistration* Register_LESS_EQUAL(); TfLiteRegistration* Register_LOG(); diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 1b76f440a61..d77dc9577fd 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -217,6 +217,14 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseOpData); } + TfLiteStatus AddHardSwish() { + // TODO(b/149408647): Replace ParseOpData with the operator specific parse + // function. + return AddBuiltin(BuiltinOperator_HARD_SWISH, + *tflite::ops::micro::Register_HARD_SWISH(), + ParseOpData); + } + TfLiteStatus AddL2Normalization() { // TODO(b/149408647): Replace ParseOpData with the operator specific parse // function. 
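With AddHardSwish() available, an application that builds a trimmed-down resolver can opt into the op explicitly instead of linking AllOpsResolver. The sketch below shows that call pattern; it is illustrative only, since the resolver's op-count template parameter and the set of other Add...() helpers vary between TFLite Micro revisions.

#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// Illustrative sketch: register only the ops the model actually uses.
// The <4> capacity argument is an assumption about this revision's API;
// drop it if MicroMutableOpResolver is not templated on an op count here.
tflite::MicroMutableOpResolver<4> resolver;
resolver.AddHardSwish();
// ...Add calls for the remaining ops used by the model.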
diff --git a/tensorflow/lite/micro/micro_utils.h b/tensorflow/lite/micro/micro_utils.h index 4f8689b943e..0a4ab1ef09c 100644 --- a/tensorflow/lite/micro/micro_utils.h +++ b/tensorflow/lite/micro/micro_utils.h @@ -94,6 +94,16 @@ void SymmetricDequantize(const int8_t* values, const int size, const float dequantization_scale, float* dequantized_values); +template +void AsymmetricDequantize(const T* values, const int size, + const float dequantization_scale, + int dequantization_zero_point, + float* dequantized_values) { + for (int i = 0; i < size; ++i) { + dequantized_values[i] = (values[i] - dequantization_zero_point) * dequantization_scale; + } +} + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_MICRO_UTILS_H_ From 820257026fa125ee102948986f0e6826b6bee883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 2 Jun 2020 09:35:02 +0200 Subject: [PATCH 0870/1390] TFLu: Fix variable length array error in hard_swish unit test --- .../lite/micro/kernels/activations_test.cc | 87 ++++++++++++------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/micro/kernels/activations_test.cc b/tensorflow/lite/micro/kernels/activations_test.cc index 0a7db1e6589..03a6b698a86 100644 --- a/tensorflow/lite/micro/kernels/activations_test.cc +++ b/tensorflow/lite/micro/kernels/activations_test.cc @@ -169,11 +169,9 @@ void EvalTestReferenceHardSwish(int size, const std::vector& input, } template -void TestHardSwishQuantized(int size, float input_min, - float input_max, float output_min, +void TestHardSwishQuantized(int size, const T* output_data, T* input_data_quantized, float* dequantized_output, + float input_min, float input_max, float output_min, float output_max, std::minstd_rand* random_engine) { - T output_data[size]; - T input_data_quantized[size]; const int input_dims_data[] = {2, 1, size}; const int output_dims_data[] = {2, 1, size}; const float input_scale = ScaleFromMinMax(input_min, input_max); @@ -193,8 +191,6 @@ void TestHardSwishQuantized(int size, float input_min, TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); - float dequantized_output[output_elements_count]; - std::vector float_input_values; std::vector float_ref_output_values; GenerateUniformRandomVector(size, input_min, input_max, random_engine, @@ -262,8 +258,9 @@ void TestHardSwishQuantized(int size, float input_min, } template -void TestHardSwishQuantizedBias(float input_min, float input_max, float output_min, - float output_max, float tolerated_bias) { +void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_data_quantized, + float* dequantized_output, float input_min, float input_max, + float output_min, float output_max, float tolerated_bias) { const float quantized_type_range = static_cast(std::numeric_limits::max()) - static_cast(std::numeric_limits::min()); @@ -294,7 +291,8 @@ void TestHardSwishQuantizedBias(float input_min, float input_max, float output_m input_min + (i - std::numeric_limits::min()) * input_scale); } - const int size = float_input_values.size(); + TF_LITE_MICRO_EXPECT_EQ(float_input_values.size(), size); + std::vector float_ref_output_values; EvalTestReferenceHardSwish(size, float_input_values, &float_ref_output_values); @@ -302,8 +300,6 @@ void TestHardSwishQuantizedBias(float input_min, float input_max, float output_m val = std::min(output_max, std::max(output_min, val)); } - T output_data[size]; - T input_data_quantized[size]; const int input_dims_data[] = {2, 1, size}; const int output_dims_data[] = {2, 1, size}; @@ -319,8 +315,6 
@@ void TestHardSwishQuantizedBias(float input_min, float input_max, float output_m TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); - float dequantized_output[output_elements_count]; - constexpr int inputs_size = 1; constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; @@ -370,7 +364,8 @@ void TestHardSwishQuantizedBias(float input_min, float input_max, float output_m registration->free(&context, user_data); } - AsymmetricDequantize(output_data, output_elements_count, output_scale, output_zero_point, dequantized_output); + AsymmetricDequantize(output_data, output_elements_count, output_scale, + output_zero_point, dequantized_output); float sum_diff = 0; for (int i = 0; i < size; i++) { @@ -380,7 +375,7 @@ void TestHardSwishQuantizedBias(float input_min, float input_max, float output_m TF_LITE_MICRO_EXPECT_LE(std::abs(bias), tolerated_bias); } -void TestHardSwishFloat(int size, std::minstd_rand* random_engine) { +void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* random_engine) { std::vector float_input_values; const float kMin = -10.0f; const float kMax = 10.0f; @@ -390,7 +385,6 @@ void TestHardSwishFloat(int size, std::minstd_rand* random_engine) { EvalTestReferenceHardSwish(size, float_input_values, &float_ref_output_values); - float output_data[size]; const int input_dims_data[] = {1, size}; const int output_dims_data[] = {1, size}; @@ -747,29 +741,57 @@ TF_LITE_MICRO_TEST(SimpleRelu6TestFloat) { TF_LITE_MICRO_TEST(SimpleHardSwishTestFloat) { std::minstd_rand random_engine; - for (int size : {1, 2, 3, 4, 10, 20, 30, 40, 100}) { - tflite::testing::TestHardSwishFloat(size, &random_engine); - } + constexpr int size = 100; + float output_data[size] = {0.f}; + + tflite::testing::TestHardSwishFloat(size, output_data, &random_engine); } -TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantized) { + +TF_LITE_MICRO_TEST(SimpleHardSwishTestInt8) { std::minstd_rand random_engine; std::vector> minmax_pairs{ {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; + constexpr int size = 101; + constexpr int8_t output_data[size] = {0}; + int8_t input_data_quantized[size] = {0}; + float dequantized_output[size] = {0.f}; + for (const auto& input_minmax : minmax_pairs) { for (const auto& output_minmax : minmax_pairs) { float input_min = input_minmax.first; float input_max = input_minmax.second; float output_min = output_minmax.first; float output_max = output_minmax.second; - for (int size : {1, 3, 10, 100}) { - tflite::testing::TestHardSwishQuantized(size, input_min, input_max, - output_min, output_max, - &random_engine); - tflite::testing::TestHardSwishQuantized(size, input_min, input_max, - output_min, output_max, - &random_engine); - } + + tflite::testing::TestHardSwishQuantized(size, output_data, input_data_quantized, dequantized_output, + input_min, input_max, output_min, output_max, + &random_engine); + + } + } +} + +TF_LITE_MICRO_TEST(SimpleHardSwishTestUint8) { + std::minstd_rand random_engine; + std::vector> minmax_pairs{ + {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; + constexpr int size = 99; + constexpr uint8_t output_data[size] = {0}; + uint8_t input_data_quantized[size] = {0}; + float dequantized_output[size] = {0.f}; + + for (const auto& input_minmax : minmax_pairs) { + for (const auto& output_minmax : minmax_pairs) { + float input_min = input_minmax.first; + float input_max = input_minmax.second; + float output_min = output_minmax.first; + float output_max = output_minmax.second; + + 
tflite::testing::TestHardSwishQuantized(size, output_data, input_data_quantized, dequantized_output, + input_min, input_max, output_min, output_max, + &random_engine); + } } } @@ -780,8 +802,13 @@ TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantized) { // if we monitor specifically bias. This testcase is extracted from one of the // HardSwish nodes in that MobileNet v3 that exhibited this issue. TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantizedBias) { - tflite::testing::TestHardSwishQuantizedBias(-11.654928f, 25.036512f, - -0.3905796f, 24.50887f, 0.035); + constexpr int size = 43; + constexpr uint8_t output_data[size] = {0}; + uint8_t input_data_quantized[size] = {0}; + float dequantized_output[size] = {0.f}; + + tflite::testing::TestHardSwishQuantizedBias(size, output_data, input_data_quantized, dequantized_output, + -11.654928f, 25.036512f, -0.3905796f, 24.50887f, 0.035); } From c2fc5dccfa021fa8f7ec48da874c0d91a2d211c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Wed, 10 Jun 2020 11:14:27 +0200 Subject: [PATCH 0871/1390] TFLu: Update hard swish op as per review comments --- tensorflow/lite/kernels/internal/BUILD | 2 + .../kernels/internal/reference/activations.h | 54 +++++++ .../internal/reference/reference_ops.h | 15 +- tensorflow/lite/micro/kernels/activations.cc | 150 +++++++++--------- .../lite/micro/kernels/activations_test.cc | 121 +++++++------- 5 files changed, 196 insertions(+), 146 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/reference/activations.h diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index a02a5bf3981..31a6a74eb5e 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -437,6 +437,7 @@ cc_library( name = "reference_base", srcs = [], hdrs = [ + "reference/activations.h", "reference/add.h", "reference/arg_min_max.h", "reference/batch_matmul.h", @@ -525,6 +526,7 @@ cc_library( name = "legacy_reference_base", srcs = [], hdrs = [ + "reference/activations.h", "reference/add.h", "reference/arg_min_max.h", "reference/binary_function.h", diff --git a/tensorflow/lite/kernels/internal/reference/activations.h b/tensorflow/lite/kernels/internal/reference/activations.h new file mode 100644 index 00000000000..2110f5e3cbc --- /dev/null +++ b/tensorflow/lite/kernels/internal/reference/activations.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_ + +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/internal/common.h" + + +namespace tflite { +namespace reference_ops { + +inline std::int32_t RoundingDivideByPOT(std::int32_t numerator, int exponent) { + std::int32_t sign = numerator >= 0 ? 
1 : -1; + std::int32_t abs_numerator = std::abs(numerator); + std::int32_t mask = (1LL << exponent) - 1; + std::int32_t remainder = abs_numerator & mask; + std::int32_t threshold = mask >> 1; + std::int32_t abs_result = + (abs_numerator >> exponent) + (remainder > threshold ? 1 : 0); + return sign * abs_result; +} + +template +inline void HardSwish(const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float"); + auto matching_size = MatchingFlatSize(input_shape, output_shape); + const T* in_end = input_data + matching_size; + for (; input_data < in_end; input_data++, output_data++) { + const float in = *input_data; + *output_data = + in * std::min(static_cast(6), std::max(static_cast(0), in + 3)) / + 6; + } +} + +} // namespace reference_ops +} // namespace tflite + + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_ diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index b94c2060b2d..3e9ebff69ad 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/activations.h" #include "tensorflow/lite/kernels/internal/reference/add.h" #include "tensorflow/lite/kernels/internal/reference/arg_min_max.h" #include "tensorflow/lite/kernels/internal/reference/binary_function.h" @@ -2601,20 +2602,6 @@ void ReverseSequence(const TS* seq_lengths, const int seq_dim, } } -template -inline void HardSwish(const RuntimeShape& input_shape, const T* input_data, - const RuntimeShape& output_shape, T* output_data) { - ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float"); - auto matching_size = MatchingFlatSize(input_shape, output_shape); - const T* in_end = input_data + matching_size; - for (; input_data < in_end; input_data++, output_data++) { - const float in = *input_data; - *output_data = - in * std::min(static_cast(6), std::max(static_cast(0), in + 3)) / - 6; - } -} - template inline void HardSwish(const HardSwishParams& params, const RuntimeShape& input_shape, const T* input_data, diff --git a/tensorflow/lite/micro/kernels/activations.cc b/tensorflow/lite/micro/kernels/activations.cc index 75b0339002d..934f029ac6a 100644 --- a/tensorflow/lite/micro/kernels/activations.cc +++ b/tensorflow/lite/micro/kernels/activations.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/micro_utils.h" @@ -78,33 +79,10 @@ inline void Relu6Quantized(Q lower, Q upper, const RuntimeShape& input_shape, } } -inline std::int32_t RoundingDivideByPOT(std::int32_t numerator, int exponent) { - std::int32_t sign = numerator >= 0 ? 
1 : -1; - std::int32_t abs_numerator = std::abs(numerator); - std::int32_t mask = (1LL << exponent) - 1; - std::int32_t remainder = abs_numerator & mask; - std::int32_t threshold = mask >> 1; - std::int32_t abs_result = - (abs_numerator >> exponent) + (remainder > threshold ? 1 : 0); - return sign * abs_result; -} - -inline void HardSwishFloatOp(const RuntimeShape& input_shape, const float* input_data, - const RuntimeShape& output_shape, float* output_data) { - auto matching_size = MatchingFlatSize(input_shape, output_shape); - const float* in_end = input_data + matching_size; - for (; input_data < in_end; input_data++, output_data++) { - const float in = *input_data; - *output_data = - in * std::min(static_cast(6), std::max(static_cast(0), in + 3)) / - 6; - } -} - template void HardSwishOp(HardSwishParams& params, - const RuntimeShape& input_shape, const T* input_data, - const RuntimeShape& output_shape, T* output_data) { + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { @@ -165,7 +143,7 @@ void HardSwishOp(HardSwishParams& params, } // Shift right, in the right-shifting case. if (params.reluish_multiplier_exponent < 0) { - reluish_value = RoundingDivideByPOT( + reluish_value = tflite::reference_ops::RoundingDivideByPOT( reluish_value, -params.reluish_multiplier_exponent); } // At this point we have rescaled the value into a 16bit fixedpoint @@ -179,7 +157,7 @@ void HardSwishOp(HardSwishParams& params, reluish_value, input_value_on_preshift_output_scale); // We were so far operating on the pre-shift output scale. Now we finally // apply that output shift, arriving at the final output scale. 
- int16_t output_value = RoundingDivideByPOT( + int16_t output_value = tflite::reference_ops::RoundingDivideByPOT( preshift_output_value, -params.output_multiplier_exponent); output_value += params.output_zero_point; output_value = @@ -190,46 +168,11 @@ void HardSwishOp(HardSwishParams& params, } } - -template -TfLiteStatus HardSwishQuantized(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - - HardSwishParams params; - - params.input_zero_point = input->params.zero_point; - params.output_zero_point = output->params.zero_point; - - const float input_scale = input->params.scale; - const float hires_input_scale = (1.0f / 128.0f) * input_scale; - const float reluish_scale = 3.0f / 32768.0f; - const float output_scale = output->params.scale; - - const double output_multiplier = static_cast(hires_input_scale / output_scale); - int32_t output_multiplier_fixedpoint_int32; - QuantizeMultiplier(output_multiplier, &output_multiplier_fixedpoint_int32, - ¶ms.output_multiplier_exponent); - DownScaleInt32ToInt16Multiplier( - output_multiplier_fixedpoint_int32, - ¶ms.output_multiplier_fixedpoint_int16); - - TF_LITE_ENSURE(context, params.output_multiplier_exponent <= 0); - - const double reluish_multiplier = static_cast(hires_input_scale / reluish_scale); - int32_t reluish_multiplier_fixedpoint_int32; - QuantizeMultiplier(reluish_multiplier, &reluish_multiplier_fixedpoint_int32, - ¶ms.reluish_multiplier_exponent); - DownScaleInt32ToInt16Multiplier( - reluish_multiplier_fixedpoint_int32, - ¶ms.reluish_multiplier_fixedpoint_int16); - - HardSwishOp(params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; -} - TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) { + // Validate number of inputs and outputs + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + return kTfLiteOk; } @@ -267,6 +210,10 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) { + // Validate number of inputs and outputs + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + return kTfLiteOk; } @@ -307,34 +254,84 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { } } +void* HardSwishInit(TfLiteContext* context, const char* buffer, size_t length) { + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(HardSwishParams), &data) == + kTfLiteError) { + return nullptr; + } + return data; +} + TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { + HardSwishParams* params = static_cast(node->user_data); + + params->input_zero_point = input->params.zero_point; + params->output_zero_point = output->params.zero_point; + + const float input_scale = input->params.scale; + const float hires_input_scale = (1.0f / 128.0f) * input_scale; + const float reluish_scale = 3.0f / 32768.0f; + const float output_scale = output->params.scale; + + const double 
output_multiplier = static_cast(hires_input_scale / output_scale); + int32_t output_multiplier_fixedpoint_int32; + QuantizeMultiplier(output_multiplier, &output_multiplier_fixedpoint_int32, + ¶ms->output_multiplier_exponent); + DownScaleInt32ToInt16Multiplier( + output_multiplier_fixedpoint_int32, + ¶ms->output_multiplier_fixedpoint_int16); + + TF_LITE_ENSURE(context, params->output_multiplier_exponent <= 0); + + const double reluish_multiplier = static_cast(hires_input_scale / reluish_scale); + int32_t reluish_multiplier_fixedpoint_int32; + QuantizeMultiplier(reluish_multiplier, &reluish_multiplier_fixedpoint_int32, + ¶ms->reluish_multiplier_exponent); + DownScaleInt32ToInt16Multiplier( + reluish_multiplier_fixedpoint_int32, + ¶ms->reluish_multiplier_fixedpoint_int16); + } + return kTfLiteOk; } TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + HardSwishParams* params = static_cast(node->user_data); switch (input->type) { case kTfLiteFloat32: { - HardSwishFloatOp( + tflite::reference_ops::HardSwish( GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; } break; case kTfLiteUInt8: { - return HardSwishQuantized(context, node); + HardSwishOp(*params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(output), GetTensorData(output)); } break; case kTfLiteInt8: { - return HardSwishQuantized(context, node); + HardSwishOp(*params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(output), GetTensorData(output)); } break; - default: + default: { TF_LITE_KERNEL_LOG(context, "Only float32/int8/uint8 are supported currently, got %s", TfLiteTypeGetName(input->type)); return kTfLiteError; + } } + return kTfLiteOk; } } // namespace activations @@ -364,9 +361,14 @@ TfLiteRegistration* Register_RELU6() { } TfLiteRegistration* Register_HARD_SWISH() { - static TfLiteRegistration r = {}; - r.prepare = activations::HardSwishPrepare; - r.invoke = activations::HardSwishEval; + static TfLiteRegistration r = {/*init=*/activations::HardSwishInit, + /*free=*/nullptr, + /*prepare=*/activations::HardSwishPrepare, + /*invoke=*/activations::HardSwishEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; return &r; } diff --git a/tensorflow/lite/micro/kernels/activations_test.cc b/tensorflow/lite/micro/kernels/activations_test.cc index 03a6b698a86..b1d8e429025 100644 --- a/tensorflow/lite/micro/kernels/activations_test.cc +++ b/tensorflow/lite/micro/kernels/activations_test.cc @@ -139,7 +139,7 @@ void TestRelu6Float(const int* input_dims_data, const float* input_data, void GenerateUniformRandomVector(int size, float min, float max, std::minstd_rand* random_engine, - std::vector* result) { + float* result) { // Never use std::uniform_*_distribution in tests, it's // implementation-defined. Likewise, don't use std::default_random_engine, // implementation-defined. Implementation-defined is bad because it means that @@ -147,7 +147,6 @@ void GenerateUniformRandomVector(int size, float min, float max, // std::minstd_rand is a standard instantiation of // std::linear_congruential_engine, the cheapest generator in c++11 stdlib, // it's good enough here. - result->resize(size); for (int i = 0; i < size; i++) { // We don't care whether the `max` value may ever be produced exactly. 
// It may actually be thanks to rounding, as std::minstd_rand::modulus @@ -155,23 +154,23 @@ void GenerateUniformRandomVector(int size, float min, float max, float random_value_scaled_0_1 = (*random_engine)() * (1.0f / static_cast(std::minstd_rand::modulus)); - (*result)[i] = min + (max - min) * random_value_scaled_0_1; + result[i] = min + (max - min) * random_value_scaled_0_1; } } -void EvalTestReferenceHardSwish(int size, const std::vector& input, - std::vector* result) { - result->resize(size); +void EvalTestReferenceHardSwish(int size, float* input, + float* result) { for (int i = 0; i < size; i++) { const float in = input[i]; - (*result)[i] = in * std::min(6.0f, std::max(0.0f, in + 3)) * (1.0f / 6.0f); + result[i] = in * std::min(6.0f, std::max(0.0f, in + 3)) * (1.0f / 6.0f); } } template void TestHardSwishQuantized(int size, const T* output_data, T* input_data_quantized, float* dequantized_output, float input_min, float input_max, float output_min, - float output_max, std::minstd_rand* random_engine) { + float output_max, std::minstd_rand* random_engine, + float *float_input_values, float* float_ref_output_values) { const int input_dims_data[] = {2, 1, size}; const int output_dims_data[] = {2, 1, size}; const float input_scale = ScaleFromMinMax(input_min, input_max); @@ -191,21 +190,20 @@ void TestHardSwishQuantized(int size, const T* output_data, T* input_data_quanti TF_LITE_MICRO_EXPECT_EQ(output_elements_count, size); - std::vector float_input_values; - std::vector float_ref_output_values; GenerateUniformRandomVector(size, input_min, input_max, random_engine, - &float_input_values); + float_input_values); EvalTestReferenceHardSwish(size, float_input_values, - &float_ref_output_values); - for (float& val : float_ref_output_values) { - val = std::min(output_max, std::max(output_min, val)); + float_ref_output_values); + for (int i = 0; i < size; i++) { + float val = float_ref_output_values[i]; + float_ref_output_values[i] = std::min(output_max, std::max(output_min, val)); } constexpr int inputs_size = 1; constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { - CreateQuantizedTensor(float_input_values.data(), input_data_quantized, input_dims, + CreateQuantizedTensor(float_input_values, input_data_quantized, input_dims, input_scale, input_zero_point, "input_tensor"), CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point, "output_tensor"), @@ -253,19 +251,19 @@ void TestHardSwishQuantized(int size, const T* output_data, T* input_data_quanti AsymmetricDequantize(output_data, output_elements_count, output_scale, output_zero_point, dequantized_output); for (int i = 0; i < output_elements_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values.data()[i], dequantized_output[i], kTolerance); + TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values[i], dequantized_output[i], kTolerance); } } template void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_data_quantized, float* dequantized_output, float input_min, float input_max, - float output_min, float output_max, float tolerated_bias) { + float output_min, float output_max, float tolerated_bias, + float* float_input_values, float* float_ref_output_values) { const float quantized_type_range = static_cast(std::numeric_limits::max()) - static_cast(std::numeric_limits::min()); - const float input_scale = ScaleFromMinMax(input_min, input_max); const float output_scale = ScaleFromMinMax(output_min, output_max); @@ 
-284,20 +282,18 @@ void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_d const int quantized_input_positive_three = std::round(std::numeric_limits::min() + (3.0f - input_min) / input_scale); - std::vector float_input_values; + for (int i = quantized_input_negative_three; i <= quantized_input_positive_three; i++) { - float_input_values.push_back( - input_min + - (i - std::numeric_limits::min()) * input_scale); + float_input_values[i] = input_min + + (i - std::numeric_limits::min()) * input_scale; } - TF_LITE_MICRO_EXPECT_EQ(float_input_values.size(), size); - std::vector float_ref_output_values; EvalTestReferenceHardSwish(size, float_input_values, - &float_ref_output_values); - for (float& val : float_ref_output_values) { - val = std::min(output_max, std::max(output_min, val)); + float_ref_output_values); + for (int i = 0; i < size; i++) { + float val = float_ref_output_values[i]; + float_ref_output_values[i] = std::min(output_max, std::max(output_min, val)); } const int input_dims_data[] = {2, 1, size}; @@ -319,7 +315,7 @@ void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_d constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { - CreateQuantizedTensor(float_input_values.data(), input_data_quantized, input_dims, + CreateQuantizedTensor(float_input_values, input_data_quantized, input_dims, input_scale, input_zero_point, "input_tensor"), CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point, "output_tensor"), @@ -375,15 +371,15 @@ void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_d TF_LITE_MICRO_EXPECT_LE(std::abs(bias), tolerated_bias); } -void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* random_engine) { - std::vector float_input_values; +void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* random_engine, + float* float_input_values, float* float_ref_output_values) { const float kMin = -10.0f; const float kMax = 10.0f; GenerateUniformRandomVector(size, kMin, kMax, random_engine, - &float_input_values); - std::vector float_ref_output_values; + float_input_values); + EvalTestReferenceHardSwish(size, float_input_values, - &float_ref_output_values); + float_ref_output_values); const int input_dims_data[] = {1, size}; const int output_dims_data[] = {1, size}; @@ -398,7 +394,7 @@ void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* ra constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(float_input_values.data(), input_dims, "input_tensor"), + CreateFloatTensor(float_input_values, input_dims, "input_tensor"), CreateFloatTensor(output_data, output_dims, "output_tensor"), }; TfLiteContext context; @@ -439,7 +435,7 @@ void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* ra } for (int i = 0; i < output_elements_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values.data()[i], output_data[i], 1e-5f); + TF_LITE_MICRO_EXPECT_NEAR(float_ref_output_values[i], output_data[i], 1e-5f); } } @@ -743,55 +739,62 @@ TF_LITE_MICRO_TEST(SimpleHardSwishTestFloat) { std::minstd_rand random_engine; constexpr int size = 100; float output_data[size] = {0.f}; + float input_values[size] = {0.f}; + float output_values[size] = {0.f}; - tflite::testing::TestHardSwishFloat(size, output_data, &random_engine); + 
tflite::testing::TestHardSwishFloat(size, output_data, &random_engine, + input_values, output_values); } TF_LITE_MICRO_TEST(SimpleHardSwishTestInt8) { std::minstd_rand random_engine; - std::vector> minmax_pairs{ - {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; + constexpr int pairs = 4, one_pair = 2; constexpr int size = 101; + constexpr float minmax_pairs[pairs][one_pair] = { + {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; constexpr int8_t output_data[size] = {0}; int8_t input_data_quantized[size] = {0}; float dequantized_output[size] = {0.f}; + float input_values[size] = {0.f}; + float output_values[size] = {0.f}; - for (const auto& input_minmax : minmax_pairs) { - for (const auto& output_minmax : minmax_pairs) { - float input_min = input_minmax.first; - float input_max = input_minmax.second; - float output_min = output_minmax.first; - float output_max = output_minmax.second; + for (int x = 0; x < pairs; x++) { + for (int y = 0; y < pairs; y++) { + float input_min = minmax_pairs[x][0]; + float input_max = minmax_pairs[x][1]; + float output_min = minmax_pairs[y][0]; + float output_max = minmax_pairs[y][1]; tflite::testing::TestHardSwishQuantized(size, output_data, input_data_quantized, dequantized_output, input_min, input_max, output_min, output_max, - &random_engine); - + &random_engine, input_values, output_values); } } } TF_LITE_MICRO_TEST(SimpleHardSwishTestUint8) { std::minstd_rand random_engine; - std::vector> minmax_pairs{ - {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; constexpr int size = 99; + constexpr int pairs = 4, one_pair = 2; + constexpr float minmax_pairs[pairs][one_pair] = { + {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}}; constexpr uint8_t output_data[size] = {0}; uint8_t input_data_quantized[size] = {0}; float dequantized_output[size] = {0.f}; + float input_values[size] = {0.f}; + float output_values[size] = {0.f}; - for (const auto& input_minmax : minmax_pairs) { - for (const auto& output_minmax : minmax_pairs) { - float input_min = input_minmax.first; - float input_max = input_minmax.second; - float output_min = output_minmax.first; - float output_max = output_minmax.second; + for (int x = 0; x < pairs; x++) { + for (int y = 0; y < pairs; y++) { + float input_min = minmax_pairs[x][0]; + float input_max = minmax_pairs[x][1]; + float output_min = minmax_pairs[y][0]; + float output_max = minmax_pairs[y][1]; tflite::testing::TestHardSwishQuantized(size, output_data, input_data_quantized, dequantized_output, input_min, input_max, output_min, output_max, - &random_engine); - + &random_engine, input_values, output_values); } } } @@ -806,12 +809,14 @@ TF_LITE_MICRO_TEST(SimpleHardSwishTestQuantizedBias) { constexpr uint8_t output_data[size] = {0}; uint8_t input_data_quantized[size] = {0}; float dequantized_output[size] = {0.f}; + float input_values[size] = {0.f}; + float output_values[size] = {0.f}; tflite::testing::TestHardSwishQuantizedBias(size, output_data, input_data_quantized, dequantized_output, - -11.654928f, 25.036512f, -0.3905796f, 24.50887f, 0.035); + -11.654928f, 25.036512f, -0.3905796f, 24.50887f, 0.035, + input_values, output_values); } - TF_LITE_MICRO_TEST(SimpleReluTestUint8) { const int elements_count = 10; From 801779e9231ec58497b6ce4d921abe459045a87d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Thu, 11 Jun 2020 15:56:58 +0200 Subject: [PATCH 0872/1390] TFLu: do not directly include reference_ops.h --- tensorflow/lite/kernels/internal/reference/activations.h | 2 +- 
tensorflow/lite/micro/kernels/activations.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/activations.h b/tensorflow/lite/kernels/internal/reference/activations.h index 2110f5e3cbc..c043f1e0845 100644 --- a/tensorflow/lite/kernels/internal/reference/activations.h +++ b/tensorflow/lite/kernels/internal/reference/activations.h @@ -15,10 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_ +#include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/internal/common.h" - namespace tflite { namespace reference_ops { diff --git a/tensorflow/lite/micro/kernels/activations.cc b/tensorflow/lite/micro/kernels/activations.cc index 934f029ac6a..a536878e4cb 100644 --- a/tensorflow/lite/micro/kernels/activations.cc +++ b/tensorflow/lite/micro/kernels/activations.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/lite/kernels/internal/reference/activations.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/micro_utils.h" From 7473be170522cf994f541de07f867648b25c450e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 23 Jun 2020 10:09:18 +0200 Subject: [PATCH 0873/1390] TFLu: remove parameter to CreateTensor calls as it was removed --- tensorflow/lite/micro/kernels/activations_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/kernels/activations_test.cc b/tensorflow/lite/micro/kernels/activations_test.cc index b1d8e429025..d8fa1c56a5d 100644 --- a/tensorflow/lite/micro/kernels/activations_test.cc +++ b/tensorflow/lite/micro/kernels/activations_test.cc @@ -204,9 +204,9 @@ void TestHardSwishQuantized(int size, const T* output_data, T* input_data_quanti constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { CreateQuantizedTensor(float_input_values, input_data_quantized, input_dims, - input_scale, input_zero_point, "input_tensor"), + input_scale, input_zero_point), CreateQuantizedTensor(output_data, output_dims, output_scale, - output_zero_point, "output_tensor"), + output_zero_point), }; TfLiteContext context; @@ -316,9 +316,9 @@ void TestHardSwishQuantizedBias(const int size, const T* output_data, T* input_d constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { CreateQuantizedTensor(float_input_values, input_data_quantized, input_dims, - input_scale, input_zero_point, "input_tensor"), + input_scale, input_zero_point), CreateQuantizedTensor(output_data, output_dims, output_scale, - output_zero_point, "output_tensor"), + output_zero_point), }; TfLiteContext context; @@ -394,8 +394,8 @@ void TestHardSwishFloat(const int size, float* output_data, std::minstd_rand* ra constexpr int outputs_size = 1; constexpr int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(float_input_values, input_dims, "input_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), + 
CreateFloatTensor(float_input_values, input_dims), + CreateFloatTensor(output_data, output_dims), }; TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); From f5e1a27293823b799a080c384d854aff8f3ef133 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 23 Jun 2020 02:01:33 -0700 Subject: [PATCH 0874/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/f570d5810485 PiperOrigin-RevId: 317824506 Change-Id: I5da78511de1e20d9b0a62fd3a8092b791a35b9da --- .../compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc | 2 +- tensorflow/workspace.bzl | 4 ++-- third_party/mlir/BUILD | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 99d2c08aa98..78a77dc3b4d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -129,7 +129,7 @@ struct DynamicMemRefCastOpConverter void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { patterns->insert( - *converter); + *converter, LowerToLLVMOptions()); } } // namespace xla_lhlo diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0f591ba8b90..a55dcd17b07 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "7e825abd5704ce28b166f9463d4bd304348fd2a9" - LLVM_SHA256 = "a21b752ee1866e195f3f72c7931c79f8c4ecc0f14861488284bdc2bdf14d6fe9" + LLVM_COMMIT = "f570d5810485fa6fb2e1009f795a899d79bd429f" + LLVM_SHA256 = "e154a1a97c3b6bead73a32bcb6fc37aac2d80628abee2961315304a4964f04fe" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 2be59fc44b1..8fd0a94bf64 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1973,6 +1973,7 @@ cc_library( ":StandardOps", ":Support", ":Transforms", + ":VectorOps", "@llvm-project//llvm:Support", ], ) From 7a124ecab8070e7affc2fabf019b5b57588d85ee Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jun 2020 02:01:48 -0700 Subject: [PATCH 0875/1390] Update GraphDef version to 441. PiperOrigin-RevId: 317824531 Change-Id: I7fbf95f3a662d2d8d823656e9ddf5ae2dfd487f6 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index e63ca48863f..52a926c8d8b 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 440 // Updated: 2020/6/22 +#define TF_GRAPH_DEF_VERSION 441 // Updated: 2020/6/23 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 5aa8be781eebe1f3544d4c19f83ca0fa6351b4a3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 23 Jun 2020 02:01:50 -0700 Subject: [PATCH 0876/1390] compat: Update forward compatibility horizon to 2020-06-23 PiperOrigin-RevId: 317824535 Change-Id: I12a1d7a71a420b996f67cf3c536a9c5fd3365154 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 301f4608aba..521d7eaf30f 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 22) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 23) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 775b9cb5acffaf86c1ce09d4d1460f40b1392c2a Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Tue, 23 Jun 2020 03:19:37 -0700 Subject: [PATCH 0877/1390] [XLA][MLIR] Use callback-based builders for (indexed_)generic in (L)HLO->Linalg. PiperOrigin-RevId: 317833249 Change-Id: Ifb60bd3a4797ce360c2a2685e3e96a19dcd6b161 --- .../xla/transforms/xla_legalize_to_linalg.cc | 124 +++++++----------- 1 file changed, 44 insertions(+), 80 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 8a2f8ce7d04..e7bb5df8233 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -125,32 +125,19 @@ class PointwiseToLinalgConverter : public OpConversionPattern { opResultTypes.push_back(shapedType); } + int64_t args_count = bodyArgTypes.size(); + int64_t results_count = bodyResultTypes.size(); auto linalgOp = rewriter.create( - loc, opResultTypes, args, - /*inputCount=*/bodyArgTypes.size(), - /*outputCount=*/bodyResultTypes.size(), indexing_maps, - GetNParallelLoopsAttrs(nloops)); - - // Add a block to the region. - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(bodyArgTypes); - if (isLHLO) block->addArguments(bodyResultTypes); - - SmallVector bodyArgs; - for (int i = 0, e = bodyArgTypes.size(); i < e; ++i) { - bodyArgs.push_back(block->getArgument(i)); - } - - rewriter.setInsertionPointToEnd(block); - // TODO(ravishankarm) : For now use the method in xla_lhlo namespace. That - // method needs to be moved out of there. - Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( - op, bodyResultTypes, bodyArgs, &rewriter); - if (!opResult) { - return failure(); - } - rewriter.create(loc, opResult); + loc, opResultTypes, args, args_count, results_count, indexing_maps, + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + // TODO(ravishankarm) : For now use the method in xla_lhlo namespace. + // That method needs to be moved out of there. 
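+          // In the LHLO (buffer) case the region is also given block
+          // arguments for the outputs; take_front(args_count) forwards only
+          // the input values to the scalar mapper.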
+ Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( + op, bodyResultTypes, + llvm::to_vector<2>(args.take_front(args_count)), &rewriter); + nestedBuilder.create(loc, opResult); + }); rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); return success(); } @@ -301,27 +288,20 @@ class DataMovementOpConverter : public OpConversionPattern { OpTy op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { if (!verifyXLAOpBufferOrTensorSemantics(op)) return failure(); - auto operandType = op.operand().getType().template cast(); auto resultType = getXLAOpResultType(op); SmallVector indexing_maps = Derived::getIndexingMaps(op, &rewriter); if (indexing_maps.empty()) return failure(); - OpBuilder::InsertionGuard linalgOpGuard(rewriter); auto nloops = resultType.getRank(); auto loc = op.getLoc(); auto linalgOp = rewriter.create( loc, isLHLO ? ArrayRef{} : resultType, args, /*inputCount=*/1, - /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops)); - - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(operandType.getElementType()); - if (isLHLO) block->addArgument(resultType.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, block->getArgument(0)); + /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, *args.begin()); + }); rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); return success(); @@ -437,36 +417,26 @@ class LhloBroadcastInDimConverter Value zero = rewriter.create(loc, 0); Value val = rewriter.create(loc, operand, llvm::makeArrayRef({zero})); - auto linalgOp = rewriter.create( + rewriter.create( loc, llvm::None, llvm::makeArrayRef(operand_adaptor.output()), /*inputCount=*/0, /*outputCount=*/1, llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), - GetNParallelLoopsAttrs(nloops)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, val); + }); - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArgument(result_type.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, val); } else { auto indexing_maps = getIndexingMaps(op, broadcast_dims, result_shape, operand_type, &rewriter); - - OpBuilder::InsertionGuard linalgOpGuard(rewriter); - auto linalgOp = rewriter.create( + rewriter.create( loc, llvm::None, llvm::makeArrayRef({operand, operand_adaptor.output()}), /*inputCount=*/1, /*outputCount=*/1, indexing_maps, - GetNParallelLoopsAttrs(nloops)); - - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(operand_type.getElementType()); - block->addArgument(result_type.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, block->getArgument(0)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, *args.begin()); + }); } rewriter.replaceOp(op, llvm::None); return success(); @@ -686,32 +656,26 @@ class IotaConverter : public OpConversionPattern { // Construct the indexing maps needed for linalg.generic ops. 
unsigned nloops = resultMemrefType.getRank(); - auto loc = iotaOp.getLoc(); - auto linalgOp = rewriter.create( - loc, ArrayRef{}, args, + rewriter.create( + iotaOp.getLoc(), ArrayRef{}, args, 0, // args_in 1, // args_out llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), - GetNParallelLoopsAttrs(nloops)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange ivs, + ValueRange args) { + Value castOp = nestedBuilder.create( + nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()], + nestedBuilder.getIntegerType( + resultElementType.getIntOrFloatBitWidth())); + if (resultElementType.isa()) { + castOp = nestedBuilder.create(nestedLoc, castOp, + resultElementType); + } + nestedBuilder.create(nestedLoc, castOp); + }); - // Add a block to the region. - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - for (unsigned i = 0; i < nloops; ++i) { - block->addArgument(rewriter.getIndexType()); - } - block->addArguments(llvm::makeArrayRef(resultElementType)); - - rewriter.setInsertionPointToEnd(block); - Operation* castOp = rewriter.create( - loc, block->getArgument(iotaOp.iota_dimension().getZExtValue()), - rewriter.getIntegerType(resultElementType.getIntOrFloatBitWidth())); - if (resultElementType.isa()) { - castOp = rewriter.create(loc, castOp->getResult(0), - resultElementType); - } - rewriter.create(loc, castOp->getResult(0)); - rewriter.eraseOp(iotaOp); + rewriter.replaceOp(iotaOp, llvm::None); return success(); } }; @@ -867,7 +831,7 @@ struct LhloLegalizeToLinalg auto func = getFunction(); populateLHLOToLinalgConversionPattern(func.getContext(), &patterns); - if (failed(applyPartialConversion(func, target, patterns))) { + if (failed(applyPartialConversion(func, target, patterns, nullptr))) { signalPassFailure(); } } @@ -882,7 +846,7 @@ struct HloLegalizeToLinalg auto func = getFunction(); xla_hlo::populateHLOToLinalgConversionPattern(func.getContext(), &patterns); - if (failed(applyPartialConversion(func, target, patterns))) { + if (failed(applyPartialConversion(func, target, patterns, nullptr))) { signalPassFailure(); } } From 8eea0658d48c4110076b478ac507445e2b2e47b3 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 23 Jun 2020 04:51:06 -0700 Subject: [PATCH 0878/1390] Replace tensorflow command line flags with LLVM command line flags. This avoids a dependency on the framework_internal library, and also LLVM directly supports comma-separated value lists. 
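For illustration only (not part of the original patch): a minimal, self-contained sketch of the llvm::cl interface this commit switches to, assuming nothing beyond LLVM's Support headers. The flag name mirrors the tf_to_cubin change below, but the little program itself is hypothetical.

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"

    int main(int argc, char** argv) {
      // cl::CommaSeparated lets "--tile_sizes=16,64" populate the list with
      // {16, 64}; no hand-written string-splitting helper is needed.
      llvm::cl::list<unsigned> tile_sizes(
          "tile_sizes", llvm::cl::desc("tile sizes to use"),
          llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated);
      llvm::cl::ParseCommandLineOptions(argc, argv, "llvm::cl list demo\n");
      for (unsigned size : tile_sizes) llvm::outs() << size << "\n";
      return 0;
    }

Running such a sketch as "./demo --tile_sizes=16,64" would print 16 and 64, which is the comma-separated parsing behavior the commit relies on in place of the removed ParseStringList helper.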
PiperOrigin-RevId: 317842748 Change-Id: I9207375c753fbab33f6211736d57248c8c3f558a --- .../compiler/mlir/tools/kernel_gen/BUILD | 3 +- .../mlir/tools/kernel_gen/tf_to_cubin.cc | 88 +++++-------------- 2 files changed, 26 insertions(+), 65 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index cebfa7cd9d4..80b597d962d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -44,8 +44,9 @@ tf_cc_binary( visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], deps = [ ":cubin_creator", - "//tensorflow/core:framework_internal", + "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", ], ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc index 66fcabde0ac..96831689600 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -21,77 +21,37 @@ #include #include -#include "absl/strings/numbers.h" -#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" +#include "llvm/Support/CommandLine.h" +#include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace { -bool ParseStringList(std::string string_list, std::vector* result) { - result->clear(); - uint32_t item; - auto items = absl::StrSplit(string_list, ','); - for (const auto& item_str : items) { - if (!absl::SimpleAtoi(item_str, &item)) { - LOG(ERROR) << "Expected token " << item_str << " to be an integer"; - return false; - } - result->push_back(item); - } - return true; -} -} // namespace int main(int argc, char** argv) { - std::string input_file = "foo.mlir"; - std::string output_file = "foo.bin"; - int32_t architecture = 50; - std::vector tile_sizes; - std::vector unroll_factors; - std::vector same_shape; + llvm::cl::opt input_file("input", llvm::cl::desc("input file"), + llvm::cl::value_desc("filename"), + llvm::cl::init("foo.mlir")); + llvm::cl::opt output_file( + "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), + llvm::cl::init("foo.bin")); + llvm::cl::opt architecture( + "arch", llvm::cl::desc("target architecture (e.g. 50 for sm_50)"), + llvm::cl::init(50)); + llvm::cl::list tile_sizes( + "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, + llvm::cl::CommaSeparated); + llvm::cl::list unroll_factors( + "unroll_factors", + llvm::cl::desc("factors to unroll by, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + llvm::cl::list same_shape( + "same_shape", + llvm::cl::desc("arguments with same shape, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); - auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { - if (!ParseStringList(tile_sizes_str, &tile_sizes)) { - return false; - } - // Initialize with the default. 
- if (tile_sizes.empty()) { - tile_sizes.push_back(16); - tile_sizes.push_back(64); - } - return true; - }; - - auto parse_unroll_factors = - [&unroll_factors](std::string unroll_factors_str) { - return ParseStringList(unroll_factors_str, &unroll_factors); - }; - - auto parse_same_shape = [&same_shape](std::string same_shape_str) { - return ParseStringList(same_shape_str, &same_shape); - }; - - std::vector flag_list = { - tensorflow::Flag("input", &input_file, "input file"), - tensorflow::Flag("output", &output_file, "output file"), - tensorflow::Flag("arch", &architecture, - "target architecture (e.g. 50 for sm_50)"), - tensorflow::Flag("tile_sizes", parse_tile_sizes, "16,64", - "tile sizes to use"), - tensorflow::Flag("unroll_factors", parse_unroll_factors, "", - "factors to unroll by, separated by commas"), - tensorflow::Flag("same_shape", parse_same_shape, "", - "arguments with same shape, separated by commas"), - }; - bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); - tensorflow::port::InitMain("usage", &argc, &argv); - if (!parse_ok) { - return 1; - } + tensorflow::InitMlir y(&argc, &argv); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); std::pair compute_capability(architecture / 10, architecture % 10); From af94e801cf90fe8fb70d930d8658c20913d09ca5 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 23 Jun 2020 04:56:42 -0700 Subject: [PATCH 0879/1390] Use ForLoopSpecializationPass to help with vectorization of kernels. PiperOrigin-RevId: 317843378 Change-Id: I4ead02c24f957269888af5491934567cd3e311fb --- tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 3f99d40c717..196ea218ef3 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -505,6 +505,11 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Some basic cleanup. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Make loops with min bounds into a conditional plus static bounds. + // Only do this if we unrolled in the first place. + if (!options.unroll_factors.empty()) { + pm.addNestedPass<::mlir::FuncOp>(mlir::createForLoopSpecializationPass()); + } // Approximate of requested. if (options.use_approximations) { pm.addNestedPass<::mlir::FuncOp>( From 7198070f4d032493eeae9741ba9fd868f3d3fb7c Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 23 Jun 2020 04:57:39 -0700 Subject: [PATCH 0880/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/7a55d984971c PiperOrigin-RevId: 317843475 Change-Id: I2824384ac6a2535fd779ee40301d83004e55b5f3 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index a55dcd17b07..7c47628818b 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "f570d5810485fa6fb2e1009f795a899d79bd429f" - LLVM_SHA256 = "e154a1a97c3b6bead73a32bcb6fc37aac2d80628abee2961315304a4964f04fe" + LLVM_COMMIT = "7a55d984971c11daa55e9423934f98bdc9c04f2f" + LLVM_SHA256 = "6ef6d1f92f51936ed3027433ea26875ac4d2ac8eed88b1fbce472018c4fb7720" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From e071e66f03eb6ae234212400f34061f25e79a699 Mon Sep 17 00:00:00 2001 From: Lev Proleev Date: Tue, 23 Jun 2020 05:29:56 -0700 Subject: [PATCH 0881/1390] Add support for TENSOR_QUANT8_ASYMM_SIGNED in NNAPI delegate PiperOrigin-RevId: 317846923 Change-Id: I1c61f53e89228cd2482435e9255e390864bd83e3 --- tensorflow/lite/delegates/nnapi/BUILD | 26 + .../delegates/nnapi/acceleration_test_list.cc | 14 +- .../lite/delegates/nnapi/nnapi_delegate.cc | 246 +++-- .../delegates/nnapi/nnapi_delegate_kernel.h | 3 + .../nnapi/nnapi_delegate_mock_test.h | 2 + ...nnapi_delegate_signed_quantization_test.cc | 920 ++++++++++++++++++ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 1 + tensorflow/lite/nnapi/nnapi_handler.h | 22 + 8 files changed, 1154 insertions(+), 80 deletions(-) create mode 100644 tensorflow/lite/delegates/nnapi/nnapi_delegate_signed_quantization_test.cc diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD index ec9f6907f21..beeaff1b99d 100644 --- a/tensorflow/lite/delegates/nnapi/BUILD +++ b/tensorflow/lite/delegates/nnapi/BUILD @@ -190,6 +190,32 @@ cc_test( ], ) +cc_test( + name = "nnapi_delegate_signed_quantization_test", + size = "small", + srcs = [ + "nnapi_delegate_signed_quantization_test.cc", + ], + tags = [ + "no_mac", + "no_windows", + "tflite_not_portable_ios", + ], + deps = [ + ":nnapi_delegate", + ":nnapi_delegate_mock_test", + "//tensorflow/lite:framework", + "//tensorflow/lite:kernel_api", + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/kernels:test_util", + "//tensorflow/lite/nnapi:nnapi_implementation", + "//tensorflow/lite/nnapi:nnapi_lib", + "@com_google_googletest//:gtest", + ], +) + cc_test( name = "quant_lstm_sup_test", size = "small", diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc index b20628016f0..31bdc5f8b99 100644 --- a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc +++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc @@ -60,6 +60,10 @@ FloatActivationsOpTest/Elu,30 FloatActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwishBias +QuantizedActivationsOpTest/Relu* +QuantizedActivationsOpTest/PRelu,29 +QuantizedActivationsOpTest/PReluSameShapes,29 +QuantizedActivationsOpTest/PReluInt8.+,30 # add_test FloatAddOpModel/.+ @@ -145,6 +149,7 @@ ConvolutionOpTest/ConvolutionOpTest/.+/\d+ # dequantize_test DequantizeOpTest/Uint8 +DequantizeOpTest/Int8,30 # depth_to_space_test DepthToSpaceOpModel/Float32 @@ -190,6 +195,7 @@ QuantizedFullyConnectedOpTest/SimpleTestQuantizedOutputMultiplierGreaterThan1Uin QuantizedFullyConnectedOpTest/SimpleTestQuantizedOutputMultiplierGreaterThan1Int8/\d+,29 HybridFullyConnectedOpTest/SimpleTestQuantizedUint8,29 HybridFullyConnectedOpTest/SimpleTestQuantizedInt8,29 +HybridAsymmetricInputFullyConnectedOpTest.SimpleTestQuantizedUint8,29 
FloatFullyConnectedOpTest/FloatFullyConnectedOpTest/SimpleTest4DInput/\d+ QuantizedFullyConnectedOpTest/QuantizedFullyConnectedOpTest/SimpleTest4dInputQuantizedUint8/\d+ QuantizedFullyConnectedOpTest/QuantizedFullyConnectedOpTest/SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1Uint8/\d+,29 @@ -207,6 +213,7 @@ FloatGatherOpTest/LastAxis,29 TypesGatherOpTest/Float32Int32,29 TypesGatherOpTest/Int32Int32,29 TypesGatherOpTest/Uint8Int32,29 +TypesGatherOpTest/Int8Int32,29 # hashtable_lookup_test # All test excepted the string one should be accelerated @@ -286,13 +293,18 @@ QuantizedLstmTest/BasicQuantizedLstmTest/29 # quantize_test QuantizeOpTest/UINT8,29 +QuantizeOpTest/INT8,30 + +# rank # reduce_test -Dynamic.+(Mean|Sum|Prod|Max|Min)OpTest/.+ -ConstUint8(Mean|Sum)OpTest/.+ +-ConstInt8MeanOpTest.NonSpecialAxisNonSameScale +-ConstInt8MeanOpTest.QuantizedDifferentScale ConstUint8(Max|Min)OpTest/.+,29 ConstUint8(Mean)OpTest/.+ -Constint8(Mean|Max|Min)OpTest/.+ +ConstInt8(Mean|Max|Min)OpTest/.+,29 ConstFloat(Sum|Prod|Max|Min)OpTest/NotKeepDims,29 ConstFloat(Sum|Prod|Max|Min)OpTest/KeepDims,29 ConstFloat(Mean|Any)OpTest/NotKeepDims diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 1c35ee370c2..58ab13ab657 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -201,6 +201,7 @@ bool NeedInt8Conversion(const TfLiteContext* context, int builtin_code, case kTfLiteBuiltinConcatenation: case kTfLiteBuiltinEqual: case kTfLiteBuiltinExpandDims: + case kTfLiteBuiltinGather: case kTfLiteBuiltinGreater: case kTfLiteBuiltinGreaterEqual: case kTfLiteBuiltinHardSwish: @@ -377,6 +378,7 @@ bool HasZeroes(TfLiteIntArrayView array) { enum { NN_TENSOR_FLAG_SCALAR_AS_TENSOR = 1U << 0, NN_TENSOR_FLAG_INT8_CONVERSION = 1U << 1, + NN_TENSOR_FLAG_USE_INT8_ASYMM_SIGNED = 1U << 2, }; // Returns the SDK level to target when delegating to the given devices. @@ -1065,6 +1067,8 @@ class NNAPIOpBuilder { tensor_flags & NN_TENSOR_FLAG_SCALAR_AS_TENSOR; const bool need_int8_conversion = tensor_flags & NN_TENSOR_FLAG_INT8_CONVERSION; + const bool use_int8_asymm_signed = + tensor_flags & NN_TENSOR_FLAG_USE_INT8_ASYMM_SIGNED; int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index); if (ann_tensor_index != -1) { indices->push_back(ann_tensor_index); @@ -1095,12 +1099,25 @@ class NNAPIOpBuilder { nn_type = ANEURALNETWORKS_TENSOR_FLOAT32; break; case kTfLiteUInt8: + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; + scale = tensor->params.scale; + zeroPoint = tensor->params.zero_point; + if (scale == 0) { + // ANEURALNETWORKS_TENSOR_QUANT8_ASYMM with zero scale is not valid in + // NNAPI. + scale = 1; + } + break; case kTfLiteInt8: // If explicit int8 conversion is needed, we still need // ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type. - nn_type = (tensor_type == kTfLiteUInt8 || need_int8_conversion) - ? 
ANEURALNETWORKS_TENSOR_QUANT8_ASYMM - : ANEURALNETWORKS_TENSOR_QUANT8_SYMM; + if (use_int8_asymm_signed) { + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED; + } else if (need_int8_conversion) { + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; + } else { + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_SYMM; + } scale = tensor->params.scale; zeroPoint = tensor->params.zero_point; if (tensor->quantization.type == kTfLiteAffineQuantization) { @@ -1130,8 +1147,7 @@ class NNAPIOpBuilder { operand_mapping_->add_type_conversion(tensor_index, kTfLiteUInt8); } if (scale == 0) { - // TENSOR_QUANT8_ASYMM and ANEURALNETWORKS_TENSOR_QUANT8_ASYMM - // with zero scale are not valid in NNAPI. + // QUANT8 tensors with zero scale are not valid in NNAPI. scale = 1; } } @@ -1248,7 +1264,6 @@ class NNAPIOpBuilder { "setting new operand value", nnapi_errno_); } } - indices->push_back(ann_tensor_index); return kTfLiteOk; } @@ -1437,7 +1452,6 @@ bool NNAPIDelegateKernel::Validate( bool is_accelerator_specified, std::vector* map_failures) { OpValidationContext val_ctx{true, map_failures}; - switch (builtin_code) { case kTfLiteBuiltinAdd: { ExpectMaxOpVersion(version, 2, &val_ctx); @@ -1789,18 +1803,21 @@ bool NNAPIDelegateKernel::Validate( "Supported op versions are 1 and 2 only", &val_ctx); const auto& input = context->tensors[node->inputs->data[0]]; - Expect(input.type != kTfLiteFloat16, - NNAPIValidationFailureType::kUnsupportedInputType, - "kTfLiteFloat16 not supported as input", &val_ctx); + if (android_sdk_version < kMinSdkVersionForNNAPI12) { + EXPECT_INPUT_TYPE_IN(input.type, kTfLiteUInt8); + } else { + EXPECT_INPUT_TYPE_IN(input.type, kTfLiteUInt8, kTfLiteInt8); - const auto zero_point = input.params.zero_point; - Expect(input.type != kTfLiteInt8 || - (zero_point == 0 && - android_sdk_version >= kMinSdkVersionForNNAPI12), - NNAPIValidationFailureType::kUnsupportedInputType, - "NN API supports int8 type since version 1.2 but only for " - "symmetric quantization.", - &val_ctx); + if (android_sdk_version == kMinSdkVersionForNNAPI12 && + input.type == kTfLiteInt8) { + const auto zero_point = input.params.zero_point; + Expect(zero_point == 0, + NNAPIValidationFailureType::kUnsupportedInputType, + "NN API supports int8 type since version 1.2 but only for " + "symmetric quantization.", + &val_ctx); + } + } } break; case kTfLiteBuiltinFloor: { ExpectOpVersion(version, 1, &val_ctx); @@ -2150,21 +2167,38 @@ bool NNAPIDelegateKernel::Validate( &val_ctx); const TfLiteType input_type = context->tensors[node->inputs->data[0]].type; - EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteInt32, - kTfLiteUInt8); const TfLiteType output_type = context->tensors[node->outputs->data[0]].type; - ExpectTypeIn(output_type, {kTfLiteFloat32, kTfLiteInt32, kTfLiteUInt8}, - NNAPIValidationFailureType::kUnsupportedOutputType, - "Output type should be one of kTfLiteFloat32, kTfLiteInt32, " - "kTfLiteUInt8.", - &val_ctx); + if (android_sdk_version >= kMinSdkVersionForNNAPI13) { + EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteInt32, + kTfLiteUInt8, kTfLiteInt8); + + ExpectTypeIn( + output_type, + {kTfLiteFloat32, kTfLiteInt32, kTfLiteUInt8, kTfLiteInt8}, + NNAPIValidationFailureType::kUnsupportedOutputType, + "Output type should be one of kTfLiteFloat32, kTfLiteInt32, " + "kTfLiteUInt8, kTfLiteInt8.", + &val_ctx); + } else { + EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteInt32, + kTfLiteUInt8); + + ExpectTypeIn( + output_type, {kTfLiteFloat32, kTfLiteInt32, kTfLiteUInt8}, + 
NNAPIValidationFailureType::kUnsupportedOutputType, + "Output type should be one of kTfLiteFloat32, kTfLiteInt32, " + "kTfLiteUInt8.", + &val_ctx); + } } break; case kTfLiteBuiltinPrelu: { ExpectOpVersion(version, 1, &val_ctx); ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, &val_ctx); - ExpectIsFloatOrUint8Operator(context, node, &val_ctx); + const auto input_type = context->tensors[node->inputs->data[0]].type; + EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteUInt8, + kTfLiteInt8); } break; case kTfLiteBuiltinTile: { ExpectOpVersion(version, 1, &val_ctx); @@ -2240,19 +2274,18 @@ bool NNAPIDelegateKernel::Validate( &val_ctx); } break; case kTfLiteBuiltinGather: { - ExpectOpVersion(version, 1, &val_ctx); + ExpectOpVersion(version, 2, &val_ctx); ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, &val_ctx); const auto input_type = context->tensors[node->inputs->data[0]].type; const auto& positions = context->tensors[node->inputs->data[1]]; + EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteFloat16, - kTfLiteInt32, kTfLiteUInt8); - ExpectTypeIn(positions.type, - {kTfLiteFloat32, kTfLiteFloat16, kTfLiteInt32, kTfLiteUInt8}, - NNAPIValidationFailureType::kUnsupportedInputType, - "Positions type should be one of kTfLiteFloat32, " - "kTfLiteFloat16, kTfLiteInt32, kTfLiteUInt8", - &val_ctx); + kTfLiteInt32, kTfLiteUInt8, kTfLiteInt8); + + Expect(positions.type == kTfLiteInt32, + NNAPIValidationFailureType::kUnsupportedInputType, + "Positions type should be one of kTfLiteInt32", &val_ctx); Expect(positions.dims->size != 0, NNAPIValidationFailureType::kUnsupportedOperandRank, "0-dimension args are not supported by NNAPI.", &val_ctx); @@ -2283,8 +2316,13 @@ bool NNAPIDelegateKernel::Validate( &val_ctx); // Tensor indices: split_dim: 0, value: 1 const TfLiteTensor& input = context->tensors[node->inputs->data[1]]; - EXPECT_INPUT_TYPE_IN(input.type, kTfLiteFloat32, kTfLiteUInt8, - kTfLiteInt32); + if (android_sdk_version >= kMinSdkVersionForNNAPI13) { + EXPECT_INPUT_TYPE_IN(input.type, kTfLiteFloat32, kTfLiteUInt8, + kTfLiteInt8, kTfLiteInt32); + } else { + EXPECT_INPUT_TYPE_IN(input.type, kTfLiteFloat32, kTfLiteUInt8, + kTfLiteInt32); + } const TfLiteTensor& axis = context->tensors[node->inputs->data[0]]; Expect(axis.type == kTfLiteInt32 && axis.allocation_type == kTfLiteMmapRo, NNAPIValidationFailureType::kUnsupportedInputType, @@ -2308,30 +2346,41 @@ bool NNAPIDelegateKernel::Validate( NNAPIValidationFailureType::kUnsupportedInputType, "Value should be Float32.", &val_ctx); const auto output_type = context->tensors[node->outputs->data[0]].type; - Expect(output_type == kTfLiteUInt8, - NNAPIValidationFailureType::kUnsupportedOutputType, - "Output should be kTfLiteUInt8.", &val_ctx); + if (android_sdk_version < kMinSdkVersionForNNAPI13) { + Expect(output_type == kTfLiteUInt8, + NNAPIValidationFailureType::kUnsupportedOutputType, + "Output should be kTfLiteUInt8.", &val_ctx); + } else { + ExpectTypeIn(output_type, {kTfLiteUInt8, kTfLiteInt8}, + NNAPIValidationFailureType::kUnsupportedOutputType, + "Output should be kTfLiteUInt8.", &val_ctx); + } const auto quantization_params = context->tensors[node->outputs->data[0]].params; Expect(quantization_params.scale > 0.f, NNAPIValidationFailureType::kUnsupportedQuantizationParameters, "Quantization scale should be > 0.", &val_ctx); } break; - case kTfLiteBuiltinReduceAny: - case kTfLiteBuiltinReduceMin: - case kTfLiteBuiltinReduceMax: { - ExpectOpVersion(version, 1, &val_ctx); + case 
kTfLiteBuiltinReduceAny: { + ExpectOpVersion(version, 2, &val_ctx); ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, &val_ctx); Expect(context->tensors[node->outputs->data[0]].dims->size != 0, NNAPIValidationFailureType::kUnsupportedOutputType, "NNAPI does not support generating a scalar as output.", &val_ctx); - if (builtin_code == kTfLiteBuiltinReduceProd) { - const auto input_type = context->tensors[node->inputs->data[0]].type; - Expect(input_type == kTfLiteFloat32, - NNAPIValidationFailureType::kUnsupportedInputType, - "NNAPI only supports floating point REDUCE_PROD.", &val_ctx); - } + } break; + case kTfLiteBuiltinReduceMin: + case kTfLiteBuiltinReduceMax: { + ExpectMaxOpVersion(version, 2, &val_ctx); + ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, + &val_ctx); + const auto input_tensor = context->tensors[node->inputs->data[0]]; + const auto input_type = input_tensor.type; + EXPECT_INPUT_TYPE_IN(input_type, kTfLiteFloat32, kTfLiteUInt8, + kTfLiteInt8); + Expect(input_tensor.dims->size != 0, + NNAPIValidationFailureType::kUnsupportedOutputType, + "NNAPI does not support generating a scalar as output.", &val_ctx); } break; case kTfLiteBuiltinDepthToSpace: { const TfLiteType input_type = @@ -3093,16 +3142,10 @@ TfLiteStatus NNAPIDelegateKernel::Map( case kTfLiteBuiltinGather: { auto builtin = reinterpret_cast( mapping_args.node->builtin_data); - mapping_args.builder->AddTensorInput(mapping_args.node->inputs->data[0], - /* hybrid_op */ false, - /* scalar_as_tensor */ false); - mapping_args.builder->AddScalarInt32Operand(builtin->axis); - mapping_args.builder->AddTensorInput(mapping_args.node->inputs->data[1], /* hybrid_op */ false, - /* scalar_as_tensor */ false); - + /* tensor_flags */ 0); *nn_op_type = ANEURALNETWORKS_GATHER; } break; case kTfLiteBuiltinBidirectionalSequenceLstm: { @@ -3430,6 +3473,9 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, // absolute indices but NN api indices inputs by relative indices. int relative_input_index = 0; + const bool use_int8_asymm_signed = + target_sdk_version_ >= kMinSdkVersionForNNAPI13; + size_t input_offset = 0; for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) { if (absolute_input_index == kTfLiteOptionalTensor) { @@ -3472,9 +3518,16 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, } } else if (tensor->type == kTfLiteInt8 && ann_type_equivalent == kTfLiteInt32) { - for (int i = 0; i < num_elements; ++i) { - reinterpret_cast(input_ptr)[i] = - static_cast(tensor->data.int8[i]) + 128; + if (use_int8_asymm_signed) { + for (int i = 0; i < num_elements; ++i) { + reinterpret_cast(input_ptr)[i] = + static_cast(tensor->data.int8[i]); + } + } else { + for (int i = 0; i < num_elements; ++i) { + reinterpret_cast(input_ptr)[i] = + static_cast(tensor->data.int8[i]) + 128; + } } } else { context->ReportError( @@ -3685,6 +3738,15 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, &dequantize_mapping, &allocation_memory_mapping_, &nnapi_to_tflite_op_mapping_, nn_model_.get(), nnapi_errno); + + // If we have target accelerators the target SDK version might be + // different than the current android version. + target_sdk_version_ = nnapi_->android_sdk_version; + if (!nnapi_devices_.empty()) { + TF_LITE_ENSURE_STATUS(GetTargetSdkVersion( + context, nnapi_, nnapi_devices_, &target_sdk_version_, nnapi_errno)); + } + // Add Tensors. for (auto node_index : nodes_) { // Obtain the op and registration. 
@@ -3696,11 +3758,18 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, const bool hybrid_op = IsHybridOperator(context, reg->builtin_code, node); const bool scalar_as_tensor = IsScalarInputSupported(reg->builtin_code); const bool need_int8_conversion = + target_sdk_version_ < kMinSdkVersionForNNAPI13 && NeedInt8Conversion(context, reg->builtin_code, node); + const bool use_int8_asymm_signed = + target_sdk_version_ >= kMinSdkVersionForNNAPI13 && !hybrid_op; + int input_tensor_flags = 0; if (scalar_as_tensor) { input_tensor_flags |= NN_TENSOR_FLAG_SCALAR_AS_TENSOR; } + if (use_int8_asymm_signed) { + input_tensor_flags |= NN_TENSOR_FLAG_USE_INT8_ASYMM_SIGNED; + } // On SDK level less than 30, h_swish will be lowered into supported NNAPI // operations. Since SDK level 30, h_swish is supported as a single @@ -3807,8 +3876,12 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, break; case kTfLiteInt8: if (constant_value.allocation_type == kTfLiteMmapRo) { - builder.AddScalarInt32Operand( - static_cast(*constant_value.data.int8) + 128); + if (need_int8_conversion) { + builder.AddScalarInt32Operand( + static_cast(*constant_value.data.int8) + 128); + } else { + builder.AddScalarInt32Operand(*constant_value.data.int8); + } } else { builder.AddSingleValueTensorAsScalarOperand( constant_value_id, ANEURALNETWORKS_INT32); @@ -3836,7 +3909,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, // specifying the output height and width, is not added and // instead the height and width will be added individually as // scalars by the mapping function returned by Map(). - TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op)); + TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op, + input_tensor_flags)); } } else if (reg->builtin_code == kTfLiteBuiltinTopkV2 && input_pos > 0) { // The K parameter tensor is not handled here but by the functor @@ -3844,8 +3918,12 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, // the else clause below continue; } else if (reg->builtin_code == kTfLiteBuiltinGather) { - // Everything is added during Map since input tensors + // Everything else is added during Map since input tensors // have different order. + if (input_pos == 0) { + TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op, + input_tensor_flags)); + } continue; } else if (reg->builtin_code == kTfLiteBuiltinExpandDims && input_pos == 1) { @@ -3862,7 +3940,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, // the axis, needs to be converted to a scalar since TFLite uses a // tensor but NNAPI uses a scalar as the axis. 
if (input_pos == 0) { - TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op)); + TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op, + input_tensor_flags)); } else { const int axis_id = node->inputs->data[1]; const TfLiteTensor& axis_tensor = context->tensors[axis_id]; @@ -3908,12 +3987,26 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, std::vector(1, operand_tensor.data.uint8[0]), operand_tensor.params, &tensor_index)); break; - case kTfLiteInt8: - TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor( - ANEURALNETWORKS_TENSOR_QUANT8_SYMM, operand_tensor.type, {1}, - std::vector(1, operand_tensor.data.int8[0]), - operand_tensor.params, &tensor_index)); - break; + case kTfLiteInt8: { + auto params = operand_tensor.params; + if (params.scale == 0.0) { + params.scale = 1.0; + } + + if (use_int8_asymm_signed) { + TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED, + operand_tensor.type, {1}, + std::vector(1, operand_tensor.data.int8[0]), params, + &tensor_index)); + } else { + TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, operand_tensor.type, + {1}, + std::vector(1, operand_tensor.data.int8[0] + 128), + params, &tensor_index)); + } + } break; case kTfLiteInt32: TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor( ANEURALNETWORKS_TENSOR_INT32, operand_tensor.type, {1}, @@ -3995,19 +4088,11 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } } - // If we have target accelerators the target SDK version might be - // different than the current android version. - int target_sdk_version = nnapi_->android_sdk_version; - if (!nnapi_devices_.empty()) { - TF_LITE_ENSURE_STATUS(GetTargetSdkVersion( - context, nnapi_, nnapi_devices_, &target_sdk_version, nnapi_errno)); - } - // Get op type and operands // Fails if the Validate function failed int nn_op_type; TF_LITE_ENSURE_STATUS( - Map(context, reg->builtin_code, reg->version, target_sdk_version, + Map(context, reg->builtin_code, reg->version, target_sdk_version_, {context, &builder, node, &model_state_outputs_, &model_state_tfl_inputs_, &feedback_loops_, nnapi_errno}, &nn_op_type)); @@ -4017,6 +4102,9 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, if (need_int8_conversion) { output_tensor_flags |= NN_TENSOR_FLAG_INT8_CONVERSION; } + if (use_int8_asymm_signed) { + output_tensor_flags |= NN_TENSOR_FLAG_USE_INT8_ASYMM_SIGNED; + } for (int output_pos = 0; output_pos < node->outputs->size; ++output_pos) { const auto output_index = node->outputs->data[output_pos]; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 26822c011e3..9aa0f303cc2 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -341,6 +341,9 @@ class NNAPIDelegateKernel { std::vector nnapi_to_tflite_op_mapping_; + // Fully initialized in NNAPIDelegateKernel::AddOpsAndTensors + int target_sdk_version_ = 27; // kMinSdkVersionForNNAPI13 + void AddDequantizeOperatorsWhereNeeded( const TfLiteContext* context, int builtin_code, const TfLiteNode* node, int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno); diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h index fa7ff9dd1f1..5dbe4110131 100644 --- 
a/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h @@ -71,6 +71,8 @@ class NnApiMock : public ::tflite::nnapi::NnApiHandler { ExecutionComputeReturns(); ExecutionStartComputeReturns(); EventWaitReturns(); + SetPriorityReturns(); + SetOperandSymmPerChannelQuantParamsReturns(); SetNnapiSupportedDevice("test-device", android_sdk_version); } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_signed_quantization_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_signed_quantization_test.cc new file mode 100644 index 00000000000..b9d702015c2 --- /dev/null +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_signed_quantization_test.cc @@ -0,0 +1,920 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/fully_connected.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/minimal_logging.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h" +#include "tensorflow/lite/nnapi/nnapi_implementation.h" + +namespace tflite { + +namespace ops { +namespace builtin { + +TfLiteRegistration* Register_CONVOLUTION_REF(); +TfLiteRegistration* Register_DEQUANTIZE(); + +} // namespace builtin +} // namespace ops + +namespace { + +class SingleOpModelWithNNAPI : public SingleOpModel { + public: + SingleOpModelWithNNAPI() = default; + void Init(const NnApi* nnapi) { + stateful_delegate_.reset(new StatefulNnApiDelegate(nnapi)); + SetDelegate(stateful_delegate_.get()); + } + + StatefulNnApiDelegate* GetDelegate() { return stateful_delegate_.get(); } + + void SetBufferHandle(int index, TfLiteBufferHandle handle) { + interpreter_->SetBufferHandle(index, handle, stateful_delegate_.get()); + } + TfLiteStatus GetCompilationStatus() { return compilation_status_; } + + protected: + std::unique_ptr stateful_delegate_; + TfLiteStatus compilation_status_; +}; + +class HybridFullyConnectedOpModel : public SingleOpModelWithNNAPI { + public: + HybridFullyConnectedOpModel(const NnApi* nnapi, int units, int batches, + const TensorData& input, + const TensorData& weights, + const TensorData& output = {TensorType_FLOAT32}, + bool asymmetric_inputs = false) + : batches_(batches), units_(units) { + SingleOpModelWithNNAPI::Init(nnapi); + int total_input_size = 1; + for (size_t i = 0; i < input.shape.size(); ++i) { + total_input_size *= input.shape[i]; + } + input_size_ = total_input_size / batches_; + + input_ = AddInput(input); + weights_ = AddInput(weights); + + TensorData bias{TensorType_FLOAT32, {units_}}; + bias_ = AddInput(bias); + + output_ = AddOutput(output); + + 
auto options = CreateFullyConnectedOptions( + builder_, ActivationFunctionType_RELU, + tflite::FullyConnectedOptionsWeightsFormat_DEFAULT, + false, asymmetric_inputs) + .Union(); + SetBuiltinOp(BuiltinOperator_FULLY_CONNECTED, + BuiltinOptions_FullyConnectedOptions, options); + resolver_ = absl::make_unique( + BuiltinOperator_FULLY_CONNECTED, + ops::builtin::Register_FULLY_CONNECTED_PIE()); + BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)}, + /*num_threads=*/-1, + /* allow_fp32_relax_to_fp16 */ false, + /*apply_delegate=*/false); + compilation_status_ = ApplyDelegate(); + } + void SetBias(const std::vector& f) { PopulateTensor(bias_, f); } + void SetWeights(const std::vector& data) { + SymmetricQuantizeAndPopulate(weights_, data); + } + void SetSignedWeights(std::initializer_list f) { + SignedSymmetricQuantizeAndPopulate(weights_, f); + } + + void SetInput(const std::vector& f) { PopulateTensor(input_, f); } + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetOutputShape() { return GetTensorShape(output_); } + + int input_size() { return input_size_; } + int num_units() { return units_; } + int num_batches() { return batches_; } + + protected: + int input_; + int weights_; + int bias_; + int output_; + + int batches_; + int units_; + int input_size_; +}; + +struct NnApiSignedQuantizationTest + : ::tflite::delegate::nnapi::NnApiDelegateMockTest { + static void SetUpTestSuite() { tensors_count = new std::map(); } + void SetUp() override { + ::tflite::delegate::nnapi::NnApiDelegateMockTest::SetUp(); + nnapi_mock_->StubAddOperandWith( + [](ANeuralNetworksModel* model, + const ANeuralNetworksOperandType* type) -> int { + const auto nn_tensor_type = type->type; + if (tensors_count->find(nn_tensor_type) == tensors_count->end()) { + tensors_count->insert({nn_tensor_type, 0}); + } + tensors_count->at(nn_tensor_type)++; + return ANEURALNETWORKS_NO_ERROR; + }); + } + void TearDown() override { tensors_count->clear(); } + static void TearDownTestSuite() { + delete tensors_count; + tensors_count = nullptr; + } + static std::map* tensors_count; +}; +std::map* NnApiSignedQuantizationTest::tensors_count = nullptr; + +TEST_F(NnApiSignedQuantizationTest, + HybridFullyConnectedMapsToSignedSymmOnSdk29) { + nnapi_mock_->SetAndroidSdkVersion(29); + + HybridFullyConnectedOpModel m( + nnapi_mock_->GetNnApi(), /*units=*/3, /*batches=*/2, + /*input=*/{TensorType_FLOAT32, {2, 10}}, + /*weights=*/{TensorType_INT8, {3, 10}, 0, 0, 10.0 / 127.0, 0}); + m.SetSignedWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 4); // fc_input, fc_weights, fc_bias, fc_output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), 1); // activation + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + 1); // dequantize_weights_input +} + +TEST_F(NnApiSignedQuantizationTest, + HybridFullyConnectedMapsToSignedSymmOnSdk30) { + 
nnapi_mock_->SetAndroidSdkVersion(30); + + HybridFullyConnectedOpModel m( + nnapi_mock_->GetNnApi(), /*units=*/3, /*batches=*/2, + /*input=*/{TensorType_FLOAT32, {2, 10}}, + /*weights=*/{TensorType_INT8, {3, 10}, 0, 0, 10.0 / 127.0, 0}); + m.SetSignedWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 4); // fc_input, fc_weights, fc_bias, fc_output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), 1); // activation + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + 1); // dequantize_weights_input +} + +template +class BaseConvolutionOpModel : public SingleOpModelWithNNAPI { + public: + BaseConvolutionOpModel( + const NnApi* nnapi, TfLiteRegistration* registration, + const TensorData& input, const TensorData& filter, + const TensorData& output, int stride_width = 2, int stride_height = 2, + enum Padding padding = Padding_VALID, + enum ActivationFunctionType activation = ActivationFunctionType_NONE, + int dilation_width_factor = 1, int dilation_height_factor = 1, + std::initializer_list filter_data = {}) { + SingleOpModelWithNNAPI::Init(nnapi); + + input_ = AddInput(input); + + if (filter_data.size()) { + filter_ = AddConstInput(filter, filter_data); + } else { + filter_ = AddInput(filter); + } + + int bias_size = GetShape(filter_)[0]; + if (input.type == TensorType_FLOAT32) { + bias_ = AddInput({TensorType_FLOAT32, {bias_size}}); + } else { + // This is a quantized version. The scale of 'bias' depends on the scales + // of input and filter. Supposedly this is correctly set during quantized + // training. + if (filter.per_channel_quantization) { + // per channel quantization. + std::vector bias_scale( + filter.per_channel_quantization_scales.size()); + std::vector bias_zero_points( + filter.per_channel_quantization_scales.size()); + for (size_t i = 0; i < filter.per_channel_quantization_scales.size(); + ++i) { + bias_scale[i] = + input.scale * filter.per_channel_quantization_scales[i]; + bias_zero_points[i] = 0; + } + tflite::TensorType bias_type = TensorType_INT32; + if (input.type == TensorType_INT16) { + // In case of 16-bit, the bias type is set to be int 64. + bias_type = TensorType_INT64; + } + TensorData bias{bias_type, + {bias_size}, + /*min=*/0, + /*max=*/0, + /*scale=*/0, + /*zero_point=*/0, + true, + /*per_channel_quantization_scales=*/bias_scale, + /*per_channel_quantization_offsets=*/bias_zero_points, + /*channel_index==*/0}; + bias_ = AddInput(bias); + } else { + // per tensor quantization. 
+ auto bias_scale = GetScale(input_) * GetScale(filter_); + TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale}; + bias_ = AddInput(bias); + } + } + + output_ = AddOutput(output); + + SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, + CreateConv2DOptions( + builder_, padding, stride_width, stride_height, activation, + dilation_width_factor, dilation_height_factor) + .Union()); + + resolver_ = absl::make_unique(BuiltinOperator_CONV_2D, + registration); + BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}, + /*num_threads=*/-1, + /* allow_fp32_relax_to_fp16 */ false, + /*apply_delegate=*/false); + compilation_status_ = ApplyDelegate(); + } + + protected: + int input_; + int filter_; + int bias_; + int output_; +}; + +class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { + public: + using BaseConvolutionOpModel::BaseConvolutionOpModel; + + void SetInput(std::initializer_list data) { + QuantizeAndPopulate(input_, data); + } + + void SetFilter(std::initializer_list data) { + QuantizeAndPopulate(filter_, data); + } + + void SetBias(std::initializer_list data) { + QuantizeAndPopulate(bias_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), + GetScale(output_), GetZeroPoint(output_)); + } +}; + +TEST_F(NnApiSignedQuantizationTest, + Conv2DUnsignedPerTensorMapsToUnsignedOnSdk29) { + QuantizedConvolutionOpModel m(nnapi_mock_->GetNnApi(), + ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, + {TensorType_UINT8, {}, -127, 128}); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 3); // input, filter, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +TEST_F(NnApiSignedQuantizationTest, + Conv2dUnsignedPerTensorMapsToUnsignedOnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + QuantizedConvolutionOpModel m(nnapi_mock_->GetNnApi(), + ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, + {TensorType_UINT8, {}, -127, 128}); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + 
ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 3); // input, filter, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +class PerChannelQuantizedConvolutionOpModel + : public BaseConvolutionOpModel { + public: + using BaseConvolutionOpModel::BaseConvolutionOpModel; + + void SetInput(std::initializer_list data) { + QuantizeAndPopulate(input_, data); + } + + void SetFilter(std::initializer_list data) { + PerChannelSymmetricQuantizeAndPopulate(filter_, data); + } + + void SetBias(std::initializer_list data) { + PerChannelQuantizeBias(bias_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), GetScale(output_), + GetZeroPoint(output_)); + } +}; + +TEST_F(NnApiSignedQuantizationTest, + Conv2dSignedPerTensorMapsToUnsignedOnSdk29) { + nnapi_mock_->SetAndroidSdkVersion(29); + PerChannelQuantizedConvolutionOpModel m( + nnapi_mock_->GetNnApi(), ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, + {TensorType_INT8, + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + {2, 2, 2, 2}, + 0, + 0, + 0, + 0, + /*per_channel_quantization=*/true, + /*per_channel_quantization_scales=*/{1}, + /*per_channel_quantization_offsets=*/{0}, + /*channel_index=*/0}, + {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, + /*stride_width=*/1, /*stride_height=*/1); + m.SetInput({ + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 + }); + m.SetFilter( + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + { + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 + }); + m.SetBias({3, -2}); + + // Invoke and verify output. 
+ // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 3); // input, filter, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +TEST_F(NnApiSignedQuantizationTest, + Conv2dSignedPerTensorMapsToUnsignedOnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + PerChannelQuantizedConvolutionOpModel m( + nnapi_mock_->GetNnApi(), ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, + {TensorType_INT8, + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + {2, 2, 2, 2}, + 0, + 0, + 0, + 0, + /*per_channel_quantization=*/true, + /*per_channel_quantization_scales=*/{1}, + /*per_channel_quantization_offsets=*/{0}, + /*channel_index=*/0}, + {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, + /*stride_width=*/1, /*stride_height=*/1); + m.SetInput({ + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 + }); + m.SetFilter( + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + { + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 + }); + m.SetBias({3, -2}); + + // Invoke and verify output. 
+ // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 3); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + 3); // input, filter, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +TEST_F(NnApiSignedQuantizationTest, + Conv2dSignedPerChannelMapsToUnsignedOnSdk29) { + PerChannelQuantizedConvolutionOpModel m( + nnapi_mock_->GetNnApi(), ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, + {TensorType_INT8, + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + {2, 2, 2, 2}, + 0, + 0, + 0, + 0, + /*per_channel_quantization=*/true, + /*per_channel_quantization_scales=*/{1, 2}, + /*per_channel_quantization_offsets=*/{0, 0}, + /*channel_index=*/0}, + {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, + /*stride_width=*/1, /*stride_height=*/1); + m.SetInput({ + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 + }); + m.SetFilter( + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + { + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 + }); + m.SetBias({3, -2}); + + // Invoke and verify output. 
+ // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 4); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 2); // input, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL), + 1); // filter + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +TEST_F(NnApiSignedQuantizationTest, Conv2dSignedPerChannelMapsToSignedOnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + PerChannelQuantizedConvolutionOpModel m( + nnapi_mock_->GetNnApi(), ops::builtin::Register_CONVOLUTION_REF(), + {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, + {TensorType_INT8, + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + {2, 2, 2, 2}, + 0, + 0, + 0, + 0, + /*per_channel_quantization=*/true, + /*per_channel_quantization_scales=*/{1, 2}, + /*per_channel_quantization_offsets=*/{0, 0}, + /*channel_index=*/0}, + {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, + /*stride_width=*/1, /*stride_height=*/1); + m.SetInput({ + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 + }); + m.SetFilter( + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + { + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 + }); + m.SetBias({3, -2}); + + // Invoke and verify output. 
+ // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 4); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_INT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_INT32), tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + 2); // input, output + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL), + 1); // filter + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_INT32), 1); // bias + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_INT32), + 4); // padding, stride_width, stride_height, activation +} + +class QuantizeOpModel : public SingleOpModelWithNNAPI { + public: + QuantizeOpModel(const NnApi* nnapi, const TensorData& input, + const TensorData& output) { + SingleOpModelWithNNAPI::Init(nnapi); + input_ = AddInput(input); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_QUANTIZE, BuiltinOptions_QuantizeOptions, + CreateQuantizeOptions(builder_).Union()); + + BuildInterpreter({GetShape(input_)}, /*num_threads=*/-1, + /* allow_fp32_relax_to_fp16 */ false, + /*apply_delegate=*/false); + compilation_status_ = ApplyDelegate(); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + template + void SetInputAndQuantize(std::initializer_list data) { + QuantizeAndPopulate(input_, data); + } + + template + std::vector GetOutput() { + return ExtractVector(output_); + } + + private: + int input_; + int output_; +}; + +TEST_F(NnApiSignedQuantizationTest, QuantizeUint8MapsToUint8OnSdk29) { + // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8 + QuantizeOpModel m(nnapi_mock_->GetNnApi(), {TensorType_FLOAT32, {2, 5}}, + {TensorType_UINT8, {2, 5}, 0, 0, 0.5, 127}); + + m.SetInput({-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 1); // output +} + +TEST_F(NnApiSignedQuantizationTest, QuantizeUint8MapsToUint8OnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8 + QuantizeOpModel m(nnapi_mock_->GetNnApi(), {TensorType_FLOAT32, {2, 5}}, + {TensorType_UINT8, {2, 5}, 0, 0, 0.5, 127}); + + m.SetInput({-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 1); // output +} + +// Quantize with Int8 output is only supported since SDK level 30. 
+TEST_F(NnApiSignedQuantizationTest, QuantizeInt8MapsToInt8OnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + // [-63.5, 64] -> scale=0.5 zero_point=1 for INT8 + QuantizeOpModel m(nnapi_mock_->GetNnApi(), {TensorType_FLOAT32, {2, 5}}, + {TensorType_INT8, {2, 5}, 0, 0, 0.5, -1}); + + m.SetInput({-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + 1); // output +} + +class DequantizeOpModel : public SingleOpModelWithNNAPI { + public: + DequantizeOpModel(const NnApi* nnapi, TensorType type, + std::initializer_list shape, float scale, + int32_t zero_point, int version) { + SingleOpModelWithNNAPI::Init(nnapi); + const TensorData input_tensor_data = {type, shape, 0, 0, scale, zero_point}; + input_ = AddInput(input_tensor_data); + output_ = AddOutput({TensorType_FLOAT32, shape}); + SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions, + CreateDequantizeOptions(builder_).Union()); + + resolver_ = absl::make_unique( + BuiltinOperator_DEQUANTIZE, ops::builtin::Register_DEQUANTIZE(), + version); + + BuildInterpreter({GetShape(input_)}, /*num_threads=*/-1, + /* allow_fp32_relax_to_fp16 */ false, + /*apply_delegate=*/false); + compilation_status_ = ApplyDelegate(); + } + + template + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + private: + int input_; + int output_; +}; + +TEST_F(NnApiSignedQuantizationTest, DequantizeUint8MapsToUint8OnSdk29) { + // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8 + DequantizeOpModel m(nnapi_mock_->GetNnApi(), TensorType_UINT8, {2, 5}, 0.5, + 127, 1); + + m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // output +} + +TEST_F(NnApiSignedQuantizationTest, DequantizeUint8MapsToUint8OnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8 + DequantizeOpModel m(nnapi_mock_->GetNnApi(), TensorType_UINT8, {2, 5}, 0.5, + 127, 1); + + m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // output +} + +// Dequantize with Int8 input is only supported for symmetric quantization on +// SDK level 29 +TEST_F(NnApiSignedQuantizationTest, + DequantizeTestInt8SymmMapsToInt8SymmOnSdk29) 
{ + // [-63.5, 64] -> scale=0.5, zero_point=0 for INT8 + DequantizeOpModel m(nnapi_mock_->GetNnApi(), TensorType_INT8, {2, 5}, 0.5, 0, + 2); + + m.SetInput({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_SYMM), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // output +} + +// Dequantize with Int8 input is only supported since SDK level 30. +TEST_F(NnApiSignedQuantizationTest, DequantizeTestInt8MapsToInt8OnSdk30) { + nnapi_mock_->SetAndroidSdkVersion(30); + // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8 + DequantizeOpModel m(nnapi_mock_->GetNnApi(), TensorType_INT8, {2, 5}, 0.5, -1, + 2); + + m.SetInput({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127}); + m.Invoke(); + EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk); + + ASSERT_EQ(tensors_count->size(), 2); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + tensors_count->end()); + ASSERT_NE(tensors_count->find(ANEURALNETWORKS_TENSOR_FLOAT32), + tensors_count->end()); + + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED), + 1); // input + EXPECT_EQ(tensors_count->at(ANEURALNETWORKS_TENSOR_FLOAT32), + 1); // output +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index 3c30a0479fa..8415df58b8b 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -46,6 +46,7 @@ enum { ANEURALNETWORKS_TENSOR_QUANT16_SYMM = 7, ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13, + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED = 14, }; /** diff --git a/tensorflow/lite/nnapi/nnapi_handler.h b/tensorflow/lite/nnapi/nnapi_handler.h index 00c0b23e3cf..16e1e9fea10 100644 --- a/tensorflow/lite/nnapi/nnapi_handler.h +++ b/tensorflow/lite/nnapi/nnapi_handler.h @@ -118,6 +118,11 @@ class NnApiHandler { const ANeuralNetworksOperandType* type) { return Value; }; } + void StubAddOperandWith(int(stub)(ANeuralNetworksModel* model, + const ANeuralNetworksOperandType* type)) { + nnapi_->ANeuralNetworksModel_addOperand = stub; + } + template void SetOperandValueReturns() { nnapi_->ANeuralNetworksModel_setOperandValue = @@ -268,6 +273,23 @@ class NnApiHandler { }; } + template + void SetPriorityReturns() { + nnapi_->ANeuralNetworksCompilation_setPriority = + [](ANeuralNetworksCompilation* compilation, int priority) -> int { + return Value; + }; + } + + template + void SetOperandSymmPerChannelQuantParamsReturns() { + nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams = + [](ANeuralNetworksModel* model, int32_t index, + const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) { + return Value; + }; + } + /* * Sets the SDK Version in the nnapi structure. 
* If set_unsupported_ops_to_null is set to true, all the functions not From 4fc5c50e15d48ae50e3d7c192a6efad19fafdb3d Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 23 Jun 2020 16:00:35 +0200 Subject: [PATCH 0882/1390] Prefer generator expressions over list comprehensions --- tensorflow/python/eager/function.py | 4 ++-- tensorflow/python/keras/engine/training_v1.py | 4 ++-- .../keras/layers/preprocessing/category_crossing.py | 8 ++++---- .../python/keras/layers/preprocessing/hashing.py | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index e2f5d86fbd2..beb58d8ff91 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1192,7 +1192,7 @@ class _TapeGradientFunctions(object): def _wrap_backward_function(self, forward_graph, backward, outputs): """Create a backward function given `outputs` from the forward function.""" capture_mapping = dict( - zip([ops.tensor_id(t) for t in forward_graph.outputs], outputs)) + zip((ops.tensor_id(t) for t in forward_graph.outputs), outputs)) remapped_captures = [ capture_mapping.get(ops.tensor_id(capture), capture) for capture in backward.captured_inputs @@ -1491,7 +1491,7 @@ class ConcreteFunction(object): self._ndarrays_list = ( isinstance(structured_outputs, (list, tuple)) and structured_outputs and - all([isinstance(o, np_arrays.ndarray) for o in structured_outputs])) + all(isinstance(o, np_arrays.ndarray) for o in structured_outputs)) self._ndarray_singleton = isinstance(structured_outputs, np_arrays.ndarray) # function_spec defines the structured signature. diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index c137c6e517a..c901b73c892 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -1492,8 +1492,8 @@ class Model(training_lib.Model): def _recompile_weights_loss_and_weighted_metrics(self): if not self._is_compiled: return False - recompile = any([e.sample_weights_mismatch() - for e in self._training_endpoints]) + recompile = any(e.sample_weights_mismatch() + for e in self._training_endpoints) if recompile: self._compile_weights_loss_and_weighted_metrics() diff --git a/tensorflow/python/keras/layers/preprocessing/category_crossing.py b/tensorflow/python/keras/layers/preprocessing/category_crossing.py index 594b9741946..e949bd4c87c 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/category_crossing.py @@ -188,15 +188,15 @@ class CategoryCrossing(Layer): def compute_output_signature(self, input_spec): input_shapes = [x.shape for x in input_spec] output_shape = self.compute_output_shape(input_shapes) - if any([ + if any( isinstance(inp_spec, ragged_tensor.RaggedTensorSpec) for inp_spec in input_spec - ]): + ): return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) - elif any([ + elif any( isinstance(inp_spec, sparse_tensor.SparseTensorSpec) for inp_spec in input_spec - ]): + ): return sparse_tensor.SparseTensorSpec( shape=output_shape, dtype=dtypes.string) return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) diff --git a/tensorflow/python/keras/layers/preprocessing/hashing.py b/tensorflow/python/keras/layers/preprocessing/hashing.py index faeeec63a86..89c3042ae24 100644 --- a/tensorflow/python/keras/layers/preprocessing/hashing.py +++ 
b/tensorflow/python/keras/layers/preprocessing/hashing.py @@ -158,10 +158,10 @@ class Hashing(Layer): def _preprocess_inputs(self, inputs): if isinstance(inputs, (tuple, list)): # If any of them is tensor or ndarray, then treat as list - if any([ + if any( tensor_util.is_tensor(inp) or isinstance(inp, np.ndarray) for inp in inputs - ]): + ): return [self._preprocess_single_input(inp) for inp in inputs] return self._preprocess_single_input(inputs) @@ -261,15 +261,15 @@ class Hashing(Layer): return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) input_shapes = [x.shape for x in input_spec] output_shape = self.compute_output_shape(input_shapes) - if any([ + if any( isinstance(inp_spec, ragged_tensor.RaggedTensorSpec) for inp_spec in input_spec - ]): + ): return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.int64) - elif any([ + elif any( isinstance(inp_spec, sparse_tensor.SparseTensorSpec) for inp_spec in input_spec - ]): + ): return sparse_tensor.SparseTensorSpec( shape=output_shape, dtype=dtypes.int64) return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.int64) From ba915231715348096e18775581027d7e625d8485 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 23 Jun 2020 06:18:20 -0700 Subject: [PATCH 0883/1390] Add call-back to further configure the llvm::TargetMachine that was selected for PTX generation. This is used in kernel generation to enable FMA operation fusion. PiperOrigin-RevId: 317851871 Change-Id: Ifaaa1ba99e5aecb90f380d085583bb8f56805d9f --- .../compiler/mlir/tools/kernel_gen/cubin_creator.cc | 12 +++++++++--- .../service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 12 +++++++++--- .../service/gpu/llvm_gpu_backend/gpu_backend_lib.h | 8 +++++--- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 79969a22572..85a53e042e1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -268,6 +268,7 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( options.tile_sizes = tile_sizes; options.unroll_factors = unroll_factors; options.collapse_parallel_loops = false; + options.use_approximations = true; TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); } TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); @@ -287,10 +288,15 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( xla::HloModuleConfig config; config.set_debug_options(xla::GetDebugOptionsFromFlags()); + auto enable_fusion = [](llvm::TargetMachine* target) { + target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; + }; + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); - TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( - llvmModule.get(), compute_capability, - config, libdevice_dir)); + TF_ASSIGN_OR_RETURN( + std::string ptx, + xla::gpu::nvptx::CompileToPtx(llvmModule.get(), compute_capability, + config, libdevice_dir, enable_fusion)); VLOG(1) << ptx; #if GOOGLE_CUDA diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 497dcda4361..d2126a8d17d 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -492,9 +492,10 @@ void 
NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { namespace nvptx { -StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { +StatusOr CompileToPtx( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path, + std::function configure_target) { static absl::once_flag backend_init_flag; absl::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config); @@ -525,6 +526,11 @@ StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, std::unique_ptr target_machine = NVPTXGetTargetMachine( default_target_triple, *compute_capability, hlo_module_config); + // Apply target machine configuration from call-back if available. + if (configure_target) { + configure_target(target_machine.get()); + } + // Link with libdevice, and optimize the LLVM module. TF_RETURN_IF_ERROR(LinkAndOptimizeModule( module, gpu_version, hlo_module_config, libdevice_dir_path, diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h index 526621de7a5..33ef9280c7a 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/IR/Module.h" +#include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" @@ -38,9 +39,10 @@ namespace nvptx { // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path); +StatusOr CompileToPtx( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path, + std::function configure_target = nullptr); } // namespace nvptx namespace amdgpu { From c67719c6c34207e0e8a1f5e6b429d20130cd71f9 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 23 Jun 2020 06:58:18 -0700 Subject: [PATCH 0884/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/f1c671925b1c PiperOrigin-RevId: 317856949 Change-Id: I192ce19253524e7957d7fc0a83abaac4adb31772 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7c47628818b..d34e7d973d3 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "7a55d984971c11daa55e9423934f98bdc9c04f2f" - LLVM_SHA256 = "6ef6d1f92f51936ed3027433ea26875ac4d2ac8eed88b1fbce472018c4fb7720" + LLVM_COMMIT = "f1c671925b1c60ded3e4e7b3c6b1ec984b2d9b93" + LLVM_SHA256 = "57fc8f0ab46bdfdff52b03c2196d658c094bc4179cd1cf9495becf6a8466123a" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 18dc2474023de36cb686f9900d5cac85763cbf6f Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Tue, 23 Jun 2020 08:30:44 -0700 Subject: [PATCH 0885/1390] Save output in lite notebooks to stop them from being tested. PiperOrigin-RevId: 317870420 Change-Id: Ie69635ac083b7788e00acabf53ed3975f8b019f2 --- .../post_training_float16_quant.ipynb | 236 ++++++++++++---- .../post_training_integer_quant.ipynb | 178 +++++++++--- .../performance/post_training_quant.ipynb | 263 ++++++++++++++---- 3 files changed, 534 insertions(+), 143 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb index ef08902865e..6015d3e1a65 100644 --- a/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 1, "metadata": { "cellView": "form", "colab": {}, @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 2, "metadata": { "colab": {}, "colab_type": "code", @@ -124,13 +124,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 3, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "c6nb7OPlXs_3" + "id": "c6nb7OPlXs_3", + "outputId": "be7e4e14-cd67-4554-e928-ad803f36dad9" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "tf.float16" + ] + }, + "execution_count": 3, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tf.float16" ] @@ -147,13 +163,39 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 4, "metadata": { - "colab": {}, + "colab": { + "height": 102 + }, "colab_type": "code", - "id": "hWSAjQWagIHl" + "id": "hWSAjQWagIHl", + "outputId": "9bf2b530-5a05-415f-f856-cab3642256e9" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n", + "11493376/11490434 [==============================] - 0s 0us/step\n", + "11501568/11490434 [==============================] - 0s 0us/step\n", + "1875/1875 [==============================] - 12s 6ms/step - loss: 0.2864 - accuracy: 0.9207 - val_loss: 0.1467 - val_accuracy: 0.9560\n" + ] + }, + { + "data": { + "text/plain": [ + "\u003ctensorflow.python.keras.callbacks.History at 0x7fcd75df46a0\u003e" + ] + }, + "execution_count": 4, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "# Load MNIST dataset\n", "mnist = keras.datasets.mnist\n", @@ -211,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", @@ -235,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", @@ -249,13 
+291,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 7, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "Ie9pQaQrn5ue" + "id": "Ie9pQaQrn5ue", + "outputId": "5df7381a-78ee-4f3e-e1a9-0f3a028384cf" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "84452" + ] + }, + "execution_count": 7, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tflite_model_file = tflite_models_dir/\"mnist_model.tflite\"\n", "tflite_model_file.write_bytes(tflite_model)" @@ -273,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 8, "metadata": { "colab": {}, "colab_type": "code", @@ -297,13 +355,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 9, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "yuNfl3CoHNK3" + "id": "yuNfl3CoHNK3", + "outputId": "839f02cd-0a8c-4551-aaa3-0c05c845ad2e" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "44272" + ] + }, + "execution_count": 9, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tflite_fp16_model = converter.convert()\n", "tflite_model_fp16_file = tflite_models_dir/\"mnist_model_quant_f16.tflite\"\n", @@ -322,13 +396,26 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 10, "metadata": { - "colab": {}, + "colab": { + "height": 68 + }, "colab_type": "code", - "id": "JExfcfLDscu4" + "id": "JExfcfLDscu4", + "outputId": "6ca316c2-cb0e-40e9-ffb1-a8bcf267e101" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 128K\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 44K Jun 23 06:04 mnist_model_quant_f16.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 83K Jun 23 06:04 mnist_model.tflite\n" + ] + } + ], "source": [ "!ls -lh {tflite_models_dir}" ] @@ -365,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", @@ -379,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 12, "metadata": { "colab": {}, "colab_type": "code", @@ -403,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 13, "metadata": { "colab": {}, "colab_type": "code", @@ -423,13 +510,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 14, "metadata": { - "colab": {}, + "colab": { + "height": 281 + }, "colab_type": "code", - "id": "XZClM2vo3_bm" + "id": "XZClM2vo3_bm", + "outputId": "fec12377-9f68-45a7-b4a6-ad902d8db171" }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFxZJREFUeJzt3XtU1HXeB/D3cE0RVDSG4eKMPJBL\nIrI6ZqXhBTFrVwwpw5WEAGnLc9ZL2nbbI1arPPV4nix99jRR7aiFz7qmtIu6KhulVrJj4baYHiKI\nq6DCE4pyG7/PH51mI5nf4DAX9Pt+neM5zO/z/f2+H37ynt/M/GbmpxJCCBCRdDzc3QARuQfDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4yeF6enqgUqlQXV0NAMjOzsaGDRucPm9+fj5mzpzp9HluFgy/nYYN\nG2b55+HhgSFDhlhuv/vuu06fPzs7u1cPvr6+GDlypNPntUd+fj6effZZm+OmT5+OP/7xj07p4Ztv\nvum1v4YNGwaVSoXNmzc7Zb4bgZe7G7hRXbp0yfKzTqdDfn4+5syZY3V8T08PvLwct7vz8/ORn59v\nuZ2WloahQ4c6bPs/Zjab4enp6ZRtu0pERESv/7Ovv/4a48aNw8KFC93YlXvxyO8kzz//PB5++GEs\nXrwY/v7+2LFjB9LS0pCbm2sZc/jwYeh0Osvturo6JCcn49Zbb8XYsWOxdevWfs118eJF7NmzB+np\n6f0a/8O8L7zwAkaNGoWxY8di586dlnpaWhqWL1+OefPmwc/PD0eOHEFHRwdWr16N8PBwqNVqPPHE\nE+jo6LCsk5eXh+DgYISGhsJoNPaa76e/9/vvv4+4uDgEBAQgMjISBw8exG9/+1t8+umn+PWvf41h\nw4Zh5cqVAIBTp05hzpw5CAwMxM9+9jPs3r3bsp1z587hl7/8JQICAnDnnXeiqqqqX78/ABiNRsye\nPRvh4eH9XuemI2jAtFqtOHToUK9lzz33nPD29hYffPCBMJvN4vLly2LJkiVi3bp1ljGHDh0SWq1W\nCCFET0+PmDhxovj9738vOjs7RUVFhdBqteLw4cNCCCFKSkrEqFGj+pz/rbfeEpGRkf3u99ChQ8LT\n01OsWbNGdHR0iOLiYjFkyBBRUVEhhBBiyZIlYsSIEeKTTz4RZrNZdHR0iOXLl4sHHnhAtLS0iO++\n+07cd9994vnnnxdCCPGXv/xFBAcHi/LycnHp0iXx0EMPCQCiqqrKsr0ffu9jx46J4cOHi8OHDwuz\n2SxqamrE6dOnhRBCTJs2TbzzzjuWPtva2kRISIgwGo2iu7tbmEwmERgYaBmfkpIiUlNTRXt7uzh5\n8qQIDg4WM2bMsKw/b9488corr1zz+1+9elVotVqxffv2fu+zmxHD7wDWwj9r1qxey5TCf/ToUTF2\n7Nhe41944QWRnZ1tc/74+Hjx4osv9rvfQ4cOCW9vb9He3m5ZlpycLDZs2GDp89FHH7XUzGaz8PX1\nFdXV1ZZlH3/8seUO55FHHhHPPfecpVZeXm41/JmZmWLNmjV99vXT8O/YsUPMnDmz15jMzEzx0ksv\nia6uLuHp6Wm5wxJCiLVr1/YKvzV///vfhb+/f6/fX0Z8zu9E1/OQ8ttvv0VNTQ1GjBhhWWY2m22+\nel1VVYWjR49i27Zt19XbqFGjer1GoNVq0dDQYLn9497Pnj2Lzs5OTJw40bJM/OjzYA0NDZg2bVqv\nbVlTW1uLKVOm9KvHb7/9FseOHeu1T3p6epCRkYGmpiaYzeZefWq1WpSWltrcrtFoxEMPPeS010hu\nFAy/E6lUql63/fz8cPnyZcvts2fPWn4ODw9HVFQUvvrqq+uaY9u2bZgxY4Zi4Ppy4cIFXLlyBUOG\nDAEA1NTUQK/X99m7Wq2Gj48Pzpw5A7Vafc22NBoNamtrLbdramqszhseHo7Kyso+az/dX+Hh4UhI\nSMD+/fuvGdvd3Q0PDw/U1tYiMjLS5rw/aG9vx+7du1FUVGRz7M2OL/i5UFxcHIqKitDa2orGxka8\n9tprltpdd90FHx8fbNq0CR0dHTCbzfjyyy9x4sQJxW1u27YNGRkZ1yxPS0tDdna21fWuXr2K3Nxc\ndHV1oaSkBPv378eDDz7Y51hPT09kZ2dj5cqVOHfuHIQQqKurw8GDBwEAixYtwttvv43Tp0+jvb0d\n69evtzpvVlYW8vPz8eGHH+Lq1auoq6vDmTNnAHx/J/PNN99YxiYlJaG8vBzvvfceuru70d3djdLS\nUpw5cwbe3t544IEHsG7dOly5cgX/+te/sH37dsV9BQC7d+9GUFAQ7rnnHptjb3YMvwtlZGQgOjoa\nWq0W8+bNQ2pqqqXm5eWFffv2obS0FDqdDqNHj8Zjjz2GtrY2AEBJSUmvh78AcOTIETQ1NSElJeWa\nuWpra3s9FP+psLAw+Pn5QaPRID09Hfn5+YiKirI6ftOmTdBqtbjjjjswfPhwzJ07FxUVFQCA+fPn\nY/ny5ZgxYwZuu+02JCYmWt3O3XffjTfffBO/+c1vMHz4cMyaNcvyqGHlypUoKCjAiBEjsHr1agwf\nPhx/+9vfsGPHDmg0GgQHB+OZZ55BZ2cnAOAPf/gDWltboVarkZWVhUcffbTXXHPnzsXLL7/ca5nR\naMTSpUuveZQhI5UQ/DKPm01HRwd+/vOf48svv+zzvQWHDx9Gdna25R14JCc+578J3XLLLdf92gHJ\nhw/7iSTFh/1EkuKRn0hSLn3O76PyxS3wc+WURFLpQDu6RGe/xg4o/AcOHMCKFStgNpuRnZ2Np59+\nWnH8LfDDVFXCQKYkIgXHRXG/x9r9sN9sNmP58uXYv38/Tp06hYKCApw6dcrezRGRi9kd/tLSUkRG\nRiIiIgI+Pj5ITU1FYWGhI3sjIieyO/z19fW9PlQRFhaG+vr6a8YZDAbo9Xro9Xp0o3/PRYjI+ewO\nf19nCPt6y2ROTg5MJhNMJhO84WvvdETkYHaHPywsrNcnuerq6hASEuKQpojI+ewO/5QpU1BRUYGq\nqip0dXVh586dSEpKcmRvROREdp/q8/LywpYtW3DvvffCbDYjMzMT48ePd2RvRORELn17b4AqkOf5\niZzouChGm2jp11i+vZdIUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\nMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9I\nUgw/kaQYfiJJMfxEkmL4iSTF8BNJymsgK+t0Ovj7+8PT0xNeXl4wmUyO6ouInGxA4QeADz/8EKNH\nj3ZEL0TkQnzYTySpAYVfpVJh7ty5mDx5MgwGQ59jDAYD9Ho99Ho9utE5kOmIyIFUQghh78oNDQ0I\nCQlBc3MzEhMT8frrryM+Pt7q+ABVIKaqEuydjohsOC6K
0SZa+jV2QEf+kJAQAEBQUBCSk5NRWlo6\nkM0RkQvZHf729nZcvHjR8vPBgwcRExPjsMaIyLnsfrW/qakJycnJAICenh786le/wrx58xzWGBE5\nl93hj4iIwMmTJx3ZCxG5EE/1EUmK4SeSFMNPJCmGn0hSDD+RpAb8wR5ZXFh2l9XamEe+Vlz3dLNa\nsd7V6a1YDy1Qrg+tu2S1drXslOK6JC8e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSfE8fz89\ntfY9q7UUv1bllf9jgJPPVC5X91y2Wtt8btYAJ79xlTZrrdb8Ng1XXNer+ISj2xl0eOQnkhTDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4iSQ1oCv2XK8b+Yo97Q9OtVo7H6t8HzryK+Vd3BqtUqz7xP6fYv3l\nmPet1hKHXFFct+jyMMX6L4Za/66AgboiuhTrxzv9FOszb+m2e+7IoscU67fl/MPubbuTy67YQ0Q3\nLoafSFIMP5GkGH4iSTH8RJJi+IkkxfATSYqf5+8nvz8fV6gNbNsBA1sdrwfPtFp7aZpOee6PlK85\n8PLMSDs66h+vK1cV637/bFSsj/p4t2J9go/16x0MrVa+FoIMbB75MzMzERQUhJiYGMuylpYWJCYm\nIioqComJiWhttfFlFkQ06NgMf0ZGBg4cONBrWV5eHhISElBRUYGEhATk5eU5rUEicg6b4Y+Pj0dg\nYGCvZYWFhUhPTwcApKenY+/evc7pjoicxq7n/E1NTdBoNAAAjUaD5uZmq2MNBgMMBgMAoBud9kxH\nRE7g9Ff7c3JyYDKZYDKZ4A1fZ09HRP1kV/jVajUaG79/JbaxsRFBQUEObYqInM+u8CclJcFoNAIA\njEYjFixY4NCmiMj5bD7nX7x4MUpKSnD+/HmEhYVh/fr1ePrpp7Fo0SK89dZbGDNmDHbt2uWKXsmK\nnrNNVmt+u63XAMBsY9t+f75gR0eO0ZR9l2J9vI/yn+9/tYyzWtO9843iuj2K1ZuDzfAXFBT0uby4\nuNjhzRCR6/DtvUSSYviJJMXwE0mK4SeSFMNPJCl+pJfcxksbrljf8uwWxbq3ylOxvmvzHKu1UY2f\nKq4rAx75iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ8Tw/uc3pVaGK9Sm+ypcuL+9Svvx44KnL\n192TTHjkJ5IUw08kKYafSFIMP5GkGH4iSTH8RJJi+IkkxfP85FSdv5hitfb5g/9tY23lKzw9vmKF\nYn3IJ6U2ti83HvmJJMXwE0mK4SeSFMNPJCmGn0hSDD+RpBh+IknxPD85Vc191o8vw1TK5/EXVyUq\n1oceOKlYF4pVsnnkz8zMRFBQEGJiYizLcnNzERoairi4OMTFxWHfvn1ObZKIHM9m+DMyMnDgwIFr\nlq9atQplZWUoKyvD/fff75TmiMh5bIY/Pj4egYGBruiFiFzI7hf8tmzZgtjYWGRmZqK1tdXqOIPB\nAL1eD71ej2502jsdETmYXeF//PHHUVlZibKyMmg0Gjz55JNWx+bk5MBkMsFkMsHbxgc1iMh17Aq/\nWq2Gp6cnPDw8sGzZMpSW8tNTRDcau8Lf2Nho+XnPnj29zgQQ0Y3B5nn+xYsXo6SkBOfPn0dYWBjW\nr1+PkpISlJWVQaVSQafT4Y033nBFrzQIefj7K9Yfueeo1Vrb1Q7FdZs3RCjWfTv/oVgnZTbDX1BQ\ncM2yrKwspzRDRK7Dt/cSSYrhJ5IUw08kKYafSFIMP5Gk+JFeGpCK3PGK9b+O/h+rtQUVKYrr+u7j\nqTxn4pGfSFIMP5GkGH4iSTH8RJJi+IkkxfATSYrhJ5IUz/OTou/S7lSs//Ph1xTrlT3dVmuX/jNM\ncV1fNCrWaWB45CeSFMNPJCmGn0hSDD+RpBh+Ikkx/ESSYviJJMXz/JLzCg1RrK/83f8q1n1Vyn9C\nqScfsVq7dT8/r+9OPPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8RJKyeZ6/trYWS5cuxdmzZ+Hh\n4YGcnBysWLECLS0tePjhh1FdXQ2dToc//elPGDlypCt6puug8lL+L5741zrF+kPDLijW370YpFhX\n/8768eWq4prkbDaP/F5eXti0aRO++uorfPbZZ9i6dStOnTqFvLw8JCQkoKKiAgkJCcjLy3NFv0Tk\nIDbDr9FoMGnSJACAv78/oqOjUV9fj8LCQqSnpwMA0tPTsXfvXud2SkQOdV3P+aurq/HFF19g6tSp\naGpqgkajAfD9HURzc7NTGiQi5+j3e/svXbqElJQUvPrqqwgICOj3BAaDAQaDAQDQjc7r75CInKJf\nR/7u7m6kpKRgyZIlWLhwIQBArVajsfH7L1hsbGxEUFDfL/zk5OTAZDLBZDLBG74OapuIBspm+IUQ\nyMrKQnR0NFavXm1ZnpSUBKPRCAAwGo1YsGCB87okIodTCSGE0oCjR4/innvuwYQJE+Dh8f19xYYN\nGzB16lQsWrQINTU1GDNmDHbt2oXAwEDFyQJUgZiqSnBc92STarLyJbSLPtg+oO3f/cxyxfqIbZ8O\naPt0fY6LYrSJln6Ntfmcf/r06bB2/1BcXHx9nRHRoMF3+BFJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\n8au7bwKet99mtZazs3BA2779beXz+Lrtnw1o++Q+PPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8\nRJLief6bwOknrH9l+vyhbQPadlhJl/IA5a+DoEGMR34iSTH8RJJi+IkkxfATSYrhJ5IUw08kKYaf\nSFI8z38D6Jh/h2K9eP4mhepQxzZDNw0e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSdk8z19b\nW4ulS5fi7Nmz8PDwQE5ODlasWIHc3Fy8+eabuPXWWwEAGzZswP333+/0hmXUMM1TsT7Gy/5z+e9e\nDFKse7cpf56fn+a/cdkMv5eXFzZt2oRJkybh4sWLmDx5MhITEwEAq1atwpo1a5zeJBE5ns3wazQa\naDQaAIC/vz+io6NRX1/v9MaIyLmu6zl/dXU1vvjiC0ydOhUAsGXLFsTGxiIzMxOtra19rmMwGKDX\n66HX69GNzoF3TEQO0e/wX7p0CSkpKXj11VcREBCAxx9/HJWVlSgrK4NGo8GTTz7Z53o5OTkwmUww\nmUzwhq/DGieigelX+Lu7u5GSkoIlS5Zg4cKFAAC1Wg1PT094eHhg2bJlKC0tdWqjRORYNsMvhEBW\nVhaio6OxevVqy/LGxkbLz3v27EFMTIxzOiQip7D5gt+xY8ewfft2TJgwAXFxcQC+P61XUFCAsrIy\nqFQq6HQ6vPHGG05vlq7fxgu3K9Y/vVenWBeNXzqwGxpMbIZ/+vTpEH18NzvP6RPd2PgOPyJJMfxE\nkmL4iSTF8BNJiuEnkhTDTyQplejrPJ6TBKgCMVWV4KrpiKRzXBSjTbT0ayyP/ESSYviJJMXwE0mK\n4SeSFMNPJCm
Gn0hSDD+RpFx6iW6fUR5o1VVZbp87d87y1d+DzWDtbbD2BbA3ezmyN5/q/h/PXfom\nn5/S6/UwmUzuml7RYO1tsPYFsDd7uas3PuwnkhTDTyQpz9zc3Fx3NjB58mR3Tq9osPY2WPsC2Ju9\n3NGbW5/zE5H78GE/kaQYfiJJuSX8Bw4cwLhx4xAZGYm8vDx3tGCVTqezXKNAr9e7tZfMzEwEBQX1\nuiBKS0sLEhMTERUVhcTERKvXSHRHb7m5uQgNDUVcXBzi4uKwb98+t/RWW1uLWbNmITo6GuPHj8fm\nzZsBuH/fWevLbftNuFhPT4+IiIgQlZWVorOzU8TGxory8nJXt2GVVqsV586dc3cbQgghPvroI3Hi\nxAkxfvx4y7K1a9eKjRs3CiGE2Lhxo3jqqacGTW/r1q0Tr7zyilv6+bGGhgZx4sQJIYQQbW1tIioq\nSpSXl7t931nry137zeVH/tLSUkRGRiIiIgI+Pj5ITU1FYWGhq9u4IcTHxyMwMLDXssLCQqSnpwMA\n0tPTsXfvXne01mdvg4VGo8GkSZMA9L6svLv3nbW+3MXl4a+vr0d4eLjldlhYmFt3wE+pVCrMnTsX\nkydPhsFgcHc712hqaoJGowHw/R9Tc3OzmzvqrT+XbXelH19WfjDtO3sud+9oLg+/6OPMokqlcnUb\nVh07dgyff/459u/fj61bt+Ljjz92d0s3jP5ett1VfnpZ+cHC3svdO5rLwx8WFoba2lrL7bq6OoSE\nhLi6Dat+6CUoKAjJycmD7tLjarXacoXkxsZGBAUFubmjfxtMl223dll5d++7wXS5e5eHf8qUKaio\nqEBVVRW6urqwc+dOJCUlubqNPrW3t+PixYuWnw8ePDjoLj2elJQEo9EIADAajViwYIGbO/q3wXLZ\ndmHlsvLu3nfW+nLbfnP5S4xCiKKiIhEVFSUiIiLESy+95I4W+lRZWSliY2NFbGysuP32293eW2pq\nqggODhZeXl4iNDRU5Ofni/Pnz4vZs2eLyMhIMXv2bHHhwoVB01taWpqIiYkREyZMEPPnzxcNDQ1u\n6e3IkSMCgJgwYYKYOHGimDhxoigqKnL7vrPWl7v2G9/eSyQpvsOPSFIMP5GkGH4iSTH8RJJi+Ikk\nxfATSYrhJ5LU/wOdAGX9nfSgHgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "\u003cFigure size 600x400 with 1 Axes\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pylab as plt\n", "\n", @@ -442,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 15, "metadata": { "colab": {}, "colab_type": "code", @@ -462,13 +565,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 16, "metadata": { - "colab": {}, + "colab": { + "height": 281 + }, "colab_type": "code", - "id": "CIH7G_MwbY2x" + "id": "CIH7G_MwbY2x", + "outputId": "6a65e499-6618-4b3e-94f6-1d12af8fb251" }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFxZJREFUeJzt3XtU1HXeB/D3cE0RVDSG4eKMPJBL\nIrI6ZqXhBTFrVwwpw5WEAGnLc9ZL2nbbI1arPPV4nix99jRR7aiFz7qmtIu6KhulVrJj4baYHiKI\nq6DCE4pyG7/PH51mI5nf4DAX9Pt+neM5zO/z/f2+H37ynt/M/GbmpxJCCBCRdDzc3QARuQfDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4yeF6enqgUqlQXV0NAMjOzsaGDRucPm9+fj5mzpzp9HluFgy/nYYN\nG2b55+HhgSFDhlhuv/vuu06fPzs7u1cPvr6+GDlypNPntUd+fj6effZZm+OmT5+OP/7xj07p4Ztv\nvum1v4YNGwaVSoXNmzc7Zb4bgZe7G7hRXbp0yfKzTqdDfn4+5syZY3V8T08PvLwct7vz8/ORn59v\nuZ2WloahQ4c6bPs/Zjab4enp6ZRtu0pERESv/7Ovv/4a48aNw8KFC93YlXvxyO8kzz//PB5++GEs\nXrwY/v7+2LFjB9LS0pCbm2sZc/jwYeh0Osvturo6JCcn49Zbb8XYsWOxdevWfs118eJF7NmzB+np\n6f0a/8O8L7zwAkaNGoWxY8di586dlnpaWhqWL1+OefPmwc/PD0eOHEFHRwdWr16N8PBwqNVqPPHE\nE+jo6LCsk5eXh+DgYISGhsJoNPaa76e/9/vvv4+4uDgEBAQgMjISBw8exG9/+1t8+umn+PWvf41h\nw4Zh5cqVAIBTp05hzpw5CAwMxM9+9jPs3r3bsp1z587hl7/8JQICAnDnnXeiqqqqX78/ABiNRsye\nPRvh4eH9XuemI2jAtFqtOHToUK9lzz33nPD29hYffPCBMJvN4vLly2LJkiVi3bp1ljGHDh0SWq1W\nCCFET0+PmDhxovj9738vOjs7RUVFhdBqteLw4cNCCCFKSkrEqFGj+pz/rbfeEpGRkf3u99ChQ8LT\n01OsWbNGdHR0iOLiYjFkyBBRUVEhhBBiyZIlYsSIEeKTTz4RZrNZdHR0iOXLl4sHHnhAtLS0iO++\n+07cd9994vnnnxdCCPGXv/xFBAcHi/LycnHp0iXx0EMPCQCiqqrKsr0ffu9jx46J4cOHi8OHDwuz\n2SxqamrE6dOnhRBCTJs2TbzzzjuWPtva2kRISIgwGo2iu7tbmEwmERgYaBmfkpIiUlNTRXt7uzh5\n8qQIDg4WM2bMsKw/b9488corr1zz+1+9elVotVqxffv2fu+zmxHD7wDWwj9r1qxey5TCf/ToUTF2\n7Nhe41944QWRnZ1tc/74+Hjx4osv9rvfQ4cOCW9vb9He3m5ZlpycLDZs2GDp89FHH7XUzGaz8PX1\nFdXV1ZZlH3/8seUO55FHHhHPPfecpVZeXm41/JmZmWLNmjV99vXT8O/YsUPMnDmz15jMzEzx0ksv\nia6uLuHp6Wm5wxJCiLVr1/YKvzV///vfhb+/f6/fX0Z8zu9E1/OQ8ttvv0VNTQ1GjBhhWWY2m22+\nel1VVYWjR49i27Zt19XbqFGjer1GoNVq0dDQYLn9497Pnj2Lzs5OTJw40bJM/OjzYA0NDZg2bVqv\nbVlTW1uLKVOm9KvHb7/9FseOHeu1T3p6epCRkYGmpiaYzeZefWq1WpSWltrcrtFoxEMPPeS010hu\nFAy/E6lUql63/fz8cPnyZcvts2fPWn4ODw9HVFQUvvrqq+uaY9u2bZgxY4Zi4Ppy4cIFXLlyBUOG\nDAEA1NTUQK/X99m7Wq2Gj48Pzpw5A7Vafc22NBoNamtrLbdramqszhseHo7Kyso+az/dX+Hh4UhI\nSMD+/fuvGdvd3Q0PDw/U1tYiMjLS5rw/aG9vx+7du1FUVGRz7M2OL/i5UFxcHIqKitDa2orGxka8\n9tprltpdd90FHx8fbNq0CR0dHTCbzfjyyy9x4sQJxW1u27YNGRkZ1yxPS0tDdna21fWuXr2K3Nxc\ndHV1oaSkBPv378eDDz7Y51hPT09kZ2dj5cqVOHfuHIQQqKurw8GDBwEAixYtwttvv43Tp0+jvb0d\n69evtzpvVlYW8vPz8eGHH+Lq1auoq6vDmTNnAHx/J/PNN99YxiYlJaG8vBzvvfceuru70d3djdLS\nUpw5cwbe3t544IEHsG7dOly5cgX/+te/sH37dsV9BQC7d+9GUFAQ7rnnHptjb3YMvwtlZGQgOjoa\nWq0W8+bNQ2pqqqXm5eWFffv2obS0FDqdDqNHj8Zjjz2GtrY2AEBJSUmvh78AcOTIETQ1NSElJeWa\nuWpra3s9FP+psLAw+Pn5QaPRID09Hfn5+YiKirI6ftOmTdBqtbjjjjswfPhwzJ07FxUVFQCA+fPn\nY/ny5ZgxYwZuu+02JCYmWt3O3XffjTfffBO/+c1vMHz4cMyaNcvyqGHlypUoKCjAiBEjsHr1agwf\nPhx/+9vfsGPHDmg0GgQHB+OZZ55BZ2cnAOAPf/gDWltboVarkZWVhUcffbTXXHPnzsXLL7/ca5nR\naMTSpUuveZQhI5UQ/DKPm01HRwd+/vOf48svv+zzvQWHDx9Gdna25R14JCc+578J3XLLLdf92gHJ\nhw/7iSTFh/1EkuKRn0hSLn3O76PyxS3wc+WURFLpQDu6RGe/xg4o/AcOHMCKFStgNpuRnZ2Np59+\nWnH8LfDDVFXCQKYkIgXHRXG/x9r9sN9sNmP58uXYv38/Tp06hYKCApw6dcrezRGRi9kd/tLSUkRG\nRiIiIgI+Pj5ITU1FYWGhI3sjIieyO/z19fW9PlQRFhaG+vr6a8YZDAbo9Xro9Xp0o3/PRYjI+ewO\nf19nCPt6y2ROTg5MJhNMJhO84WvvdETkYHaHPywsrNcnuerq6hASEuKQpojI+ewO/5QpU1BRUYGq\nqip0dXVh586dSEpKcmRvROREdp/q8/LywpYtW3DvvffCbDYjMzMT48ePd2RvRORELn17b4AqkOf5\niZzouChGm2jp11i+vZdIUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\nMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9I\nUgw/kaQYfiJJMfxEkmL4iSTF8BNJymsgK+t0Ovj7+8PT0xNeXl4wmUyO6ouInGxA4QeADz/8EKNH\nj3ZEL0TkQnzYTySpAYVfpVJh7ty5mDx5MgwGQ59jDAYD9Ho99Ho9utE5kOmIyIFUQghh78oNDQ0I\nCQlBc3MzEhMT8frrryM+Pt7q+ABVIKaqEuydjohsOC6K
0SZa+jV2QEf+kJAQAEBQUBCSk5NRWlo6\nkM0RkQvZHf729nZcvHjR8vPBgwcRExPjsMaIyLnsfrW/qakJycnJAICenh786le/wrx58xzWGBE5\nl93hj4iIwMmTJx3ZCxG5EE/1EUmK4SeSFMNPJCmGn0hSDD+RpAb8wR5ZXFh2l9XamEe+Vlz3dLNa\nsd7V6a1YDy1Qrg+tu2S1drXslOK6JC8e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSfE8fz89\ntfY9q7UUv1bllf9jgJPPVC5X91y2Wtt8btYAJ79xlTZrrdb8Ng1XXNer+ISj2xl0eOQnkhTDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4iSQ1oCv2XK8b+Yo97Q9OtVo7H6t8HzryK+Vd3BqtUqz7xP6fYv3l\nmPet1hKHXFFct+jyMMX6L4Za/66AgboiuhTrxzv9FOszb+m2e+7IoscU67fl/MPubbuTy67YQ0Q3\nLoafSFIMP5GkGH4iSTH8RJJi+IkkxfATSYqf5+8nvz8fV6gNbNsBA1sdrwfPtFp7aZpOee6PlK85\n8PLMSDs66h+vK1cV637/bFSsj/p4t2J9go/16x0MrVa+FoIMbB75MzMzERQUhJiYGMuylpYWJCYm\nIioqComJiWhttfFlFkQ06NgMf0ZGBg4cONBrWV5eHhISElBRUYGEhATk5eU5rUEicg6b4Y+Pj0dg\nYGCvZYWFhUhPTwcApKenY+/evc7pjoicxq7n/E1NTdBoNAAAjUaD5uZmq2MNBgMMBgMAoBud9kxH\nRE7g9Ff7c3JyYDKZYDKZ4A1fZ09HRP1kV/jVajUaG79/JbaxsRFBQUEObYqInM+u8CclJcFoNAIA\njEYjFixY4NCmiMj5bD7nX7x4MUpKSnD+/HmEhYVh/fr1ePrpp7Fo0SK89dZbGDNmDHbt2uWKXsmK\nnrNNVmt+u63XAMBsY9t+f75gR0eO0ZR9l2J9vI/yn+9/tYyzWtO9843iuj2K1ZuDzfAXFBT0uby4\nuNjhzRCR6/DtvUSSYviJJMXwE0mK4SeSFMNPJCl+pJfcxksbrljf8uwWxbq3ylOxvmvzHKu1UY2f\nKq4rAx75iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ8Tw/uc3pVaGK9Sm+ypcuL+9Svvx44KnL\n192TTHjkJ5IUw08kKYafSFIMP5GkGH4iSTH8RJJi+IkkxfP85FSdv5hitfb5g/9tY23lKzw9vmKF\nYn3IJ6U2ti83HvmJJMXwE0mK4SeSFMNPJCmGn0hSDD+RpBh+IknxPD85Vc191o8vw1TK5/EXVyUq\n1oceOKlYF4pVsnnkz8zMRFBQEGJiYizLcnNzERoairi4OMTFxWHfvn1ObZKIHM9m+DMyMnDgwIFr\nlq9atQplZWUoKyvD/fff75TmiMh5bIY/Pj4egYGBruiFiFzI7hf8tmzZgtjYWGRmZqK1tdXqOIPB\nAL1eD71ej2502jsdETmYXeF//PHHUVlZibKyMmg0Gjz55JNWx+bk5MBkMsFkMsHbxgc1iMh17Aq/\nWq2Gp6cnPDw8sGzZMpSW8tNTRDcau8Lf2Nho+XnPnj29zgQQ0Y3B5nn+xYsXo6SkBOfPn0dYWBjW\nr1+PkpISlJWVQaVSQafT4Y033nBFrzQIefj7K9Yfueeo1Vrb1Q7FdZs3RCjWfTv/oVgnZTbDX1BQ\ncM2yrKwspzRDRK7Dt/cSSYrhJ5IUw08kKYafSFIMP5Gk+JFeGpCK3PGK9b+O/h+rtQUVKYrr+u7j\nqTxn4pGfSFIMP5GkGH4iSTH8RJJi+IkkxfATSYrhJ5IUz/OTou/S7lSs//Ph1xTrlT3dVmuX/jNM\ncV1fNCrWaWB45CeSFMNPJCmGn0hSDD+RpBh+Ikkx/ESSYviJJMXz/JLzCg1RrK/83f8q1n1Vyn9C\nqScfsVq7dT8/r+9OPPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8RJKyeZ6/trYWS5cuxdmzZ+Hh\n4YGcnBysWLECLS0tePjhh1FdXQ2dToc//elPGDlypCt6puug8lL+L5741zrF+kPDLijW370YpFhX\n/8768eWq4prkbDaP/F5eXti0aRO++uorfPbZZ9i6dStOnTqFvLw8JCQkoKKiAgkJCcjLy3NFv0Tk\nIDbDr9FoMGnSJACAv78/oqOjUV9fj8LCQqSnpwMA0tPTsXfvXud2SkQOdV3P+aurq/HFF19g6tSp\naGpqgkajAfD9HURzc7NTGiQi5+j3e/svXbqElJQUvPrqqwgICOj3BAaDAQaDAQDQjc7r75CInKJf\nR/7u7m6kpKRgyZIlWLhwIQBArVajsfH7L1hsbGxEUFDfL/zk5OTAZDLBZDLBG74OapuIBspm+IUQ\nyMrKQnR0NFavXm1ZnpSUBKPRCAAwGo1YsGCB87okIodTCSGE0oCjR4/innvuwYQJE+Dh8f19xYYN\nGzB16lQsWrQINTU1GDNmDHbt2oXAwEDFyQJUgZiqSnBc92STarLyJbSLPtg+oO3f/cxyxfqIbZ8O\naPt0fY6LYrSJln6Ntfmcf/r06bB2/1BcXHx9nRHRoMF3+BFJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\n8au7bwKet99mtZazs3BA2779beXz+Lrtnw1o++Q+PPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8\nRJLief6bwOknrH9l+vyhbQPadlhJl/IA5a+DoEGMR34iSTH8RJJi+IkkxfATSYrhJ5IUw08kKYaf\nSFI8z38D6Jh/h2K9eP4mhepQxzZDNw0e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSdk8z19b\nW4ulS5fi7Nmz8PDwQE5ODlasWIHc3Fy8+eabuPXWWwEAGzZswP333+/0hmXUMM1TsT7Gy/5z+e9e\nDFKse7cpf56fn+a/cdkMv5eXFzZt2oRJkybh4sWLmDx5MhITEwEAq1atwpo1a5zeJBE5ns3wazQa\naDQaAIC/vz+io6NRX1/v9MaIyLmu6zl/dXU1vvjiC0ydOhUAsGXLFsTGxiIzMxOtra19rmMwGKDX\n66HX69GNzoF3TEQO0e/wX7p0CSkpKXj11VcREBCAxx9/HJWVlSgrK4NGo8GTTz7Z53o5OTkwmUww\nmUzwhq/DGieigelX+Lu7u5GSkoIlS5Zg4cKFAAC1Wg1PT094eHhg2bJlKC0tdWqjRORYNsMvhEBW\nVhaio6OxevVqy/LGxkbLz3v27EFMTIxzOiQip7D5gt+xY8ewfft2TJgwAXFxcQC+P61XUFCAsrIy\nqFQq6HQ6vPHGG05vlq7fxgu3K9Y/vVenWBeNXzqwGxpMbIZ/+vTpEH18NzvP6RPd2PgOPyJJMfxE\nkmL4iSTF8BNJiuEnkhTDTyQplejrPJ6TBKgCMVWV4KrpiKRzXBSjTbT0ayyP/ESSYviJJMXwE0mK\n4SeSFMNPJCm
Gn0hSDD+RpFx6iW6fUR5o1VVZbp87d87y1d+DzWDtbbD2BbA3ezmyN5/q/h/PXfom\nn5/S6/UwmUzuml7RYO1tsPYFsDd7uas3PuwnkhTDTyQpz9zc3Fx3NjB58mR3Tq9osPY2WPsC2Ju9\n3NGbW5/zE5H78GE/kaQYfiJJuSX8Bw4cwLhx4xAZGYm8vDx3tGCVTqezXKNAr9e7tZfMzEwEBQX1\nuiBKS0sLEhMTERUVhcTERKvXSHRHb7m5uQgNDUVcXBzi4uKwb98+t/RWW1uLWbNmITo6GuPHj8fm\nzZsBuH/fWevLbftNuFhPT4+IiIgQlZWVorOzU8TGxory8nJXt2GVVqsV586dc3cbQgghPvroI3Hi\nxAkxfvx4y7K1a9eKjRs3CiGE2Lhxo3jqqacGTW/r1q0Tr7zyilv6+bGGhgZx4sQJIYQQbW1tIioq\nSpSXl7t931nry137zeVH/tLSUkRGRiIiIgI+Pj5ITU1FYWGhq9u4IcTHxyMwMLDXssLCQqSnpwMA\n0tPTsXfvXne01mdvg4VGo8GkSZMA9L6svLv3nbW+3MXl4a+vr0d4eLjldlhYmFt3wE+pVCrMnTsX\nkydPhsFgcHc712hqaoJGowHw/R9Tc3OzmzvqrT+XbXelH19WfjDtO3sud+9oLg+/6OPMokqlcnUb\nVh07dgyff/459u/fj61bt+Ljjz92d0s3jP5ett1VfnpZ+cHC3svdO5rLwx8WFoba2lrL7bq6OoSE\nhLi6Dat+6CUoKAjJycmD7tLjarXacoXkxsZGBAUFubmjfxtMl223dll5d++7wXS5e5eHf8qUKaio\nqEBVVRW6urqwc+dOJCUlubqNPrW3t+PixYuWnw8ePDjoLj2elJQEo9EIADAajViwYIGbO/q3wXLZ\ndmHlsvLu3nfW+nLbfnP5S4xCiKKiIhEVFSUiIiLESy+95I4W+lRZWSliY2NFbGysuP32293eW2pq\nqggODhZeXl4iNDRU5Ofni/Pnz4vZs2eLyMhIMXv2bHHhwoVB01taWpqIiYkREyZMEPPnzxcNDQ1u\n6e3IkSMCgJgwYYKYOHGimDhxoigqKnL7vrPWl7v2G9/eSyQpvsOPSFIMP5GkGH4iSTH8RJJi+Ikk\nxfATSYrhJ5LU/wOdAGX9nfSgHgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "\u003cFigure size 600x400 with 1 Axes\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], "source": [ "plt.imshow(test_images[0])\n", "template = \"True:{true}, predicted:{predict}\"\n", @@ -489,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 17, "metadata": { "colab": {}, "colab_type": "code", @@ -531,13 +650,24 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 18, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "T5mWkSbMcU5z" + "id": "T5mWkSbMcU5z", + "outputId": "818e9142-70cf-420b-8e64-38c2ca11a370" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.956\n" + ] + } + ], "source": [ "print(evaluate_model(interpreter))" ] @@ -554,13 +684,24 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 19, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "-9cnwiPp6EGm" + "id": "-9cnwiPp6EGm", + "outputId": "53e00eac-51af-4030-be1a-3df986640f8d" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.956\n" + ] + } + ], "source": [ "# NOTE: Colab runs on server CPUs. At the time of writing this, TensorFlow Lite\n", "# doesn't have super optimized server CPU kernels. 
For this reason this may be\n", @@ -599,8 +740,7 @@ "metadata": { "colab": { "collapsed_sections": [], - "name": "post_training-float16-quant.ipynb", - "private_outputs": true, + "name": "post_training_float16_quant.ipynb", "provenance": [], "toc_visible": true }, diff --git a/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb index ad461f56d6f..8397dbfa69f 100644 --- a/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 17, "metadata": { "cellView": "form", "colab": {}, @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 18, "metadata": { "colab": {}, "colab_type": "code", @@ -135,13 +135,36 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 19, "metadata": { - "colab": {}, + "colab": { + "height": 51 + }, "colab_type": "code", - "id": "eMsw_6HujaqM" + "id": "eMsw_6HujaqM", + "outputId": "5662a5f3-fc64-458f-958a-98f9c6348143" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1875/1875 [==============================] - 2s 1ms/step - loss: 0.2782 - accuracy: 0.9221 - val_loss: 0.1230 - val_accuracy: 0.9664\n" + ] + }, + { + "data": { + "text/plain": [ + "\u003ctensorflow.python.keras.callbacks.History at 0x7f33f1817588\u003e" + ] + }, + "execution_count": 19, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "# Load MNIST dataset\n", "mnist = keras.datasets.mnist\n", @@ -199,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 20, "metadata": { "colab": {}, "colab_type": "code", @@ -223,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 21, "metadata": { "colab": {}, "colab_type": "code", @@ -237,13 +260,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 22, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "Ie9pQaQrn5ue" + "id": "Ie9pQaQrn5ue", + "outputId": "8580b835-61f0-42b3-a21e-b8d476042c11" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "84528" + ] + }, + "execution_count": 22, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tflite_model_file = tflite_models_dir/\"mnist_model.tflite\"\n", "tflite_model_file.write_bytes(tflite_model)" @@ -266,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 23, "metadata": { "colab": {}, "colab_type": "code", @@ -289,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 24, "metadata": { "colab": {}, "colab_type": "code", @@ -319,13 +358,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 25, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "yuNfl3CoHNK3" + "id": "yuNfl3CoHNK3", + "outputId": "79a19679-87a2-4dc6-eee4-b33f3e5c1c5d" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "24720" + ] + }, + "execution_count": 25, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tflite_model_quant = converter.convert()\n", "tflite_model_quant_file = tflite_models_dir/\"mnist_model_quant.tflite\"\n", @@ -344,13 +399,27 @@ }, { "cell_type": "code", - "execution_count": 
0, + "execution_count": 26, "metadata": { - "colab": {}, + "colab": { + "height": 85 + }, "colab_type": "code", - "id": "JExfcfLDscu4" + "id": "JExfcfLDscu4", + "outputId": "58238f92-01b0-4faa-e293-35451d08dd7c" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 140K\n", + "-rw-rw-r-- 1 yashkatariya 10086651 25K Jun 23 06:06 mnist_model_quant_io.tflite\n", + "-rw-rw-r-- 1 yashkatariya 10086651 25K Jun 23 06:07 mnist_model_quant.tflite\n", + "-rw-rw-r-- 1 yashkatariya 10086651 83K Jun 23 06:06 mnist_model.tflite\n" + ] + } + ], "source": [ "!ls -lh {tflite_models_dir}" ] @@ -369,13 +438,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 27, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "kzjEjcDs3BHa" + "id": "kzjEjcDs3BHa", + "outputId": "8d7370ec-3f3f-41a2-8afb-4ecdd40e9efc" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "24784" + ] + }, + "execution_count": 27, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", "converter.inference_input_type = tf.uint8\n", @@ -423,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 28, "metadata": { "colab": {}, "colab_type": "code", @@ -437,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 29, "metadata": { "colab": {}, "colab_type": "code", @@ -465,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 30, "metadata": { "colab": {}, "colab_type": "code", @@ -484,13 +569,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 31, "metadata": { - "colab": {}, + "colab": { + "height": 281 + }, "colab_type": "code", - "id": "XZClM2vo3_bm" + "id": "XZClM2vo3_bm", + "outputId": "3af2e31c-44c6-41f2-c51f-da9d7b71bdfb" }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFxZJREFUeJzt3XtU1HXeB/D3cE0RVDSG4eKMPJBL\nIrI6ZqXhBTFrVwwpw5WEAGnLc9ZL2nbbI1arPPV4nix99jRR7aiFz7qmtIu6KhulVrJj4baYHiKI\nq6DCE4pyG7/PH51mI5nf4DAX9Pt+neM5zO/z/f2+H37ynt/M/GbmpxJCCBCRdDzc3QARuQfDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4yeF6enqgUqlQXV0NAMjOzsaGDRucPm9+fj5mzpzp9HluFgy/nYYN\nG2b55+HhgSFDhlhuv/vuu06fPzs7u1cPvr6+GDlypNPntUd+fj6effZZm+OmT5+OP/7xj07p4Ztv\nvum1v4YNGwaVSoXNmzc7Zb4bgZe7G7hRXbp0yfKzTqdDfn4+5syZY3V8T08PvLwct7vz8/ORn59v\nuZ2WloahQ4c6bPs/Zjab4enp6ZRtu0pERESv/7Ovv/4a48aNw8KFC93YlXvxyO8kzz//PB5++GEs\nXrwY/v7+2LFjB9LS0pCbm2sZc/jwYeh0Osvturo6JCcn49Zbb8XYsWOxdevWfs118eJF7NmzB+np\n6f0a/8O8L7zwAkaNGoWxY8di586dlnpaWhqWL1+OefPmwc/PD0eOHEFHRwdWr16N8PBwqNVqPPHE\nE+jo6LCsk5eXh+DgYISGhsJoNPaa76e/9/vvv4+4uDgEBAQgMjISBw8exG9/+1t8+umn+PWvf41h\nw4Zh5cqVAIBTp05hzpw5CAwMxM9+9jPs3r3bsp1z587hl7/8JQICAnDnnXeiqqqqX78/ABiNRsye\nPRvh4eH9XuemI2jAtFqtOHToUK9lzz33nPD29hYffPCBMJvN4vLly2LJkiVi3bp1ljGHDh0SWq1W\nCCFET0+PmDhxovj9738vOjs7RUVFhdBqteLw4cNCCCFKSkrEqFGj+pz/rbfeEpGRkf3u99ChQ8LT\n01OsWbNGdHR0iOLiYjFkyBBRUVEhhBBiyZIlYsSIEeKTTz4RZrNZdHR0iOXLl4sHHnhAtLS0iO++\n+07cd9994vnnnxdCCPGXv/xFBAcHi/LycnHp0iXx0EMPCQCiqqrKsr0ffu9jx46J4cOHi8OHDwuz\n2SxqamrE6dOnhRBCTJs2TbzzzjuWPtva2kRISIgwGo2iu7tbmEwmERgYaBmfkpIiUlNTRXt7uzh5\n8qQIDg4WM2bMsKw/b9488corr1zz+1+9elVotVqxffv2fu+zmxHD7wDWwj9r1qxey5TCf/ToUTF2\n7Nhe41944QWRnZ1tc/74+Hjx4osv9rvfQ4cOCW9vb9He3m5ZlpycLDZs2GDp89FHH7XUzGaz8PX1\nFdXV1ZZlH3/8seUO55FHHhHPPfecpVZeXm41/JmZmWLNmjV99vXT8O/YsUPMnDmz15jMzEzx0ksv\nia6uLuHp6Wm5wxJCiLVr1/YKvzV///vfhb+/f6/fX0Z8zu9E1/OQ8ttvv0VNTQ1GjBhhWWY2m22+\nel1VVYWjR49i27Zt19XbqFGjer1GoNVq0dDQYLn9497Pnj2Lzs5OTJw40bJM/OjzYA0NDZg2bVqv\nbVlTW1uLKVOm9KvHb7/9FseOHeu1T3p6epCRkYGmpiaYzeZefWq1WpSWltrcrtFoxEMPPeS010hu\nFAy/E6lUql63/fz8cPnyZcvts2fPWn4ODw9HVFQUvvrqq+uaY9u2bZgxY4Zi4Ppy4cIFXLlyBUOG\nDAEA1NTUQK/X99m7Wq2Gj48Pzpw5A7Vafc22NBoNamtrLbdramqszhseHo7Kyso+az/dX+Hh4UhI\nSMD+/fuvGdvd3Q0PDw/U1tYiMjLS5rw/aG9vx+7du1FUVGRz7M2OL/i5UFxcHIqKitDa2orGxka8\n9tprltpdd90FHx8fbNq0CR0dHTCbzfjyyy9x4sQJxW1u27YNGRkZ1yxPS0tDdna21fWuXr2K3Nxc\ndHV1oaSkBPv378eDDz7Y51hPT09kZ2dj5cqVOHfuHIQQqKurw8GDBwEAixYtwttvv43Tp0+jvb0d\n69evtzpvVlYW8vPz8eGHH+Lq1auoq6vDmTNnAHx/J/PNN99YxiYlJaG8vBzvvfceuru70d3djdLS\nUpw5cwbe3t544IEHsG7dOly5cgX/+te/sH37dsV9BQC7d+9GUFAQ7rnnHptjb3YMvwtlZGQgOjoa\nWq0W8+bNQ2pqqqXm5eWFffv2obS0FDqdDqNHj8Zjjz2GtrY2AEBJSUmvh78AcOTIETQ1NSElJeWa\nuWpra3s9FP+psLAw+Pn5QaPRID09Hfn5+YiKirI6ftOmTdBqtbjjjjswfPhwzJ07FxUVFQCA+fPn\nY/ny5ZgxYwZuu+02JCYmWt3O3XffjTfffBO/+c1vMHz4cMyaNcvyqGHlypUoKCjAiBEjsHr1agwf\nPhx/+9vfsGPHDmg0GgQHB+OZZ55BZ2cnAOAPf/gDWltboVarkZWVhUcffbTXXHPnzsXLL7/ca5nR\naMTSpUuveZQhI5UQ/DKPm01HRwd+/vOf48svv+zzvQWHDx9Gdna25R14JCc+578J3XLLLdf92gHJ\nhw/7iSTFh/1EkuKRn0hSLn3O76PyxS3wc+WURFLpQDu6RGe/xg4o/AcOHMCKFStgNpuRnZ2Np59+\nWnH8LfDDVFXCQKYkIgXHRXG/x9r9sN9sNmP58uXYv38/Tp06hYKCApw6dcrezRGRi9kd/tLSUkRG\nRiIiIgI+Pj5ITU1FYWGhI3sjIieyO/z19fW9PlQRFhaG+vr6a8YZDAbo9Xro9Xp0o3/PRYjI+ewO\nf19nCPt6y2ROTg5MJhNMJhO84WvvdETkYHaHPywsrNcnuerq6hASEuKQpojI+ewO/5QpU1BRUYGq\nqip0dXVh586dSEpKcmRvROREdp/q8/LywpYtW3DvvffCbDYjMzMT48ePd2RvRORELn17b4AqkOf5\niZzouChGm2jp11i+vZdIUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\nMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9I\nUgw/kaQYfiJJMfxEkmL4iSTF8BNJymsgK+t0Ovj7+8PT0xNeXl4wmUyO6ouInGxA4QeADz/8EKNH\nj3ZEL0TkQnzYTySpAYVfpVJh7ty5mDx5MgwGQ59jDAYD9Ho99Ho9utE5kOmIyIFUQghh78oNDQ0I\nCQlBc3MzEhMT8frrryM+Pt7q+ABVIKaqEuydjohsOC6K
0SZa+jV2QEf+kJAQAEBQUBCSk5NRWlo6\nkM0RkQvZHf729nZcvHjR8vPBgwcRExPjsMaIyLnsfrW/qakJycnJAICenh786le/wrx58xzWGBE5\nl93hj4iIwMmTJx3ZCxG5EE/1EUmK4SeSFMNPJCmGn0hSDD+RpAb8wR5ZXFh2l9XamEe+Vlz3dLNa\nsd7V6a1YDy1Qrg+tu2S1drXslOK6JC8e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSfE8fz89\ntfY9q7UUv1bllf9jgJPPVC5X91y2Wtt8btYAJ79xlTZrrdb8Ng1XXNer+ISj2xl0eOQnkhTDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4iSQ1oCv2XK8b+Yo97Q9OtVo7H6t8HzryK+Vd3BqtUqz7xP6fYv3l\nmPet1hKHXFFct+jyMMX6L4Za/66AgboiuhTrxzv9FOszb+m2e+7IoscU67fl/MPubbuTy67YQ0Q3\nLoafSFIMP5GkGH4iSTH8RJJi+IkkxfATSYqf5+8nvz8fV6gNbNsBA1sdrwfPtFp7aZpOee6PlK85\n8PLMSDs66h+vK1cV637/bFSsj/p4t2J9go/16x0MrVa+FoIMbB75MzMzERQUhJiYGMuylpYWJCYm\nIioqComJiWhttfFlFkQ06NgMf0ZGBg4cONBrWV5eHhISElBRUYGEhATk5eU5rUEicg6b4Y+Pj0dg\nYGCvZYWFhUhPTwcApKenY+/evc7pjoicxq7n/E1NTdBoNAAAjUaD5uZmq2MNBgMMBgMAoBud9kxH\nRE7g9Ff7c3JyYDKZYDKZ4A1fZ09HRP1kV/jVajUaG79/JbaxsRFBQUEObYqInM+u8CclJcFoNAIA\njEYjFixY4NCmiMj5bD7nX7x4MUpKSnD+/HmEhYVh/fr1ePrpp7Fo0SK89dZbGDNmDHbt2uWKXsmK\nnrNNVmt+u63XAMBsY9t+f75gR0eO0ZR9l2J9vI/yn+9/tYyzWtO9843iuj2K1ZuDzfAXFBT0uby4\nuNjhzRCR6/DtvUSSYviJJMXwE0mK4SeSFMNPJCl+pJfcxksbrljf8uwWxbq3ylOxvmvzHKu1UY2f\nKq4rAx75iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ8Tw/uc3pVaGK9Sm+ypcuL+9Svvx44KnL\n192TTHjkJ5IUw08kKYafSFIMP5GkGH4iSTH8RJJi+IkkxfP85FSdv5hitfb5g/9tY23lKzw9vmKF\nYn3IJ6U2ti83HvmJJMXwE0mK4SeSFMNPJCmGn0hSDD+RpBh+IknxPD85Vc191o8vw1TK5/EXVyUq\n1oceOKlYF4pVsnnkz8zMRFBQEGJiYizLcnNzERoairi4OMTFxWHfvn1ObZKIHM9m+DMyMnDgwIFr\nlq9atQplZWUoKyvD/fff75TmiMh5bIY/Pj4egYGBruiFiFzI7hf8tmzZgtjYWGRmZqK1tdXqOIPB\nAL1eD71ej2502jsdETmYXeF//PHHUVlZibKyMmg0Gjz55JNWx+bk5MBkMsFkMsHbxgc1iMh17Aq/\nWq2Gp6cnPDw8sGzZMpSW8tNTRDcau8Lf2Nho+XnPnj29zgQQ0Y3B5nn+xYsXo6SkBOfPn0dYWBjW\nr1+PkpISlJWVQaVSQafT4Y033nBFrzQIefj7K9Yfueeo1Vrb1Q7FdZs3RCjWfTv/oVgnZTbDX1BQ\ncM2yrKwspzRDRK7Dt/cSSYrhJ5IUw08kKYafSFIMP5Gk+JFeGpCK3PGK9b+O/h+rtQUVKYrr+u7j\nqTxn4pGfSFIMP5GkGH4iSTH8RJJi+IkkxfATSYrhJ5IUz/OTou/S7lSs//Ph1xTrlT3dVmuX/jNM\ncV1fNCrWaWB45CeSFMNPJCmGn0hSDD+RpBh+Ikkx/ESSYviJJMXz/JLzCg1RrK/83f8q1n1Vyn9C\nqScfsVq7dT8/r+9OPPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8RJKyeZ6/trYWS5cuxdmzZ+Hh\n4YGcnBysWLECLS0tePjhh1FdXQ2dToc//elPGDlypCt6puug8lL+L5741zrF+kPDLijW370YpFhX\n/8768eWq4prkbDaP/F5eXti0aRO++uorfPbZZ9i6dStOnTqFvLw8JCQkoKKiAgkJCcjLy3NFv0Tk\nIDbDr9FoMGnSJACAv78/oqOjUV9fj8LCQqSnpwMA0tPTsXfvXud2SkQOdV3P+aurq/HFF19g6tSp\naGpqgkajAfD9HURzc7NTGiQi5+j3e/svXbqElJQUvPrqqwgICOj3BAaDAQaDAQDQjc7r75CInKJf\nR/7u7m6kpKRgyZIlWLhwIQBArVajsfH7L1hsbGxEUFDfL/zk5OTAZDLBZDLBG74OapuIBspm+IUQ\nyMrKQnR0NFavXm1ZnpSUBKPRCAAwGo1YsGCB87okIodTCSGE0oCjR4/innvuwYQJE+Dh8f19xYYN\nGzB16lQsWrQINTU1GDNmDHbt2oXAwEDFyQJUgZiqSnBc92STarLyJbSLPtg+oO3f/cxyxfqIbZ8O\naPt0fY6LYrSJln6Ntfmcf/r06bB2/1BcXHx9nRHRoMF3+BFJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\n8au7bwKet99mtZazs3BA2779beXz+Lrtnw1o++Q+PPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8\nRJLief6bwOknrH9l+vyhbQPadlhJl/IA5a+DoEGMR34iSTH8RJJi+IkkxfATSYrhJ5IUw08kKYaf\nSFI8z38D6Jh/h2K9eP4mhepQxzZDNw0e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSdk8z19b\nW4ulS5fi7Nmz8PDwQE5ODlasWIHc3Fy8+eabuPXWWwEAGzZswP333+/0hmXUMM1TsT7Gy/5z+e9e\nDFKse7cpf56fn+a/cdkMv5eXFzZt2oRJkybh4sWLmDx5MhITEwEAq1atwpo1a5zeJBE5ns3wazQa\naDQaAIC/vz+io6NRX1/v9MaIyLmu6zl/dXU1vvjiC0ydOhUAsGXLFsTGxiIzMxOtra19rmMwGKDX\n66HX69GNzoF3TEQO0e/wX7p0CSkpKXj11VcREBCAxx9/HJWVlSgrK4NGo8GTTz7Z53o5OTkwmUww\nmUzwhq/DGieigelX+Lu7u5GSkoIlS5Zg4cKFAAC1Wg1PT094eHhg2bJlKC0tdWqjRORYNsMvhEBW\nVhaio6OxevVqy/LGxkbLz3v27EFMTIxzOiQip7D5gt+xY8ewfft2TJgwAXFxcQC+P61XUFCAsrIy\nqFQq6HQ6vPHGG05vlq7fxgu3K9Y/vVenWBeNXzqwGxpMbIZ/+vTpEH18NzvP6RPd2PgOPyJJMfxE\nkmL4iSTF8BNJiuEnkhTDTyQplejrPJ6TBKgCMVWV4KrpiKRzXBSjTbT0ayyP/ESSYviJJMXwE0mK\n4SeSFMNPJCm
Gn0hSDD+RpFx6iW6fUR5o1VVZbp87d87y1d+DzWDtbbD2BbA3ezmyN5/q/h/PXfom\nn5/S6/UwmUzuml7RYO1tsPYFsDd7uas3PuwnkhTDTyQpz9zc3Fx3NjB58mR3Tq9osPY2WPsC2Ju9\n3NGbW5/zE5H78GE/kaQYfiJJuSX8Bw4cwLhx4xAZGYm8vDx3tGCVTqezXKNAr9e7tZfMzEwEBQX1\nuiBKS0sLEhMTERUVhcTERKvXSHRHb7m5uQgNDUVcXBzi4uKwb98+t/RWW1uLWbNmITo6GuPHj8fm\nzZsBuH/fWevLbftNuFhPT4+IiIgQlZWVorOzU8TGxory8nJXt2GVVqsV586dc3cbQgghPvroI3Hi\nxAkxfvx4y7K1a9eKjRs3CiGE2Lhxo3jqqacGTW/r1q0Tr7zyilv6+bGGhgZx4sQJIYQQbW1tIioq\nSpSXl7t931nry137zeVH/tLSUkRGRiIiIgI+Pj5ITU1FYWGhq9u4IcTHxyMwMLDXssLCQqSnpwMA\n0tPTsXfvXne01mdvg4VGo8GkSZMA9L6svLv3nbW+3MXl4a+vr0d4eLjldlhYmFt3wE+pVCrMnTsX\nkydPhsFgcHc712hqaoJGowHw/R9Tc3OzmzvqrT+XbXelH19WfjDtO3sud+9oLg+/6OPMokqlcnUb\nVh07dgyff/459u/fj61bt+Ljjz92d0s3jP5ett1VfnpZ+cHC3svdO5rLwx8WFoba2lrL7bq6OoSE\nhLi6Dat+6CUoKAjJycmD7tLjarXacoXkxsZGBAUFubmjfxtMl223dll5d++7wXS5e5eHf8qUKaio\nqEBVVRW6urqwc+dOJCUlubqNPrW3t+PixYuWnw8ePDjoLj2elJQEo9EIADAajViwYIGbO/q3wXLZ\ndmHlsvLu3nfW+nLbfnP5S4xCiKKiIhEVFSUiIiLESy+95I4W+lRZWSliY2NFbGysuP32293eW2pq\nqggODhZeXl4iNDRU5Ofni/Pnz4vZs2eLyMhIMXv2bHHhwoVB01taWpqIiYkREyZMEPPnzxcNDQ1u\n6e3IkSMCgJgwYYKYOHGimDhxoigqKnL7vrPWl7v2G9/eSyQpvsOPSFIMP5GkGH4iSTH8RJJi+Ikk\nxfATSYrhJ5LU/wOdAGX9nfSgHgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "\u003cFigure size 600x400 with 1 Axes\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pylab as plt\n", "\n", @@ -513,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 33, "metadata": { "colab": {}, "colab_type": "code", @@ -530,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -557,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -599,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -622,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -653,7 +754,6 @@ "colab": { "collapsed_sections": [], "name": "post_training_integer_quant.ipynb", - "private_outputs": true, "provenance": [], "toc_visible": true }, diff --git a/tensorflow/lite/g3doc/performance/post_training_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_quant.ipynb index 201ccf5bdc3..5341fe5e4fb 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_quant.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 1, "metadata": { "cellView": "form", "colab": {}, @@ -126,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 2, "metadata": { "colab": {}, "colab_type": "code", @@ -155,13 +155,36 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 3, "metadata": { - "colab": {}, + "colab": { + "height": 51 + }, "colab_type": "code", - "id": "hWSAjQWagIHl" + "id": "hWSAjQWagIHl", + "outputId": "961899f8-1597-4417-b21d-cae94a330ecc" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1875/1875 [==============================] - 10s 5ms/step - loss: 0.2787 - accuracy: 0.9203 - val_loss: 0.1323 - val_accuracy: 0.9624\n" + ] + }, + { + "data": { + "text/plain": [ + "\u003ctensorflow.python.keras.callbacks.History at 0x7f6443480e80\u003e" + ] + }, + 
"execution_count": 3, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "# Load MNIST dataset\n", "mnist = keras.datasets.mnist\n", @@ -200,8 +223,7 @@ "id": "5NMaNZQCkW9X" }, "source": [ - "For the example, since you trained the model for just a single epoch, so it only trains to ~96% accuracy.\n", - "\n" + "For the example, since you trained the model for just a single epoch, so it only trains to ~96% accuracy.\n" ] }, { @@ -220,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 4, "metadata": { "colab": {}, "colab_type": "code", @@ -244,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", @@ -258,13 +280,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 6, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "Ie9pQaQrn5ue" + "id": "Ie9pQaQrn5ue", + "outputId": "046db0bc-1745-4e94-9f21-f7e91bdaebda" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "84452" + ] + }, + "execution_count": 6, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "tflite_model_file = tflite_models_dir/\"mnist_model.tflite\"\n", "tflite_model_file.write_bytes(tflite_model)" @@ -282,13 +320,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 7, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "g8PUvLWDlmmz" + "id": "g8PUvLWDlmmz", + "outputId": "d79b45d3-babf-4890-8036-de2f497da88a" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "23840" + ] + }, + "execution_count": 7, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", "tflite_quant_model = converter.convert()\n", @@ -308,13 +362,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 8, "metadata": { - "colab": {}, + "colab": { + "height": 119 + }, "colab_type": "code", - "id": "JExfcfLDscu4" + "id": "JExfcfLDscu4", + "outputId": "d1fda4c2-343e-40fb-f90f-b6bde00c523e" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 214M\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 44K Jun 23 06:04 mnist_model_quant_f16.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 24K Jun 23 06:12 mnist_model_quant.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 83K Jun 23 06:12 mnist_model.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 44M Jun 23 06:10 resnet_v2_101_quantized.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 171M Jun 23 06:09 resnet_v2_101.tflite\n" + ] + } + ], "source": [ "!ls -lh {tflite_models_dir}" ] @@ -329,8 +399,7 @@ "## Run the TFLite models\n", "\n", "Run the TensorFlow Lite model using the Python TensorFlow Lite\n", - "Interpreter.\n", - "\n" + "Interpreter.\n" ] }, { @@ -345,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", @@ -359,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 10, "metadata": { "colab": {}, "colab_type": "code", @@ -383,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", @@ -403,13 +472,29 @@ }, { "cell_type": "code", - "execution_count": 0, + 
"execution_count": 12, "metadata": { - "colab": {}, + "colab": { + "height": 281 + }, "colab_type": "code", - "id": "XZClM2vo3_bm" + "id": "XZClM2vo3_bm", + "outputId": "0fa4155b-01f8-4fea-f586-d9044d73572e" }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFxZJREFUeJzt3XtU1HXeB/D3cE0RVDSG4eKMPJBL\nIrI6ZqXhBTFrVwwpw5WEAGnLc9ZL2nbbI1arPPV4nix99jRR7aiFz7qmtIu6KhulVrJj4baYHiKI\nq6DCE4pyG7/PH51mI5nf4DAX9Pt+neM5zO/z/f2+H37ynt/M/GbmpxJCCBCRdDzc3QARuQfDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4yeF6enqgUqlQXV0NAMjOzsaGDRucPm9+fj5mzpzp9HluFgy/nYYN\nG2b55+HhgSFDhlhuv/vuu06fPzs7u1cPvr6+GDlypNPntUd+fj6effZZm+OmT5+OP/7xj07p4Ztv\nvum1v4YNGwaVSoXNmzc7Zb4bgZe7G7hRXbp0yfKzTqdDfn4+5syZY3V8T08PvLwct7vz8/ORn59v\nuZ2WloahQ4c6bPs/Zjab4enp6ZRtu0pERESv/7Ovv/4a48aNw8KFC93YlXvxyO8kzz//PB5++GEs\nXrwY/v7+2LFjB9LS0pCbm2sZc/jwYeh0Osvturo6JCcn49Zbb8XYsWOxdevWfs118eJF7NmzB+np\n6f0a/8O8L7zwAkaNGoWxY8di586dlnpaWhqWL1+OefPmwc/PD0eOHEFHRwdWr16N8PBwqNVqPPHE\nE+jo6LCsk5eXh+DgYISGhsJoNPaa76e/9/vvv4+4uDgEBAQgMjISBw8exG9/+1t8+umn+PWvf41h\nw4Zh5cqVAIBTp05hzpw5CAwMxM9+9jPs3r3bsp1z587hl7/8JQICAnDnnXeiqqqqX78/ABiNRsye\nPRvh4eH9XuemI2jAtFqtOHToUK9lzz33nPD29hYffPCBMJvN4vLly2LJkiVi3bp1ljGHDh0SWq1W\nCCFET0+PmDhxovj9738vOjs7RUVFhdBqteLw4cNCCCFKSkrEqFGj+pz/rbfeEpGRkf3u99ChQ8LT\n01OsWbNGdHR0iOLiYjFkyBBRUVEhhBBiyZIlYsSIEeKTTz4RZrNZdHR0iOXLl4sHHnhAtLS0iO++\n+07cd9994vnnnxdCCPGXv/xFBAcHi/LycnHp0iXx0EMPCQCiqqrKsr0ffu9jx46J4cOHi8OHDwuz\n2SxqamrE6dOnhRBCTJs2TbzzzjuWPtva2kRISIgwGo2iu7tbmEwmERgYaBmfkpIiUlNTRXt7uzh5\n8qQIDg4WM2bMsKw/b9488corr1zz+1+9elVotVqxffv2fu+zmxHD7wDWwj9r1qxey5TCf/ToUTF2\n7Nhe41944QWRnZ1tc/74+Hjx4osv9rvfQ4cOCW9vb9He3m5ZlpycLDZs2GDp89FHH7XUzGaz8PX1\nFdXV1ZZlH3/8seUO55FHHhHPPfecpVZeXm41/JmZmWLNmjV99vXT8O/YsUPMnDmz15jMzEzx0ksv\nia6uLuHp6Wm5wxJCiLVr1/YKvzV///vfhb+/f6/fX0Z8zu9E1/OQ8ttvv0VNTQ1GjBhhWWY2m22+\nel1VVYWjR49i27Zt19XbqFGjer1GoNVq0dDQYLn9497Pnj2Lzs5OTJw40bJM/OjzYA0NDZg2bVqv\nbVlTW1uLKVOm9KvHb7/9FseOHeu1T3p6epCRkYGmpiaYzeZefWq1WpSWltrcrtFoxEMPPeS010hu\nFAy/E6lUql63/fz8cPnyZcvts2fPWn4ODw9HVFQUvvrqq+uaY9u2bZgxY4Zi4Ppy4cIFXLlyBUOG\nDAEA1NTUQK/X99m7Wq2Gj48Pzpw5A7Vafc22NBoNamtrLbdramqszhseHo7Kyso+az/dX+Hh4UhI\nSMD+/fuvGdvd3Q0PDw/U1tYiMjLS5rw/aG9vx+7du1FUVGRz7M2OL/i5UFxcHIqKitDa2orGxka8\n9tprltpdd90FHx8fbNq0CR0dHTCbzfjyyy9x4sQJxW1u27YNGRkZ1yxPS0tDdna21fWuXr2K3Nxc\ndHV1oaSkBPv378eDDz7Y51hPT09kZ2dj5cqVOHfuHIQQqKurw8GDBwEAixYtwttvv43Tp0+jvb0d\n69evtzpvVlYW8vPz8eGHH+Lq1auoq6vDmTNnAHx/J/PNN99YxiYlJaG8vBzvvfceuru70d3djdLS\nUpw5cwbe3t544IEHsG7dOly5cgX/+te/sH37dsV9BQC7d+9GUFAQ7rnnHptjb3YMvwtlZGQgOjoa\nWq0W8+bNQ2pqqqXm5eWFffv2obS0FDqdDqNHj8Zjjz2GtrY2AEBJSUmvh78AcOTIETQ1NSElJeWa\nuWpra3s9FP+psLAw+Pn5QaPRID09Hfn5+YiKirI6ftOmTdBqtbjjjjswfPhwzJ07FxUVFQCA+fPn\nY/ny5ZgxYwZuu+02JCYmWt3O3XffjTfffBO/+c1vMHz4cMyaNcvyqGHlypUoKCjAiBEjsHr1agwf\nPhx/+9vfsGPHDmg0GgQHB+OZZ55BZ2cnAOAPf/gDWltboVarkZWVhUcffbTXXHPnzsXLL7/ca5nR\naMTSpUuveZQhI5UQ/DKPm01HRwd+/vOf48svv+zzvQWHDx9Gdna25R14JCc+578J3XLLLdf92gHJ\nhw/7iSTFh/1EkuKRn0hSLn3O76PyxS3wc+WURFLpQDu6RGe/xg4o/AcOHMCKFStgNpuRnZ2Np59+\nWnH8LfDDVFXCQKYkIgXHRXG/x9r9sN9sNmP58uXYv38/Tp06hYKCApw6dcrezRGRi9kd/tLSUkRG\nRiIiIgI+Pj5ITU1FYWGhI3sjIieyO/z19fW9PlQRFhaG+vr6a8YZDAbo9Xro9Xp0o3/PRYjI+ewO\nf19nCPt6y2ROTg5MJhNMJhO84WvvdETkYHaHPywsrNcnuerq6hASEuKQpojI+ewO/5QpU1BRUYGq\nqip0dXVh586dSEpKcmRvROREdp/q8/LywpYtW3DvvffCbDYjMzMT48ePd2RvRORELn17b4AqkOf5\niZzouChGm2jp11i+vZdIUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\nMfxE
kmL4iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJMfxEkmL4iSTF8BNJiuEnkhTDTyQphp9I\nUgw/kaQYfiJJMfxEkmL4iSTF8BNJymsgK+t0Ovj7+8PT0xNeXl4wmUyO6ouInGxA4QeADz/8EKNH\nj3ZEL0TkQnzYTySpAYVfpVJh7ty5mDx5MgwGQ59jDAYD9Ho99Ho9utE5kOmIyIFUQghh78oNDQ0I\nCQlBc3MzEhMT8frrryM+Pt7q+ABVIKaqEuydjohsOC6K0SZa+jV2QEf+kJAQAEBQUBCSk5NRWlo6\nkM0RkQvZHf729nZcvHjR8vPBgwcRExPjsMaIyLnsfrW/qakJycnJAICenh786le/wrx58xzWGBE5\nl93hj4iIwMmTJx3ZCxG5EE/1EUmK4SeSFMNPJCmGn0hSDD+RpAb8wR5ZXFh2l9XamEe+Vlz3dLNa\nsd7V6a1YDy1Qrg+tu2S1drXslOK6JC8e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSfE8fz89\ntfY9q7UUv1bllf9jgJPPVC5X91y2Wtt8btYAJ79xlTZrrdb8Ng1XXNer+ISj2xl0eOQnkhTDTyQp\nhp9IUgw/kaQYfiJJMfxEkmL4iSQ1oCv2XK8b+Yo97Q9OtVo7H6t8HzryK+Vd3BqtUqz7xP6fYv3l\nmPet1hKHXFFct+jyMMX6L4Za/66AgboiuhTrxzv9FOszb+m2e+7IoscU67fl/MPubbuTy67YQ0Q3\nLoafSFIMP5GkGH4iSTH8RJJi+IkkxfATSYqf5+8nvz8fV6gNbNsBA1sdrwfPtFp7aZpOee6PlK85\n8PLMSDs66h+vK1cV637/bFSsj/p4t2J9go/16x0MrVa+FoIMbB75MzMzERQUhJiYGMuylpYWJCYm\nIioqComJiWhttfFlFkQ06NgMf0ZGBg4cONBrWV5eHhISElBRUYGEhATk5eU5rUEicg6b4Y+Pj0dg\nYGCvZYWFhUhPTwcApKenY+/evc7pjoicxq7n/E1NTdBoNAAAjUaD5uZmq2MNBgMMBgMAoBud9kxH\nRE7g9Ff7c3JyYDKZYDKZ4A1fZ09HRP1kV/jVajUaG79/JbaxsRFBQUEObYqInM+u8CclJcFoNAIA\njEYjFixY4NCmiMj5bD7nX7x4MUpKSnD+/HmEhYVh/fr1ePrpp7Fo0SK89dZbGDNmDHbt2uWKXsmK\nnrNNVmt+u63XAMBsY9t+f75gR0eO0ZR9l2J9vI/yn+9/tYyzWtO9843iuj2K1ZuDzfAXFBT0uby4\nuNjhzRCR6/DtvUSSYviJJMXwE0mK4SeSFMNPJCl+pJfcxksbrljf8uwWxbq3ylOxvmvzHKu1UY2f\nKq4rAx75iSTF8BNJiuEnkhTDTyQphp9IUgw/kaQYfiJJ8Tw/uc3pVaGK9Sm+ypcuL+9Svvx44KnL\n192TTHjkJ5IUw08kKYafSFIMP5GkGH4iSTH8RJJi+IkkxfP85FSdv5hitfb5g/9tY23lKzw9vmKF\nYn3IJ6U2ti83HvmJJMXwE0mK4SeSFMNPJCmGn0hSDD+RpBh+IknxPD85Vc191o8vw1TK5/EXVyUq\n1oceOKlYF4pVsnnkz8zMRFBQEGJiYizLcnNzERoairi4OMTFxWHfvn1ObZKIHM9m+DMyMnDgwIFr\nlq9atQplZWUoKyvD/fff75TmiMh5bIY/Pj4egYGBruiFiFzI7hf8tmzZgtjYWGRmZqK1tdXqOIPB\nAL1eD71ej2502jsdETmYXeF//PHHUVlZibKyMmg0Gjz55JNWx+bk5MBkMsFkMsHbxgc1iMh17Aq/\nWq2Gp6cnPDw8sGzZMpSW8tNTRDcau8Lf2Nho+XnPnj29zgQQ0Y3B5nn+xYsXo6SkBOfPn0dYWBjW\nr1+PkpISlJWVQaVSQafT4Y033nBFrzQIefj7K9Yfueeo1Vrb1Q7FdZs3RCjWfTv/oVgnZTbDX1BQ\ncM2yrKwspzRDRK7Dt/cSSYrhJ5IUw08kKYafSFIMP5Gk+JFeGpCK3PGK9b+O/h+rtQUVKYrr+u7j\nqTxn4pGfSFIMP5GkGH4iSTH8RJJi+IkkxfATSYrhJ5IUz/OTou/S7lSs//Ph1xTrlT3dVmuX/jNM\ncV1fNCrWaWB45CeSFMNPJCmGn0hSDD+RpBh+Ikkx/ESSYviJJMXz/JLzCg1RrK/83f8q1n1Vyn9C\nqScfsVq7dT8/r+9OPPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8RJKyeZ6/trYWS5cuxdmzZ+Hh\n4YGcnBysWLECLS0tePjhh1FdXQ2dToc//elPGDlypCt6puug8lL+L5741zrF+kPDLijW370YpFhX\n/8768eWq4prkbDaP/F5eXti0aRO++uorfPbZZ9i6dStOnTqFvLw8JCQkoKKiAgkJCcjLy3NFv0Tk\nIDbDr9FoMGnSJACAv78/oqOjUV9fj8LCQqSnpwMA0tPTsXfvXud2SkQOdV3P+aurq/HFF19g6tSp\naGpqgkajAfD9HURzc7NTGiQi5+j3e/svXbqElJQUvPrqqwgICOj3BAaDAQaDAQDQjc7r75CInKJf\nR/7u7m6kpKRgyZIlWLhwIQBArVajsfH7L1hsbGxEUFDfL/zk5OTAZDLBZDLBG74OapuIBspm+IUQ\nyMrKQnR0NFavXm1ZnpSUBKPRCAAwGo1YsGCB87okIodTCSGE0oCjR4/innvuwYQJE+Dh8f19xYYN\nGzB16lQsWrQINTU1GDNmDHbt2oXAwEDFyQJUgZiqSnBc92STarLyJbSLPtg+oO3f/cxyxfqIbZ8O\naPt0fY6LYrSJln6Ntfmcf/r06bB2/1BcXHx9nRHRoMF3+BFJiuEnkhTDTyQphp9IUgw/kaQYfiJJ\n8au7bwKet99mtZazs3BA2779beXz+Lrtnw1o++Q+PPITSYrhJ5IUw08kKYafSFIMP5GkGH4iSTH8\nRJLief6bwOknrH9l+vyhbQPadlhJl/IA5a+DoEGMR34iSTH8RJJi+IkkxfATSYrhJ5IUw08kKYaf\nSFI8z38D6Jh/h2K9eP4mhepQxzZDNw0e+YkkxfATSYrhJ5IUw08kKYafSFIMP5GkGH4iSdk8z19b\nW4ulS5fi7Nmz8PDwQE5ODlasWIHc3Fy8+eabuPXWWwEAGzZswP333+/0hmXUMM1TsT7Gy/5z+e9e\nDFKse7cpf56fn+a/cdkMv5eXFzZt2oRJkybh4sWLmDx5MhITEwEAq1atwpo1a5zeJBE5ns3wazQa\naDQaAIC/vz+io6NRX1/v9MaIyLmu6zl/dXU1vvjiC0ydOhUAsGXLFsTGxiIzMxOtra19rmMwGKDX\n66HX69GNzoF3TEQO0e/wX7p0CSkpKXj11VcREBCAxx9/HJWVlSgrK4NGo8GTTz7Z53o5OTkwmUww\nmUzwhq/DGieigelX+Lu7u5GSkoIlS5Zg4cKFAAC1Wg1PT094e
Hhg2bJlKC0tdWqjRORYNsMvhEBW\nVhaio6OxevVqy/LGxkbLz3v27EFMTIxzOiQip7D5gt+xY8ewfft2TJgwAXFxcQC+P61XUFCAsrIy\nqFQq6HQ6vPHGG05vlq7fxgu3K9Y/vVenWBeNXzqwGxpMbIZ/+vTpEH18NzvP6RPd2PgOPyJJMfxE\nkmL4iSTF8BNJiuEnkhTDTyQplejrPJ6TBKgCMVWV4KrpiKRzXBSjTbT0ayyP/ESSYviJJMXwE0mK\n4SeSFMNPJCmGn0hSDD+RpFx6iW6fUR5o1VVZbp87d87y1d+DzWDtbbD2BbA3ezmyN5/q/h/PXfom\nn5/S6/UwmUzuml7RYO1tsPYFsDd7uas3PuwnkhTDTyQpz9zc3Fx3NjB58mR3Tq9osPY2WPsC2Ju9\n3NGbW5/zE5H78GE/kaQYfiJJuSX8Bw4cwLhx4xAZGYm8vDx3tGCVTqezXKNAr9e7tZfMzEwEBQX1\nuiBKS0sLEhMTERUVhcTERKvXSHRHb7m5uQgNDUVcXBzi4uKwb98+t/RWW1uLWbNmITo6GuPHj8fm\nzZsBuH/fWevLbftNuFhPT4+IiIgQlZWVorOzU8TGxory8nJXt2GVVqsV586dc3cbQgghPvroI3Hi\nxAkxfvx4y7K1a9eKjRs3CiGE2Lhxo3jqqacGTW/r1q0Tr7zyilv6+bGGhgZx4sQJIYQQbW1tIioq\nSpSXl7t931nry137zeVH/tLSUkRGRiIiIgI+Pj5ITU1FYWGhq9u4IcTHxyMwMLDXssLCQqSnpwMA\n0tPTsXfvXne01mdvg4VGo8GkSZMA9L6svLv3nbW+3MXl4a+vr0d4eLjldlhYmFt3wE+pVCrMnTsX\nkydPhsFgcHc712hqaoJGowHw/R9Tc3OzmzvqrT+XbXelH19WfjDtO3sud+9oLg+/6OPMokqlcnUb\nVh07dgyff/459u/fj61bt+Ljjz92d0s3jP5ett1VfnpZ+cHC3svdO5rLwx8WFoba2lrL7bq6OoSE\nhLi6Dat+6CUoKAjJycmD7tLjarXacoXkxsZGBAUFubmjfxtMl223dll5d++7wXS5e5eHf8qUKaio\nqEBVVRW6urqwc+dOJCUlubqNPrW3t+PixYuWnw8ePDjoLj2elJQEo9EIADAajViwYIGbO/q3wXLZ\ndmHlsvLu3nfW+nLbfnP5S4xCiKKiIhEVFSUiIiLESy+95I4W+lRZWSliY2NFbGysuP32293eW2pq\nqggODhZeXl4iNDRU5Ofni/Pnz4vZs2eLyMhIMXv2bHHhwoVB01taWpqIiYkREyZMEPPnzxcNDQ1u\n6e3IkSMCgJgwYYKYOHGimDhxoigqKnL7vrPWl7v2G9/eSyQpvsOPSFIMP5GkGH4iSTH8RJJi+Ikk\nxfATSYrhJ5LU/wOdAGX9nfSgHgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "\u003cFigure size 600x400 with 1 Axes\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pylab as plt\n", "\n", @@ -432,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 13, "metadata": { "colab": {}, "colab_type": "code", @@ -474,13 +559,24 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 14, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "DqXBnDfJ7qxL" + "id": "DqXBnDfJ7qxL", + "outputId": "78f393f8-c4a5-41e0-abe4-ab6a5c394e51" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9624\n" + ] + } + ], "source": [ "print(evaluate_model(interpreter))" ] @@ -497,13 +593,24 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 15, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "-9cnwiPp6EGm" + "id": "-9cnwiPp6EGm", + "outputId": "d82552d7-8a2c-49dc-a19a-56010a013102" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9626\n" + ] + } + ], "source": [ "print(evaluate_model(interpreter_quant))" ] @@ -515,7 +622,6 @@ "id": "L7lfxkor8pgv" }, "source": [ - "\n", "In this example, the compressed model has no difference in the accuracy." 
] }, @@ -537,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 16, "metadata": { "colab": {}, "colab_type": "code", @@ -557,13 +663,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 17, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "LwnV4KxwVEoG" + "id": "LwnV4KxwVEoG", + "outputId": "7d50f90d-6104-43a3-863c-28db9465d483" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178509092" + ] + }, + "execution_count": 17, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "# Convert to TF Lite without quantization\n", "resnet_tflite_file = tflite_models_dir/\"resnet_v2_101.tflite\"\n", @@ -572,13 +694,29 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 18, "metadata": { - "colab": {}, + "colab": { + "height": 34 + }, "colab_type": "code", - "id": "2qkZD0VoVExe" + "id": "2qkZD0VoVExe", + "outputId": "76a47590-fa91-49b9-f568-4e00b46c9537" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "45182656" + ] + }, + "execution_count": 18, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], "source": [ "# Convert to TF Lite with quantization\n", "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", @@ -588,13 +726,28 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 19, "metadata": { - "colab": {}, + "colab": { + "height": 102 + }, "colab_type": "code", - "id": "vhOjeg1x9Knp" + "id": "vhOjeg1x9Knp", + "outputId": "c643a660-f815-49f0-ac4b-ac48af3c1203" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-rw-r-- 1 colaboratory-playground 50844828 44K Jun 23 06:04 /tmp/mnist_tflite_models/mnist_model_quant_f16.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 24K Jun 23 06:12 /tmp/mnist_tflite_models/mnist_model_quant.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 83K Jun 23 06:12 /tmp/mnist_tflite_models/mnist_model.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 44M Jun 23 06:13 /tmp/mnist_tflite_models/resnet_v2_101_quantized.tflite\n", + "-rw-rw-r-- 1 colaboratory-playground 50844828 171M Jun 23 06:12 /tmp/mnist_tflite_models/resnet_v2_101.tflite\n" + ] + } + ], "source": [ "!ls -lh {tflite_models_dir}/*.tflite" ] @@ -606,7 +759,6 @@ "id": "qqHLaqFMCjRZ" }, "source": [ - "\n", "The model size reduces from 171 MB to 43 MB.\n", "The accuracy of this model on imagenet can be evaluated using the scripts provided for [TFLite accuracy measurement](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/accuracy/ilsvrc).\n", "\n", @@ -618,7 +770,6 @@ "colab": { "collapsed_sections": [], "name": "post_training_quant.ipynb", - "private_outputs": true, "provenance": [], "toc_visible": true }, From 89851a6a7725d92735e9817ee6a1f551dd7492ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jun 2020 08:42:05 -0700 Subject: [PATCH 0886/1390] Replace deprecated thread annotations macros. 
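The Abseil thread-safety macros without the ABSL_ prefix (GUARDED_BY, EXCLUSIVE_LOCKS_REQUIRED, ...) are deprecated; the replacements live in absl/base/thread_annotations.h and behave the same under Clang's -Wthread-safety analysis. The snippet below is only an illustrative sketch, not part of this change, and the Counter class in it is hypothetical:

    #include "absl/base/thread_annotations.h"
    #include "absl/synchronization/mutex.h"

    class Counter {
     public:
      int Increment() {
        absl::MutexLock lock(&mu_);  // Acquires mu_, so touching value_ is allowed here.
        return ++value_;
      }

     private:
      absl::Mutex mu_;
      // Clang's thread-safety analysis warns if value_ is read or written
      // without holding mu_.
      int value_ ABSL_GUARDED_BY(mu_) = 0;
    };

The edits below are purely mechanical: GUARDED_BY(mu_) becomes ABSL_GUARDED_BY(mu_), with no behavior change.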
PiperOrigin-RevId: 317872415 Change-Id: I2e3fcd5cebc7a2b4d63c1322a2386484b336479c --- tensorflow/compiler/xla/pjrt/distributed/service.h | 10 +++++----- tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.h b/tensorflow/compiler/xla/pjrt/distributed/service.h index 725a76791ce..9ecbdb3cc7c 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.h +++ b/tensorflow/compiler/xla/pjrt/distributed/service.h @@ -54,15 +54,15 @@ class DistributedRuntimeServiceImpl final absl::Mutex mu_; enum class State { kInitializing, kRunning }; - State state_ GUARDED_BY(mu_) = State::kInitializing; + State state_ ABSL_GUARDED_BY(mu_) = State::kInitializing; - std::vector local_topologies_ GUARDED_BY(mu_); - GlobalTopologyProto topology_ GUARDED_BY(mu_); + std::vector local_topologies_ ABSL_GUARDED_BY(mu_); + GlobalTopologyProto topology_ ABSL_GUARDED_BY(mu_); struct Node { bool present = false; }; - int num_nodes_present_ GUARDED_BY(mu_) = 0; - std::vector nodes_ GUARDED_BY(mu_); + int num_nodes_present_ ABSL_GUARDED_BY(mu_) = 0; + std::vector nodes_ ABSL_GUARDED_BY(mu_); KeyValueStore key_value_store_; }; diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index de760af8fd9..edffaf6c877 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -169,7 +169,7 @@ class NcclIdStore { const std::shared_ptr client_; absl::Mutex mu_; - absl::flat_hash_map cache_ GUARDED_BY(mu_); + absl::flat_hash_map cache_ ABSL_GUARDED_BY(mu_); }; StatusOr NcclIdStore::GetNcclUniqueId(const NcclCliqueKey& key) { From f5a9d24c847ffcc7ae09e850aad39e2cb55ae4f3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jun 2020 08:57:59 -0700 Subject: [PATCH 0887/1390] Added shape inference for tf_device.LaunchOp. 
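A tf_device.launch region simply forwards whatever its tf_device.return terminator yields, so the op's result types can be refined from the terminator's operand types, reusing the same pass-through handling that already covers tensor casts in InferShapeForNonTFDialectOperation. The following is only a rough, hypothetical sketch of that pass-through idea; it is not the actual InferShapeForPassThroughOps helper and skips the bookkeeping the real pass performs:

    #include "llvm/ADT/STLExtras.h"
    #include "mlir/IR/Operation.h"

    // Copy each forwarded value's type onto the matching op result.
    bool RefineResultsFromForwardedValues(mlir::Operation* op,
                                          mlir::OperandRange forwarded) {
      bool changed = false;
      for (auto pair : llvm::zip(op->getResults(), forwarded)) {
        mlir::Value result = std::get<0>(pair);
        mlir::Type new_type = std::get<1>(pair).getType();
        if (result.getType() != new_type) {
          result.setType(new_type);
          changed = true;
        }
      }
      return changed;
    }

For the launch op, the forwarded range is launch_op.GetBody().getTerminator()->getOperands(), which is exactly what the change below passes to the existing helper.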
PiperOrigin-RevId: 317875384 Change-Id: Idb070c9e92d07ee19cd8ed26c1beec3de86f43df --- .../mlir/tensorflow/tests/shape_inference.mlir | 13 +++++++++++++ .../mlir/tensorflow/transforms/shape_inference.cc | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 1599d53ed15..1af4ba6b3dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -433,4 +433,17 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK: return %[[CAST_RESULT_0]], %[[CAST_RESULT_1]], %[[ADDI]] return %27, %28, %2 : tensor<*xui8>, tensor<*xi8>, tensor<*xi8> } + + // CHECK-LABEL: infer_device_launch + func @infer_device_launch(%arg0: tensor<1x8x2xi32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf_device.launch"() ({ + %2 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x8x2xi32>) -> tensor<1x8x2xf32> + tf_device.return %2 : tensor<1x8x2xf32> + // CHECK: () -> tensor<1x8x2xf32> + }) {device = "/device:CPU:0"} : () -> tensor<*xf32> + // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) + %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) + return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 7e4baadc397..33ccf5caff2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -215,6 +215,10 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { return InferShapeForPassThroughOps( tensor_cast.getOperation()->getOperands(), op, tf_dialect); } + if (auto launch_op = dyn_cast(op)) { + return InferShapeForPassThroughOps( + launch_op.GetBody().getTerminator()->getOperands(), op, tf_dialect); + } return false; } From a9e2bceddf4588f1ef0bae6fb4054111ff3d2225 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 23 Jun 2020 09:51:29 -0700 Subject: [PATCH 0888/1390] ConvConstants converted to new style. Added memory types for buffer. PiperOrigin-RevId: 317885492 Change-Id: I12dfede368bdf157f746f7823d7d47fd3807ffa1 --- tensorflow/lite/delegates/gpu/cl/arguments.cc | 16 +- tensorflow/lite/delegates/gpu/cl/buffer.cc | 14 ++ tensorflow/lite/delegates/gpu/cl/buffer.h | 3 + tensorflow/lite/delegates/gpu/cl/gpu_object.h | 3 + .../gpu/cl/kernels/conv_constants.cc | 143 ++++++++++-------- .../delegates/gpu/cl/kernels/conv_constants.h | 30 ++-- .../lite/delegates/gpu/cl/linear_storage.cc | 1 + .../lite/delegates/gpu/cl/linear_storage.h | 1 + 8 files changed, 132 insertions(+), 79 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index d26c869957c..1fd58ef2454 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -457,8 +457,20 @@ std::string Arguments::GetListOfArgs() { for (auto& t : buffers_) { const std::string type_name = t.second.data_type == DataType::FLOAT32 ? 
"float" : "half"; - AppendArgument(absl::StrCat("__global ", type_name, t.second.element_size, - "* ", t.first), + std::string memory_type; + switch (t.second.memory_type) { + case MemoryType::GLOBAL: + memory_type = "__global"; + break; + case MemoryType::CONSTANT: + memory_type = "__constant"; + break; + case MemoryType::LOCAL: + memory_type = "__local"; + break; + } + AppendArgument(absl::StrCat(memory_type, " ", type_name, + t.second.element_size, "* ", t.first), &result); } for (auto& t : image_buffers_) { diff --git a/tensorflow/lite/delegates/gpu/cl/buffer.cc b/tensorflow/lite/delegates/gpu/cl/buffer.cc index 68e85593e5d..436d8751e18 100644 --- a/tensorflow/lite/delegates/gpu/cl/buffer.cc +++ b/tensorflow/lite/delegates/gpu/cl/buffer.cc @@ -50,6 +50,7 @@ GPUResources BufferDescriptor::GetGPUResources(AccessType access_type) const { desc.data_type = element_type; desc.access_type = access_type; desc.element_size = element_size; + desc.memory_type = memory_type; resources.buffers.push_back({"buffer", desc}); return resources; } @@ -59,6 +60,8 @@ absl::Status BufferDescriptor::PerformSelector( const std::vector& template_args, std::string* result) const { if (selector == "Read") { return PerformReadSelector(args, result); + } else if (selector == "GetPtr") { + return PerformGetPtrSelector(args, result); } else { return absl::NotFoundError(absl::StrCat( "BufferDescriptor don't have selector with name - ", selector)); @@ -76,6 +79,17 @@ absl::Status BufferDescriptor::PerformReadSelector( return absl::OkStatus(); } +absl::Status BufferDescriptor::PerformGetPtrSelector( + const std::vector& args, std::string* result) const { + if (!args.empty()) { + return absl::NotFoundError( + absl::StrCat("BufferDescriptor GetPtr require zero arguments, but ", + args.size(), " was passed")); + } + *result = "buffer"; + return absl::OkStatus(); +} + Buffer::Buffer(cl_mem buffer, size_t size_in_bytes) : buffer_(buffer), size_(size_in_bytes) {} diff --git a/tensorflow/lite/delegates/gpu/cl/buffer.h b/tensorflow/lite/delegates/gpu/cl/buffer.h index 771aae9e002..0d1072040c1 100644 --- a/tensorflow/lite/delegates/gpu/cl/buffer.h +++ b/tensorflow/lite/delegates/gpu/cl/buffer.h @@ -32,6 +32,7 @@ namespace cl { struct BufferDescriptor : public GPUObjectDescriptor { DataType element_type; // FLOAT32 or FLOAT16 int element_size; + MemoryType memory_type = MemoryType::GLOBAL; absl::Status PerformSelector(const std::string& selector, const std::vector& args, @@ -41,6 +42,8 @@ struct BufferDescriptor : public GPUObjectDescriptor { GPUResources GetGPUResources(AccessType access_type) const override; absl::Status PerformReadSelector(const std::vector& args, std::string* result) const; + absl::Status PerformGetPtrSelector(const std::vector& args, + std::string* result) const; }; // Buffer represent linear GPU data storage with arbitrary data format. 
diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h index faf18b539e2..711c4726bc2 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_object.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -54,10 +54,13 @@ struct GPUImageBufferDescriptor { cl_mem memory; }; +enum class MemoryType { GLOBAL, CONSTANT, LOCAL }; + struct GPUBufferDescriptor { DataType data_type; AccessType access_type; int element_size; + MemoryType memory_type = MemoryType::GLOBAL; cl_mem memory; }; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc index d4dc206ffce..e6fc5da36a2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc @@ -27,16 +27,29 @@ namespace gpu { namespace cl { namespace { -std::string GenerateConvolutionConstantCode( - const OperationDef& op_def, const int2& kernel_size, int src_channels, - int dst_channels, bool stride_correction, const CLDevice& device, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GenerateConvolutionConstantCode(const OperationDef& op_def, + const int2& kernel_size, + int src_channels, int dst_channels, + bool stride_correction, + const CLDevice& device, + Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetTextureAddressMode(GetFastestZeroMode(device)); + if (op_def.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); + if (op_def.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + args->AddInt("stride_x"); + args->AddInt("stride_y"); + args->AddInt("padding_x"); + args->AddInt("padding_y"); + args->AddInt("dilation_x"); + args->AddInt("dilation_y"); std::string c = GetCommonDefines(op_def.precision); @@ -89,33 +102,24 @@ std::string GenerateConvolutionConstantCode( const std::string postfixes[] = {".x", ".xy", ".xyz", ""}; c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += " __constant FLT4* filters, \n"; - c += " __constant FLT4* biases"; - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int2 stride, \n"; - c += " int2 padding, \n"; - c += " int2 dilation, \n"; - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int X = get_global_id(0);\n"; c += " int Y = get_global_id(1);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) " + "return;\n"; if (stride_correction) { c += " int start_x = " + - GetXStrideCorrected("X", "src_size.w", "stride.x", "padding.x") + + GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int start_x = X * stride.x + padding.x;\n"; + c += " int start_x = X * args.stride_x + args.padding_x;\n"; } - c += " int start_y = Y * stride.y + padding.y;\n"; + c += " int start_y = Y 
* args.stride_y + args.padding_y;\n"; c += " ACCUM_FLT4 r[" + kOutZ + "];\n"; c += " for (int i = 0; i < " + kOutZ + "; ++i) {\n"; c += " r[i] = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; c += " }\n"; - const auto address_mode = GetFastestZeroMode(device); int filters_counter = 0; for (int s = 0; s < src_depth; ++s) { const int ch_count = std::min(4, src_channels - s * 4); @@ -124,27 +128,29 @@ std::string GenerateConvolutionConstantCode( const std::string s_type = absl::StrCat("FLT", s_count); const std::string s_postfix = postfixes[ch_count - 1]; for (int ky = 0; ky < kernel_size.y; ++ky) { - std::string s_y = absl::StrCat("(start_y + ", ky, " * dilation.y)"); + std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)"); if (manual_clamp) { c += " {\n"; - c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= src_size.y;\n"; + c += " bool y_out = " + s_y + " < 0 || " + s_y + + " >= args.src_tensor.Height();\n"; } for (int kx = 0; kx < kernel_size.x; ++kx) { c += " {\n"; - std::string s_x = absl::StrCat("(start_x + ", kx, " * dilation.x)"); + std::string s_x = + absl::StrCat("(start_x + ", kx, " * args.dilation_x)"); if (manual_clamp) { - c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= src_size.x;\n"; + c += " bool x_out = " + s_x + "< 0 || " + s_x + + ">= args.src_tensor.Width();\n"; c += " " + s_type + " src = x_out || y_out ?"; - c += "(" + s_type + ")(0.0) : "; - c += src_tensor.ReadWHS(s_x, s_y, std::to_string(s)) + s_postfix + - ";\n"; + c += "(" + s_type + ")(0.0) : args.src_tensor.Read(" + s_x + ", " + + s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n"; } else { - c += " " + s_type + " src = " + - src_tensor.ReadWHS(s_x, s_y, std::to_string(s), address_mode) + - s_postfix + ";\n"; + c += " " + s_type + " src = args.src_tensor.Read(" + s_x + ", " + + s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n"; } for (int d = 0; d < out_z; ++d) { - c += " " + s_conv + "(r[" + std::to_string(d) + "], src, filters,"; + c += " " + s_conv + "(r[" + std::to_string(d) + + "], src, args.weigths.GetPtr(),"; c += " " + std::to_string(filters_counter) + ");\n"; filters_counter += ch_count; } @@ -158,10 +164,9 @@ std::string GenerateConvolutionConstantCode( for (int i = 0; i < out_z; ++i) { std::string s_i = std::to_string(i); c += " {\n"; - c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + biases[" + s_i + "];\n"; - const LinkingContext context{"res", "X", "Y", s_i}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("res", "X", "Y", s_i); + c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + args.biases.Read(" + s_i + + ");\n"; + c += " args.dst_tensor.Write(res, X, Y, " + s_i + ");\n"; c += " }\n"; } c += "}\n"; @@ -191,8 +196,6 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) { ConvConstants::ConvConstants(ConvConstants&& kernel) : GPUOperation(std::move(kernel)), - weights_(std::move(kernel.weights_)), - biases_(std::move(kernel.biases_)), kernel_size_(kernel.kernel_size_), stride_(kernel.stride_), padding_(kernel.padding_), @@ -204,8 +207,6 @@ ConvConstants::ConvConstants(ConvConstants&& kernel) ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) { if (this != &kernel) { - weights_ = std::move(kernel.weights_); - biases_ = std::move(kernel.biases_); std::swap(kernel_size_, kernel.kernel_size_); std::swap(stride_, kernel.stride_); std::swap(padding_, kernel.padding_); @@ -222,9 +223,15 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) { absl::Status ConvConstants::Compile(const CreationContext& creation_context) { 
const bool stride_correction = definition_.IsBatchSupported() && stride_.x != 1; - const auto code = GenerateConvolutionConstantCode( + std::string code = GenerateConvolutionConstantCode( definition_, kernel_size_, src_channels_, dst_channels_, - stride_correction, *creation_context.device, linked_operations_); + stride_correction, *creation_context.device, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); std::vector options; if (definition_.precision == CalculationsPrecision::F16 && creation_context.device->IsAdreno3xx()) { @@ -241,20 +248,16 @@ absl::Status ConvConstants::Compile(const CreationContext& creation_context) { } absl::Status ConvConstants::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y))); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(dilation_.x * src_[0]->Batch(), dilation_.y))); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); + RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); + RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); + RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 ConvConstants::GetGridSize() const { @@ -304,12 +307,18 @@ absl::Status CreateConvConstants(const CreationContext& creation_context, *result = ConvConstants(definition, attr); RETURN_IF_ERROR( result->UploadWeights(attr.weights, creation_context.context)); - LinearStorageCreateInfo create_info; - create_info.storage_type = LinearStorageType::BUFFER; - create_info.data_type = definition.GetDataType(); - create_info.aligned_size = attr.weights.shape.o; - RETURN_IF_ERROR(CreateLinearStorage( - create_info, attr.bias, creation_context.context, &result->biases_)); + + TensorLinearDescriptor desc; + desc.storage_type = LinearStorageType::BUFFER; + desc.element_type = definition.GetDataType(); + desc.memory_type = MemoryType::CONSTANT; + + LinearStorage lt; + RETURN_IF_ERROR( + CreateLinearStorage(desc, attr.bias, creation_context.context, <)); + result->args_.AddObject("biases", AccessType::READ, + absl::make_unique(std::move(lt)), + absl::make_unique(desc)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h index 8d80d48314d..b9cc52f7e94 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h +++ 
b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h @@ -71,9 +71,6 @@ class ConvConstants : public GPUOperation { absl::Status BindArguments(); int3 GetGridSize() const; - Buffer weights_; - LinearStorage biases_; - int2 kernel_size_; int2 stride_; int2 padding_; @@ -92,21 +89,34 @@ absl::Status ConvConstants::UploadWeights( const int kernel_x = weights.shape.w; const int kernel_y = weights.shape.h; - const int float_size = - definition_.precision == CalculationsPrecision::F32 ? 4 : 2; + const bool f32_weights = definition_.precision == CalculationsPrecision::F32; + + BufferDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + desc.memory_type = MemoryType::CONSTANT; + + const int float_size = f32_weights ? 4 : 2; const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y; - if (definition_.GetDataType() == DataType::FLOAT32) { + Buffer weights_buffer; + if (f32_weights) { std::vector gpu_data(float_count / 4); RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(), - context, &weights_); + RETURN_IF_ERROR(CreateReadOnlyBuffer( + float_size * float_count, gpu_data.data(), context, &weights_buffer)); } else { std::vector gpu_data(float_count / 4); RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(), - context, &weights_); + RETURN_IF_ERROR(CreateReadOnlyBuffer( + float_size * float_count, gpu_data.data(), context, &weights_buffer)); } + + args_.AddObject("weigths", AccessType::READ, + absl::make_unique(std::move(weights_buffer)), + absl::make_unique(desc)); + + return absl::OkStatus(); } template diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc index 47504a34c2b..ee0ea3efbec 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc @@ -32,6 +32,7 @@ GPUResources TensorLinearDescriptor::GetGPUResources( desc.data_type = element_type; desc.access_type = access_type; desc.element_size = 4; + desc.memory_type = memory_type; resources.buffers.push_back({"buffer", desc}); } else { GPUImage2DDescriptor desc; diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.h b/tensorflow/lite/delegates/gpu/cl/linear_storage.h index 2c0770ef3dc..14c8460bf80 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.h +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.h @@ -40,6 +40,7 @@ enum class LinearStorageType { BUFFER, TEXTURE_2D }; struct TensorLinearDescriptor : public GPUObjectDescriptor { LinearStorageType storage_type; DataType element_type; // FLOAT32 or FLOAT16 + MemoryType memory_type = MemoryType::GLOBAL; // applicable for BUFFER absl::Status PerformSelector(const std::string& selector, const std::vector& args, From 6262e61826764c5236c1e74596583973c7248f3c Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 23 Jun 2020 09:51:51 -0700 Subject: [PATCH 0889/1390] Add more np array interop tests. 
PiperOrigin-RevId: 317885566 Change-Id: I26dc3643547f0bd834b8e913e3c6eda7847f1aea --- .../python/ops/numpy_ops/np_interop_test.py | 120 ++++++++++++++++-- 1 file changed, 106 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/ops/numpy_ops/np_interop_test.py b/tensorflow/python/ops/numpy_ops/np_interop_test.py index 17a3bf81a01..33abb58f260 100644 --- a/tensorflow/python/ops/numpy_ops/np_interop_test.py +++ b/tensorflow/python/ops/numpy_ops/np_interop_test.py @@ -93,9 +93,8 @@ class InteropTest(tf.test.TestCase): dx, dy = t.gradient([xx, yy], [x, y]) - # # TODO(nareshmodi): Figure out a way to rewrap ndarray as tensors. - # self.assertIsInstance(dx, np.ndarray) - # self.assertIsInstance(dy, np.ndarray) + self.assertIsInstance(dx, np.ndarray) + self.assertIsInstance(dy, np.ndarray) self.assertAllClose(dx, 2.0) self.assertAllClose(dy, 3.0) @@ -181,19 +180,17 @@ class InteropTest(tf.test.TestCase): multiplier = np.asarray(5.) - with strategy.scope(): + @tf.function + def run(): + ctx = tf.distribute.get_replica_context() + val = np.asarray(ctx.replica_id_in_sync_group) + return val * multiplier - @tf.function - def run(): - ctx = tf.distribute.get_replica_context() - val = np.asarray(ctx.replica_id_in_sync_group) - return val * multiplier + distributed_values = strategy.run(run) + reduced = strategy.reduce( + tf.distribute.ReduceOp.SUM, distributed_values, axis=None) - distributed_values = strategy.run(run) - reduced = strategy.reduce( - tf.distribute.ReduceOp.SUM, distributed_values, axis=None) - - values = distributed_values.values + values = strategy.experimental_local_results(distributed_values) # Note that this should match the number of virtual CPUs. self.assertLen(values, 3) @@ -208,6 +205,101 @@ class InteropTest(tf.test.TestCase): # self.assertIsInstance(reduced, np.ndarray) self.assertAllClose(reduced, 15) + def testPyFuncInterop(self): + def py_func_fn(a, b): + return a + b + + @tf.function + def fn(a, b): + result = tf.py_function(py_func_fn, [a, b], a.dtype) + return np.asarray(result) + + a = np.asarray(1.) + b = np.asarray(2.) + + result = fn(a, b) + self.assertIsInstance(result, np.ndarray) + self.assertAllClose(result, 3.) + + def testDatasetInterop(self): + values = [1, 2, 3, 4, 5, 6] + values_as_array = np.asarray(values) + + # Tensor dataset + dataset = tf.data.Dataset.from_tensors(values_as_array) + + for value, value_from_dataset in zip([values_as_array], dataset): + self.assertIsInstance(value_from_dataset, np.ndarray) + self.assertAllEqual(value_from_dataset, value) + + # Tensor slice dataset + dataset = tf.data.Dataset.from_tensor_slices(values_as_array) + + for value, value_from_dataset in zip(values, dataset): + self.assertIsInstance(value_from_dataset, np.ndarray) + self.assertAllEqual(value_from_dataset, value) + + # # TODO(nareshmodi): as_numpy_iterator() doesn't work. + # items = list(dataset.as_numpy_iterator()) + + # Map over a dataset. + dataset = dataset.map(lambda x: np.add(x, 1)) + + for value, value_from_dataset in zip(values, dataset): + self.assertIsInstance(value_from_dataset, np.ndarray) + self.assertAllEqual(value_from_dataset, value + 1) + + # Batch a dataset. + dataset = tf.data.Dataset.from_tensor_slices(values_as_array).batch(2) + + for value, value_from_dataset in zip([[1, 2], [3, 4], [5, 6]], dataset): + self.assertIsInstance(value_from_dataset, np.ndarray) + self.assertAllEqual(value_from_dataset, value) + + def testKerasInterop(self): + # Return an ndarray from the model. 
+ inputs = tf.keras.layers.Input(shape=(10,)) + output_layer = tf.keras.layers.Lambda(np.square)(inputs) + model = tf.keras.Model([inputs], output_layer) + + values = onp.arange(10, dtype=onp.float32) + values_as_array = np.asarray(values) + + result = model(values) + self.assertIsInstance(result, np.ndarray) + self.assertAllClose(result, onp.square(values)) + + result = model(values_as_array) + self.assertIsInstance(result, np.ndarray) + self.assertAllClose(result, onp.square(values)) + + def testPForInterop(self): + def outer_product(a): + return np.tensordot(a, a, 0) + + batch_size = 100 + a = np.ones((batch_size, 32, 32)) + c = tf.vectorized_map(outer_product, a) + + # # TODO(nareshmodi): vectorized_map doesn't rewrap tensors in ndarray. + # self.assertIsInstance(c, np.ndarray) + self.assertEqual(c.shape, (batch_size, 32, 32, 32, 32)) + + def testJacobian(self): + with tf.GradientTape() as g: + x = np.asarray([1., 2.]) + y = np.asarray([3., 4.]) + g.watch(x) + g.watch(y) + z = x * x * y + + jacobian = g.jacobian(z, [x, y]) + answer = [tf.linalg.diag(2 * x * y), tf.linalg.diag(x * x)] + + self.assertIsInstance(jacobian[0], np.ndarray) + self.assertIsInstance(jacobian[1], np.ndarray) + self.assertAllClose(jacobian, answer) + class FunctionTest(InteropTest): From d34bb0c10657b29322b231b08a818738d4eb010f Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 23 Jun 2020 09:52:04 -0700 Subject: [PATCH 0890/1390] DepthwiseConvolution converted to new style. DepthwiseConvolution3D merged into DepthwiseConvolution. PiperOrigin-RevId: 317885608 Change-Id: Ifead6b0998c75d70f2be18ba2d32936e9e8ecdf0 --- .../lite/delegates/gpu/cl/kernels/BUILD | 23 -- .../gpu/cl/kernels/depthwise_conv.cc | 328 ++++++++++------- .../delegates/gpu/cl/kernels/depthwise_conv.h | 152 +++++++- .../gpu/cl/kernels/depthwise_conv_3d.cc | 338 ------------------ .../gpu/cl/kernels/depthwise_conv_3d.h | 170 --------- 5 files changed, 337 insertions(+), 674 deletions(-) delete mode 100644 tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.cc delete mode 100644 tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 24a9a962296..21fb65e8909 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -594,29 +594,6 @@ cc_library( ], ) -cc_library( - name = "depthwise_conv_3d", - srcs = ["depthwise_conv_3d.cc"], - hdrs = ["depthwise_conv_3d.h"], - deps = [ - ":gpu_operation", - ":util", - ":work_group_picking", - "//tensorflow/lite/delegates/gpu/cl:buffer", - "//tensorflow/lite/delegates/gpu/cl:cl_device", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:texture2d", - "//tensorflow/lite/delegates/gpu/cl:util", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - ], -) - cc_test( name = "depthwise_conv_test", srcs = ["depthwise_conv_test.cc"], diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc index 4c5e20abde3..de1a04befa8 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc +++ 
b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc @@ -34,57 +34,71 @@ bool IsSpecializedCase(int channel_multiplier) { channel_multiplier == 4; } -std::string GetSrcValue(const TensorCodeGenerator& src_tensor, - int channel_multiplier, - TextureAddressMode address_mode) { +std::string GetSrcValue(int channel_multiplier, const std::string coords) { std::string c; if (channel_multiplier == 1) { - c += " FLT4 src_final =" + - src_tensor.ReadWHS("x_c", "y_c", "Z", address_mode) + ";\n"; + c += " FLT4 src_final = args.src_tensor.Read(" + coords + ", S);\n"; } else if (channel_multiplier == 2) { - c += " int z_layer = Z / 2;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHS("x_c", "y_c", "z_layer", address_mode) + ";\n"; - c += " FLT2 t0 = Z % 2 == 0 ? src.xy : src.zw;\n"; + c += " int s_layer = S / 2;\n"; + c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n"; + c += " FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n"; c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n"; } else if (channel_multiplier == 4) { - c += " int z_layer = Z / 4;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHS("x_c", "y_c", "z_layer", address_mode) + ";\n"; + c += " int s_layer = S / 4;\n"; + c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n"; c += " FLT t0 = src.x;\n"; - c += " int reminder = Z % 4;\n"; + c += " int reminder = S % 4;\n"; c += " if (reminder == 1) t0 = src.y;\n"; c += " if (reminder == 2) t0 = src.z;\n"; c += " if (reminder == 3) t0 = src.w;\n"; c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n"; } else { - c += " int z_layer = Z / channel_multiplier;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHS("x_c", "y_c", "z_layer", address_mode) + ";\n"; - c += " int z_offset = (Z % channel_multiplier) * 4;\n"; + c += " int s_layer = S / args.ch_multiplier;\n"; + c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n"; + c += " int s_offset = (S % args.ch_multiplier) * 4;\n"; c += " FLT4 src_final;\n"; c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n"; - c += " src_final.x = temp_arr[(z_offset + 0) / channel_multiplier];\n"; - c += " src_final.y = temp_arr[(z_offset + 1) / channel_multiplier];\n"; - c += " src_final.z = temp_arr[(z_offset + 2) / channel_multiplier];\n"; - c += " src_final.w = temp_arr[(z_offset + 3) / channel_multiplier];\n"; + c += " src_final.x = temp_arr[(s_offset + 0) / args.ch_multiplier];\n"; + c += " src_final.y = temp_arr[(s_offset + 1) / args.ch_multiplier];\n"; + c += " src_final.z = temp_arr[(s_offset + 2) / args.ch_multiplier];\n"; + c += " src_final.w = temp_arr[(s_offset + 3) / args.ch_multiplier];\n"; } return c; } std::string GenerateDepthwiseConvolutionCode( - const OperationDef& op_def, bool stride_correction, - const LinearStorage& biases, int channel_multiplier, - bool weights_are_buffer, - const std::vector& linked_operations, - const CLDevice& device) { - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); + const OperationDef& op_def, bool stride_correction, int channel_multiplier, + bool weights_are_buffer, const CLDevice& device, Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetTextureAddressMode(GetFastestZeroMode(device)); + if (op_def.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("src_tensor", AccessType::READ, 
std::move(src_desc)); + auto dst_desc = absl::make_unique(op_def.dst_tensors[0]); + if (op_def.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + args->AddInt("kernel_size_x"); + args->AddInt("stride_x"); + args->AddInt("padding_x"); + args->AddInt("dilation_x"); + args->AddInt("kernel_size_y"); + args->AddInt("stride_y"); + args->AddInt("padding_y"); + args->AddInt("dilation_y"); + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + args->AddInt("kernel_size_z"); + args->AddInt("stride_z"); + args->AddInt("padding_z"); + args->AddInt("dilation_z"); + } + if (!IsSpecializedCase(channel_multiplier)) { + args->AddInt("ch_multiplier"); + } + const auto src_tensor_type = op_def.src_tensors[0].storage_type; std::string c = GetCommonDefines(op_def.precision); @@ -93,86 +107,110 @@ std::string GenerateDepthwiseConvolutionCode( src_tensor_type == TensorStorageType::IMAGE_BUFFER; c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - if (weights_are_buffer) { - c += " __global FLT4* filters, \n"; - } else { - c += " __read_only image2d_t filters, \n"; - } - c += biases.GetDeclaration(); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int2 kernel_size, \n"; - c += " int2 stride, \n"; - c += " int2 padding, \n"; - c += " int2 dilation, \n"; - if (!IsSpecializedCase(channel_multiplier)) { - c += " int channel_multiplier, \n"; - } - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; c += " int X = get_global_id(0);\n"; c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int linear_id_2 = get_global_id(2);\n"; + c += " int S = linear_id_2 / args.dst_tensor.Depth();\n"; + c += " int Z = linear_id_2 % args.dst_tensor.Depth();\n"; + } else { + c += " int S = get_global_id(2);\n"; + } + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " + "S >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; if (stride_correction) { c += " int x_offseted = " + - GetXStrideCorrected("X", "src_size.w", "stride.x", "padding.x") + + GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int x_offseted = X * stride.x + padding.x;\n"; + c += " int x_offseted = X * args.stride_x + args.padding_x;\n"; + } + c += " int y_offseted = Y * args.stride_y + args.padding_y;\n"; + std::string weights_offset = "args.kernel_size_x * args.kernel_size_y"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int z_offseted = Z * args.stride_z + args.padding_z;\n"; + weights_offset += " * args.kernel_size_z"; } - c += " int y_offseted = Y * stride.y + padding.y;\n"; if (weights_are_buffer) { - c += " int fx_c = Z * kernel_size.x * kernel_size.y;\n"; + c += " int fx_c = S * " + weights_offset + ";\n"; } else { c += " int fx_c = 0;\n"; } + std::string flat_coords = "x_c, y_c"; if (manual_clamp) { - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = y_offseted + ky * dilation.y;\n"; - c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * 
dilation.x;\n"; - c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n"; - c += " if (!outside_x && !outside_y) {\n"; - if (weights_are_buffer) { - c += " FLT4 f = filters[fx_c];\n"; - } else { - c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z));\n"; + std::string check = "!outside_x && !outside_y"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + check += " && !outside_z"; + flat_coords += ", z_c"; + c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " int z_c = z_offseted + kz * args.dilation_z;\n"; + c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n"; } - c += GetSrcValue(src_tensor, channel_multiplier, - TextureAddressMode::DONT_CARE); + c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; + c += " int y_c = y_offseted + ky * args.dilation_y;\n"; + c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n"; + c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; + c += " int x_c = x_offseted + kx * args.dilation_x;\n"; + c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n"; + c += " if (" + check + ") {\n"; + if (weights_are_buffer) { + c += " FLT4 f = args.weights.Read(fx_c);\n"; + } else { + c += " FLT4 f = args.weights.Read(fx_c, S);\n"; + } + c += GetSrcValue(channel_multiplier, flat_coords); c += " r += TO_ACCUM_TYPE(src_final * f);\n"; c += " };\n"; c += " fx_c++;\n"; c += " }\n"; c += " }\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " }\n"; + } } else { // Texture types with ZERO clamping - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = y_offseted + ky * dilation.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * dilation.x;\n"; - const auto access_mode = GetFastestZeroMode(device); - c += GetSrcValue(src_tensor, channel_multiplier, access_mode); + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + flat_coords += ", z_c"; + c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " int z_c = z_offseted + kz * args.dilation_z;\n"; + if (src_tensor_type != + TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping + // in DEPTH dimension + c += " if (z_c < 0 || z_c >= args.src_tensor.Depth()) {\n"; + c += " fx_c += args.kernel_size_y * args.kernel_size_x;\n"; + c += " continue;\n"; + c += " }\n"; + } + } + c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; + c += " int y_c = y_offseted + ky * args.dilation_y;\n"; + c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; + c += " int x_c = x_offseted + kx * args.dilation_x;\n"; + c += GetSrcValue(channel_multiplier, flat_coords); if (weights_are_buffer) { - c += " FLT4 f = filters[fx_c];\n"; + c += " FLT4 f = args.weights.Read(fx_c);\n"; } else { - c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z));\n"; + c += " FLT4 f = args.weights.Read(fx_c, S);\n"; } c += " fx_c++;\n"; c += " r += TO_ACCUM_TYPE(src_final * f);\n"; c += " }\n"; c += " }\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " }\n"; + } + } + c += " FLT4 res0 = TO_FLT4(r) + args.biases.Read(S);\n"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " args.dst_tensor.Write(res0, X, Y, Z, S);\n"; + } else { + c += " args.dst_tensor.Write(res0, X, Y, S);\n"; } - c += " FLT4 bias_val = " + biases.ReadLinearFLT4("Z") + ";\n"; - c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n"; - const LinkingContext context{"res0", "X", "Y", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + 
dst_tensor.WriteWHS("res0", "X", "Y", "Z") + "\n"; c += "}\n"; return c; @@ -184,20 +222,30 @@ DepthwiseConvolution::DepthwiseConvolution( const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer) : GPUOperation(definition), weights_are_buffer_(weights_are_buffer), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h), - stride_(attr.strides.w, attr.strides.h), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), - dilation_(attr.dilations.w, attr.dilations.h), + kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 0, 0), + stride_(attr.strides.w, attr.strides.h, 0, 0), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), + dilation_(attr.dilations.w, attr.dilations.h, 0, 0), + channel_multiplier_(attr.weights.shape.o), + work_group_size_(8, 8, 1) {} + +DepthwiseConvolution::DepthwiseConvolution( + const OperationDef& definition, + const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer) + : GPUOperation(definition), + weights_are_buffer_(weights_are_buffer), + kernel_size_(attr.weights.shape.w, attr.weights.shape.h, + attr.weights.shape.d, 0), + stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, + -attr.padding.prepended.d, 0), + dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 0), channel_multiplier_(attr.weights.shape.o), work_group_size_(8, 8, 1) {} DepthwiseConvolution::DepthwiseConvolution(DepthwiseConvolution&& operation) : GPUOperation(std::move(operation)), weights_are_buffer_(operation.weights_are_buffer_), - weights_tex2d_(std::move(operation.weights_tex2d_)), - weights_buf_(std::move(operation.weights_buf_)), - weights_(operation.weights_), - biases_(std::move(operation.biases_)), kernel_size_(operation.kernel_size_), stride_(operation.stride_), padding_(operation.padding_), @@ -210,10 +258,6 @@ DepthwiseConvolution& DepthwiseConvolution::operator=( DepthwiseConvolution&& operation) { if (this != &operation) { std::swap(weights_are_buffer_, operation.weights_are_buffer_); - weights_tex2d_ = std::move(operation.weights_tex2d_); - weights_buf_ = std::move(operation.weights_buf_); - std::swap(weights_, operation.weights_); - biases_ = std::move(operation.biases_); std::swap(kernel_size_, operation.kernel_size_); std::swap(stride_, operation.stride_); std::swap(padding_, operation.padding_); @@ -230,39 +274,48 @@ absl::Status DepthwiseConvolution::Compile( const CreationContext& creation_context) { const bool stride_correction = definition_.IsBatchSupported() && stride_.x != 1; - const auto code = GenerateDepthwiseConvolutionCode( - definition_, stride_correction, biases_, channel_multiplier_, - weights_are_buffer_, linked_operations_, *creation_context.device); + std::string code = GenerateDepthwiseConvolutionCode( + definition_, stride_correction, channel_multiplier_, weights_are_buffer_, + *creation_context.device, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status DepthwiseConvolution::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_)); - 
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_)); - RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y))); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(dilation_.x * src_[0]->Batch(), dilation_.y))); - if (!IsSpecializedCase(channel_multiplier_)) { - RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_))); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); + RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); + RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); + RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); + RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); + RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); + if (definition_.dst_tensors[0].HasAxis(Axis::DEPTH)) { + RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z)); + RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z)); + RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z)); + RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z)); } - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - return absl::OkStatus(); + if (!IsSpecializedCase(channel_multiplier_)) { + RETURN_IF_ERROR(args_.SetInt("ch_multiplier", channel_multiplier_)); + } + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 DepthwiseConvolution::GetGridSize() const { const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Slices(); + const int grid_z = dst_[0]->Slices() * dst_[0]->Depth(); return int3(grid_x, grid_y, grid_z); } @@ -284,14 +337,41 @@ absl::Status CreateDepthwiseConvolution( *result = DepthwiseConvolution(definition, attr, weights_are_buffer); RETURN_IF_ERROR( result->UploadWeights(attr.weights, creation_context.context)); - LinearStorageCreateInfo create_info; - create_info.storage_type = weights_are_buffer ? LinearStorageType::BUFFER - : LinearStorageType::TEXTURE_2D; - create_info.data_type = definition.GetDataType(); - create_info.name = "biases"; - create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i; - RETURN_IF_ERROR(CreateLinearStorage( - create_info, attr.bias, creation_context.context, &result->biases_)); + + TensorLinearDescriptor desc; + desc.storage_type = weights_are_buffer ? 
LinearStorageType::BUFFER + : LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + + LinearStorage lt; + RETURN_IF_ERROR( + CreateLinearStorage(desc, attr.bias, creation_context.context, <)); + result->args_.AddObject("biases", AccessType::READ, + absl::make_unique(std::move(lt)), + absl::make_unique(desc)); + return absl::OkStatus(); +} + +absl::Status CreateDepthwiseConvolution( + const CreationContext& creation_context, const OperationDef& definition, + const DepthwiseConvolution3DAttributes& attr, + DepthwiseConvolution* result) { + bool weights_are_buffer = creation_context.device->IsMali(); + *result = DepthwiseConvolution(definition, attr, weights_are_buffer); + RETURN_IF_ERROR( + result->UploadWeights(attr.weights, creation_context.context)); + + TensorLinearDescriptor desc; + desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER + : LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + + LinearStorage lt; + RETURN_IF_ERROR( + CreateLinearStorage(desc, attr.bias, creation_context.context, <)); + result->args_.AddObject("biases", AccessType::READ, + absl::make_unique(std::move(lt)), + absl::make_unique(desc)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h index 7655f2abae0..30cd3d06a5a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h @@ -54,9 +54,17 @@ class DepthwiseConvolution : public GPUOperation { const CreationContext& creation_context, const OperationDef& definition, const DepthwiseConvolution2DAttributes& attr, DepthwiseConvolution* result); + friend absl::Status CreateDepthwiseConvolution( + const CreationContext& creation_context, const OperationDef& definition, + const DepthwiseConvolution3DAttributes& attr, + DepthwiseConvolution* result); DepthwiseConvolution(const OperationDef& definition, const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer); + DepthwiseConvolution(const OperationDef& definition, + const DepthwiseConvolution3DAttributes& attr, + bool weights_are_buffer); + template absl::Status UploadWeights(const tflite::gpu::Tensor& weights, CLContext* context); @@ -65,20 +73,23 @@ class DepthwiseConvolution : public GPUOperation { void RearrangeWeightsData(const tflite::gpu::Tensor& weights, absl::Span dst); + template + absl::Status UploadWeights(const tflite::gpu::Tensor& weights, + CLContext* context); + + template + void RearrangeWeightsData(const tflite::gpu::Tensor& weights, + absl::Span dst); + absl::Status BindArguments(); int3 GetGridSize() const; bool weights_are_buffer_; - Texture2D weights_tex2d_; - Buffer weights_buf_; - cl_mem weights_; - LinearStorage biases_; - - int2 kernel_size_; - int2 stride_; - int2 padding_; - int2 dilation_; + int4 kernel_size_; + int4 stride_; + int4 padding_; + int4 dilation_; int channel_multiplier_; CLKernel kernel_; @@ -89,26 +100,28 @@ template absl::Status DepthwiseConvolution::UploadWeights( const tflite::gpu::Tensor& weights, CLContext* context) { const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_depth = DivideRoundUp(dst_channels, 4); + const int dst_slices = DivideRoundUp(dst_channels, 4); const int kernel_x = weights.shape.w; const int kernel_y = weights.shape.h; - const int elements_count = kernel_x * kernel_y * dst_depth; + const int elements_count = kernel_x * kernel_y * dst_slices; const bool fp32_weights = 
definition_.precision == CalculationsPrecision::F32; const int float4_size = fp32_weights ? 16 : 8; + Texture2D weights_tex2d; + Buffer weights_buf; if (fp32_weights) { std::vector gpu_data(elements_count); RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); if (weights_are_buffer_) { RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), context, - &weights_buf_)); + &weights_buf)); } else { RETURN_IF_ERROR(CreateTexture2DRGBA( - definition_.GetDataType(), kernel_x * kernel_y, dst_depth, - gpu_data.data(), context, &weights_tex2d_)); + definition_.GetDataType(), kernel_x * kernel_y, dst_slices, + gpu_data.data(), context, &weights_tex2d)); } } else { std::vector gpu_data(elements_count); @@ -116,18 +129,27 @@ absl::Status DepthwiseConvolution::UploadWeights( if (weights_are_buffer_) { RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), context, - &weights_buf_)); + &weights_buf)); } else { RETURN_IF_ERROR(CreateTexture2DRGBA( - definition_.GetDataType(), kernel_x * kernel_y, dst_depth, - gpu_data.data(), context, &weights_tex2d_)); + definition_.GetDataType(), kernel_x * kernel_y, dst_slices, + gpu_data.data(), context, &weights_tex2d)); } } if (weights_are_buffer_) { - weights_ = weights_buf_.GetMemoryPtr(); + BufferDescriptor desc; + desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_buf)), + absl::make_unique(desc)); } else { - weights_ = weights_tex2d_.GetMemoryPtr(); + Texture2DDescriptor desc; + desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_tex2d)), + absl::make_unique(desc)); } return absl::OkStatus(); @@ -162,6 +184,98 @@ void DepthwiseConvolution::RearrangeWeightsData( } } +template +absl::Status DepthwiseConvolution::UploadWeights( + const tflite::gpu::Tensor& weights, CLContext* context) { + const int dst_channels = weights.shape.i * weights.shape.o; + const int dst_slices = DivideRoundUp(dst_channels, 4); + const int kernel_x = weights.shape.w; + const int kernel_y = weights.shape.h; + const int kernel_z = weights.shape.d; + + const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices; + + const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; + const int float4_size = fp32_weights ? 16 : 8; + + Texture2D weights_tex2d; + Buffer weights_buf; + if (fp32_weights) { + std::vector gpu_data(elements_count); + RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); + if (weights_are_buffer_) { + RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, + gpu_data.data(), context, + &weights_buf)); + } else { + RETURN_IF_ERROR(CreateTexture2DRGBA( + definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices, + gpu_data.data(), context, &weights_tex2d)); + } + } else { + std::vector gpu_data(elements_count); + RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); + if (weights_are_buffer_) { + RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, + gpu_data.data(), context, + &weights_buf)); + } else { + RETURN_IF_ERROR(CreateTexture2DRGBA( + definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices, + gpu_data.data(), context, &weights_tex2d)); + } + } + + if (weights_are_buffer_) { + BufferDescriptor desc; + desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_buf)), + absl::make_unique(desc)); + } else { + Texture2DDescriptor desc; + desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_tex2d)), + absl::make_unique(desc)); + } + + return absl::OkStatus(); +} + +template +void DepthwiseConvolution::RearrangeWeightsData( + const tflite::gpu::Tensor& weights, absl::Span dst) { + const int dst_channels = weights.shape.i * weights.shape.o; + const int dst_slices = DivideRoundUp(dst_channels, 4); + const int kernel_x = weights.shape.w; + const int kernel_y = weights.shape.h; + const int kernel_z = weights.shape.d; + + int counter = 0; + for (int d = 0; d < dst_slices; ++d) { + for (int z = 0; z < kernel_z; ++z) { + for (int y = 0; y < kernel_y; ++y) { + for (int x = 0; x < kernel_x; ++x) { + T filter_val; + for (int i = 0; i < 4; ++i) { + const int d_ch = d * 4 + i; + if (d_ch < dst_channels) { + const int f_index = weights.shape.LinearIndex( + {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o}); + filter_val[i] = weights.data[f_index]; + } else { + filter_val[i] = 0.0f; + } + } + dst[counter++] = filter_val; + } + } + } + } +} + absl::Status CreateDepthwiseConvolution( const CreationContext& creation_context, const OperationDef& definition, const DepthwiseConvolution2DAttributes& attr, DepthwiseConvolution* result); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.cc deleted file mode 100644 index f9926a9f466..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.cc +++ /dev/null @@ -1,338 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h" - -#include -#include -#include - -#include "tensorflow/lite/delegates/gpu/cl/cl_device.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" - -namespace tflite { -namespace gpu { -namespace cl { -namespace { - -bool IsSpecializedCase(int channel_multiplier) { - return channel_multiplier == 1 || channel_multiplier == 2 || - channel_multiplier == 4; -} - -std::string GetSrcValue(const TensorCodeGenerator& src_tensor, - int channel_multiplier, - TextureAddressMode address_mode) { - std::string c; - if (channel_multiplier == 1) { - c += " FLT4 src_final =" + - src_tensor.ReadWHDS("x_c", "y_c", "z_c", "S", address_mode) + ";\n"; - } else if (channel_multiplier == 2) { - c += " int z_layer = S / 2;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + - ";\n"; - c += " FLT2 t0 = S % 2 == 0 ? 
src.xy : src.zw;\n"; - c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n"; - } else if (channel_multiplier == 4) { - c += " int z_layer = S / 4;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + - ";\n"; - c += " FLT t0 = src.x;\n"; - c += " int reminder = S % 4;\n"; - c += " if (reminder == 1) t0 = src.y;\n"; - c += " if (reminder == 2) t0 = src.z;\n"; - c += " if (reminder == 3) t0 = src.w;\n"; - c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n"; - } else { - c += " int z_layer = S / channel_multiplier;\n"; - c += " FLT4 src =" + - src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + - ";\n"; - c += " int z_offset = (S % channel_multiplier) * 4;\n"; - c += " FLT4 src_final;\n"; - c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n"; - c += " src_final.x = temp_arr[(z_offset + 0) / " - "channel_multiplier];\n"; - c += " src_final.y = temp_arr[(z_offset + 1) / " - "channel_multiplier];\n"; - c += " src_final.z = temp_arr[(z_offset + 2) / " - "channel_multiplier];\n"; - c += " src_final.w = temp_arr[(z_offset + 3) / " - "channel_multiplier];\n"; - } - - return c; -} - -std::string GenerateDepthwiseConvolution3DCode( - const OperationDef& op_def, bool stride_correction, - const LinearStorage& biases, int channel_multiplier, - bool weights_are_buffer, - const std::vector& linked_operations, - const CLDevice& device) { - TensorCodeGenerator src_tensor( - "src_data", - WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - - std::string c = GetCommonDefines(op_def.precision); - - const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER || - src_tensor_type == TensorStorageType::IMAGE_BUFFER; - - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - if (weights_are_buffer) { - c += " __global FLT4* filters, \n"; - } else { - c += " __read_only image2d_t filters, \n"; - } - c += biases.GetDeclaration(); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 kernel_size, \n"; - c += " int4 stride, \n"; - c += " int4 padding, \n"; - c += " int4 dilation, \n"; - if (!IsSpecializedCase(channel_multiplier)) { - c += " int channel_multiplier, \n"; - } - if (op_def.IsBatchSupported()) { - c += " int batch_size, \n"; - } - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int linear_id_z = get_global_id(2);\n"; - c += " int S = linear_id_z % dst_size.w;\n"; - c += " int Z = linear_id_z / dst_size.w;\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; - c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - if (stride_correction) { - c += " int x_offseted = " + - GetXStrideCorrected("X", "batch_size", "stride.x", "padding.x") + - ";\n"; - } else { - c += " int x_offseted = X * stride.x + padding.x;\n"; - } - c += " int y_offseted = Y * stride.y + padding.y;\n"; - c += " int z_offseted = Z * stride.z + padding.z;\n"; - if (weights_are_buffer) { - c += " int fx_c = S * kernel_size.x * kernel_size.y * kernel_size.z;\n"; - } else { - c += " int fx_c = 0;\n"; - } - - if (manual_clamp) { - c += " for (int kz = 0; kz < 
kernel_size.z; ++kz) {\n"; - c += " int z_c = z_offseted + kz * dilation.z;\n"; - c += " bool outside_z = z_c < 0 || z_c >= src_size.z;\n"; - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = y_offseted + ky * dilation.y;\n"; - c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * dilation.x;\n"; - c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n"; - c += " if (!outside_x && !outside_y && !outside_z) {\n"; - if (weights_are_buffer) { - c += " FLT4 f = filters[fx_c];\n"; - } else { - c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, " - "S));\n"; - } - c += GetSrcValue(src_tensor, channel_multiplier, - TextureAddressMode::DONT_CARE); - c += " r += TO_ACCUM_TYPE(src_final * f);\n"; - c += " };\n"; - c += " fx_c++;\n"; - c += " }\n"; - c += " }\n"; - c += " }\n"; - } else { // Texture types with ZERO clamping - c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n"; - c += " int z_c = z_offseted + kz * dilation.z;\n"; - if (src_tensor_type != - TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping - // in DEPTH dimension - c += " if (z_c < 0 || z_c >= src_size.z) {\n"; - c += " fx_c += kernel_size.y * kernel_size.x;\n"; - c += " continue;\n"; - c += " }\n"; - } - c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n"; - c += " int y_c = y_offseted + ky * dilation.y;\n"; - c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * dilation.x;\n"; - const auto access_mode = GetFastestZeroMode(device); - c += GetSrcValue(src_tensor, channel_multiplier, access_mode); - if (weights_are_buffer) { - c += " FLT4 f = filters[fx_c];\n"; - } else { - c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, S));\n"; - } - c += " fx_c++;\n"; - c += " r += TO_ACCUM_TYPE(src_final * f);\n"; - c += " }\n"; - c += " }\n"; - c += " }\n"; - } - c += " FLT4 bias_val = " + biases.ReadLinearFLT4("S") + ";\n"; - c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n"; - const LinkingContext context{"res0", "X", "Y", "S"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHDS("res0", "X", "Y", "Z", "S") + "\n"; - c += "}\n"; - return c; -} -} // namespace - -DepthwiseConvolution3D::DepthwiseConvolution3D( - const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, const CLDevice& device) - : GPUOperation(definition), - weights_are_buffer_(device.IsMali()), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, - attr.weights.shape.d), - stride_(attr.strides.w, attr.strides.h, attr.strides.d), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, - -attr.padding.prepended.d), - dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d), - channel_multiplier_(attr.weights.shape.o), - work_group_size_(8, 8, 1) {} - -DepthwiseConvolution3D::DepthwiseConvolution3D( - DepthwiseConvolution3D&& operation) - : GPUOperation(std::move(operation)), - weights_tex2d_(std::move(operation.weights_tex2d_)), - weights_buf_(std::move(operation.weights_buf_)), - weights_are_buffer_(operation.weights_are_buffer_), - biases_(std::move(operation.biases_)), - kernel_size_(operation.kernel_size_), - stride_(operation.stride_), - padding_(operation.padding_), - dilation_(operation.dilation_), - channel_multiplier_(operation.channel_multiplier_), - kernel_(std::move(operation.kernel_)), - work_group_size_(operation.work_group_size_) {} - -DepthwiseConvolution3D& DepthwiseConvolution3D::operator=( 
- DepthwiseConvolution3D&& operation) { - if (this != &operation) { - weights_tex2d_ = std::move(operation.weights_tex2d_); - weights_buf_ = std::move(operation.weights_buf_); - std::swap(weights_are_buffer_, operation.weights_are_buffer_); - biases_ = std::move(operation.biases_); - std::swap(kernel_size_, operation.kernel_size_); - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(dilation_, operation.dilation_); - std::swap(channel_multiplier_, operation.channel_multiplier_); - kernel_ = std::move(operation.kernel_); - std::swap(work_group_size_, operation.work_group_size_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -absl::Status DepthwiseConvolution3D::Compile( - const CreationContext& creation_context) { - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - const auto code = GenerateDepthwiseConvolution3DCode( - definition_, stride_correction, biases_, channel_multiplier_, - weights_are_buffer_, linked_operations_, *creation_context.device); - return creation_context.cache->GetOrCreateCLKernel( - code, "main_function", *creation_context.context, - *creation_context.device, &kernel_); -} - -absl::Status DepthwiseConvolution3D::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - if (weights_are_buffer_) { - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr())); - } else { - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_tex2d_.GetMemoryPtr())); - } - RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto( - int4(kernel_size_.x, kernel_size_.y, kernel_size_.z, 1))); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1))); - RETURN_IF_ERROR(kernel_.SetBytesAuto( - int4(padding_.x * src_[0]->Batch(), padding_.y, padding_.z, 1))); - RETURN_IF_ERROR(kernel_.SetBytesAuto( - int4(dilation_.x * src_[0]->Batch(), dilation_.y, dilation_.z, 1))); - if (!IsSpecializedCase(channel_multiplier_)) { - RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_))); - } - if (definition_.IsBatchSupported()) { - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch())); - } - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS())); - return absl::OkStatus(); -} - -int3 DepthwiseConvolution3D::GetGridSize() const { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Slices() * dst_[0]->Depth(); - return int3(grid_x, grid_y, grid_z); -} - -absl::Status DepthwiseConvolution3D::Tune(const TuningParameters& params) { - RETURN_IF_ERROR(BindArguments()); - return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_); -} - -absl::Status DepthwiseConvolution3D::AddToQueue(CLCommandQueue* queue) { - RETURN_IF_ERROR(BindArguments()); - return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_); -} - -absl::Status CreateDepthwiseConvolution3D( - const CreationContext& creation_context, const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, - DepthwiseConvolution3D* result) { - *result = DepthwiseConvolution3D(definition, attr, *creation_context.device); - RETURN_IF_ERROR( - result->UploadWeights(attr.weights, 
creation_context.context)); - LinearStorageCreateInfo create_info; - create_info.storage_type = - DeduceLinearStorageType(definition.GetPrimaryStorageType()); - create_info.data_type = definition.GetDataType(); - create_info.name = "biases"; - create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i; - RETURN_IF_ERROR(CreateLinearStorage( - create_info, attr.bias, creation_context.context, &result->biases_)); - return absl::OkStatus(); -} - -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h deleted file mode 100644 index 3c87ba5832c..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3D_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3D_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/buffer.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" -#include "tensorflow/lite/delegates/gpu/cl/util.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - -namespace tflite { -namespace gpu { -namespace cl { - -class DepthwiseConvolution3D : public GPUOperation { - public: - DepthwiseConvolution3D() = default; - absl::Status AddToQueue(CLCommandQueue* queue) override; - absl::Status Tune(const TuningParameters& params) override; - - absl::Status Compile(const CreationContext& creation_context) override; - - // Move only - DepthwiseConvolution3D(DepthwiseConvolution3D&& operation); - DepthwiseConvolution3D& operator=(DepthwiseConvolution3D&& operation); - DepthwiseConvolution3D(const DepthwiseConvolution3D&) = delete; - DepthwiseConvolution3D& operator=(const DepthwiseConvolution3D&) = delete; - - private: - friend absl::Status CreateDepthwiseConvolution3D( - const CreationContext& creation_context, const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, - DepthwiseConvolution3D* result); - DepthwiseConvolution3D(const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, - const CLDevice& device); - template - absl::Status UploadWeights(const tflite::gpu::Tensor& weights, - CLContext* context); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst); - 
- absl::Status BindArguments(); - int3 GetGridSize() const; - - Texture2D weights_tex2d_; - Buffer weights_buf_; - bool weights_are_buffer_; - - LinearStorage biases_; - - int3 kernel_size_; - int3 stride_; - int3 padding_; - int3 dilation_; - int channel_multiplier_; - - CLKernel kernel_; - int3 work_group_size_; -}; - -template -absl::Status DepthwiseConvolution3D::UploadWeights( - const tflite::gpu::Tensor& weights, CLContext* context) { - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - - const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices; - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - - const int float4_size = f32_weights ? 16 : 8; - - if (f32_weights) { - std::vector gpu_data(elements_count); - RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - if (weights_are_buffer_) { - RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, - gpu_data.data(), context, - &weights_buf_)); - } else { - RETURN_IF_ERROR(CreateTexture2DRGBA( - definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices, - gpu_data.data(), context, &weights_tex2d_)); - } - } else { - std::vector gpu_data(elements_count); - RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - if (weights_are_buffer_) { - RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, - gpu_data.data(), context, - &weights_buf_)); - } else { - RETURN_IF_ERROR(CreateTexture2DRGBA( - definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices, - gpu_data.data(), context, &weights_tex2d_)); - } - } - return absl::OkStatus(); -} - -template -void DepthwiseConvolution3D::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst) { - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - - int counter = 0; - for (int d = 0; d < dst_slices; ++d) { - for (int z = 0; z < kernel_z; ++z) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { - T filter_val; - for (int i = 0; i < 4; ++i) { - const int d_ch = d * 4 + i; - if (d_ch < dst_channels) { - const int f_index = weights.shape.LinearIndex( - {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o}); - filter_val[i] = weights.data[f_index]; - } else { - filter_val[i] = 0.0f; - } - } - dst[counter++] = filter_val; - } - } - } - } -} - -absl::Status CreateDepthwiseConvolution3D( - const CreationContext& creation_context, const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, - DepthwiseConvolution3D* result); - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3D_H_ From bd6f60bf01875b4151ebf7bb37a7172ffa9ec426 Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 23 Jun 2020 09:55:28 -0700 Subject: [PATCH 0891/1390] Rename few LSTM symbols to a name that is consistent among different versions. Also remove an unused std::vector variable. 
PiperOrigin-RevId: 317886335 Change-Id: Ib1e131c42d86b983630a4f8130d9950a45119b4a --- tensorflow/lite/kernels/lstm_eval.cc | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 5b4e8a8d479..ca8344d863b 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -490,7 +490,7 @@ inline void LstmStepFloat( // Temporary pre-allocated storage for quantized values: // quantized_input_ptr (same size as input_ptr) // quantized_output_state_ptr (same size as output_state_ptr) -// quantized_cell_state_ptr (same size as cell_state_ptr) +// quantized_output_scratch (same size as cell_state_ptr) // Temporary pre-allocated storage for recovered values: // recovered_cell_weights (same size as cell_to_*_weights) // @@ -540,7 +540,7 @@ inline void LstmStepHybrid( float* scratch2, float* scratch3, float* scaling_factors, float* scaling_factors_scratch, float* recovered_cell_weights, int8_t* quantized_input_ptr, int8_t* quantized_aux_input_ptr, - int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr, + int8_t* quantized_output_state_ptr, int8_t* quantized_output_scratch, float* output_state_ptr, float* cell_state_ptr, int32_t* accum_scratch_ptr, float* output_ptr, int32_t* zero_points, int32_t* row_sums, int row_sums_size, bool* compute_row_sums, bool asymmetric_quantize_inputs, @@ -882,10 +882,10 @@ inline void LstmStepHybrid( if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) { // Save quantization and matmul computation for all zero input. tensor_utils::BatchQuantizeFloats( - output_gate_scratch, n_batch, n_cell, quantized_cell_state_ptr, + output_gate_scratch, n_batch, n_cell, quantized_output_scratch, scaling_factors, zero_points, asymmetric_quantize_inputs); tensor_utils::MatrixBatchVectorMultiplyAccumulate( - projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr, + projection_weights_ptr, n_output, n_cell, quantized_output_scratch, projection_weights_scale, scaling_factors, n_batch, output_state_ptr, /*per_channel_scale=*/nullptr, asymmetric_quantize_inputs ? zero_points : nullptr, accum_scratch_ptr, @@ -996,7 +996,7 @@ inline void LstmStepHybrid( // output_state_ptr - size 'n_batch * n_output' // cell_state_ptr - size 'n_batch * n_cell' // output_ptr - size 'n_batch * n_output' -inline void LstmStepInteger( +inline void LstmStepInteger8x8_16( const int8_t* input_ptr, const int8_t* input_to_input_weight_ptr, int32_t effective_input_to_input_scale_a, int32_t effective_input_to_input_scale_b, @@ -1060,7 +1060,7 @@ inline void LstmStepInteger( int32_t output_state_zp, int16_t* cell_state_ptr, int8_t* output_ptr, int16_t* scratch0, int16_t* scratch1, int16_t* scratch2, int16_t* scratch3, int8_t* scratch4, int32_t* scratch5, CpuBackendContext* context) { - ruy::profiler::ScopeLabel label("LstmStepInteger"); + ruy::profiler::ScopeLabel label("LstmStepInteger8x8_16"); // Make named scratch buffers for the different gates. int16_t* input_gate_scratch = scratch0; int16_t* forget_gate_scratch = scratch1; @@ -1336,7 +1336,7 @@ inline void LstmStepInteger( // cell_state_ptr - size 'n_batch * n_cell' // output_ptr - size 'n_batch * n_output' // TODO(b/148688698): Move zero point calculation into Prepare(). 
-void LstmStepInteger( +inline void LstmStepInteger8x8_8( const int8_t* input_ptr, int32_t input_zp, const int8_t* input_to_input_weight_ptr, int32_t effective_input_to_input_scale_a, @@ -1391,6 +1391,7 @@ void LstmStepInteger( int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, int16_t* scratch4, int16_t* scratch5, int16_t* scratch6, int16_t* scratch7) { + ruy::profiler::ScopeLabel label("LstmStepInteger8x8_8"); // Make named scratch buffers for the different gates. int16_t* input_gate_scratch = scratch5; int16_t* forget_gate_scratch = scratch2; @@ -1426,7 +1427,7 @@ void LstmStepInteger( tensor_utils::ApplySigmoidFloat(forget_gate_scratch, n_batch, n_cell, forget_gate_scratch); - // Update gate. + // Cell gate. std::fill_n(scratch0, n_batch * n_cell, 0); std::fill_n(scratch1, n_batch * n_cell, 0); tensor_utils::MatrixBatchVectorMultiply( @@ -1444,13 +1445,13 @@ void LstmStepInteger( intermediate_scale_a[4], intermediate_scale_b[4], intermediate_scale_a[5], intermediate_scale_b[5], n_batch, n_cell, cell_gate_scratch); - // Update gate layer norm. + // Cell gate layer norm. tensor_utils::ApplyLayerNormFloat( cell_gate_scratch, layer_norm_cell_weight_ptr, layer_norm_cell_scale_a, layer_norm_cell_scale_b, cell_gate_bias_ptr, n_batch, n_cell, cell_gate_scratch); - // Update gate tanh. + // Cell gate tanh. tensor_utils::ApplyTanhFloat(cell_gate_scratch, n_batch, n_cell, -12, cell_gate_scratch); @@ -1505,7 +1506,6 @@ void LstmStepInteger( tensor_utils::ApplyTanhFloat(cell_state_ptr, n_batch, n_cell, -15, forget_gate_scratch); - std::vector hidden(n_batch * n_cell); tensor_utils::CwiseMul(output_gate_scratch, forget_gate_scratch, n_batch, n_cell, 15 + 15 - 15, cell_gate_scratch); @@ -2004,7 +2004,7 @@ TfLiteStatus EvalInteger8x8_16( const int t_rel = t; int8_t* output_ptr = GetTensorData(output) + t_rel * output_step; const int8_t* input_ptr = GetTensorData(input) + t_rel * input_step; - LstmStepInteger( + LstmStepInteger8x8_16( input_ptr, GetTensorData(input_to_input_weights), integer_lstm_param->effective_input_to_input_scale_a, integer_lstm_param->effective_input_to_input_scale_b, @@ -2140,7 +2140,7 @@ TfLiteStatus EvalInteger8x8_8( int8_t* output_ptr = GetTensorData(output) + t_rel * output_step; // Input can be int8 asymmetric or int16 symmetric. const int8_t* input_ptr = GetTensorData(input) + t_rel * input_step; - lstm_eval::LstmStepInteger( + lstm_eval::LstmStepInteger8x8_8( input_ptr, input_zp, GetTensorData(input_to_input_weights), From b07691301fa26602c35531aa6349e95136c9ef3d Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Tue, 23 Jun 2020 10:05:49 -0700 Subject: [PATCH 0892/1390] MultiProcessRunner: Add UnexpectedSubprocessExitError to be raised if the exit code from a subprocess in unexpected. This results in subprocess having segfault failing the test, which would not have before this change. 
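For orientation only (this note is not part of the patch): the diff below removes `UnexpectedSubprocessExitError` and the `test_seg_fault_raises_error` test, so the following is just a sketch of the usage pattern that deleted test covered, reconstructed from the removed lines and assuming the pre-rollback multi_process_runner API.

# Sketch reconstructed from the deleted test_seg_fault_raises_error; it assumes
# the UnexpectedSubprocessExitError API that this patch removes.
import ctypes

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base


def proc_func_expected_to_seg_fault():
  ctypes.string_at(0)  # Dereferences address 0 to force a segfault in the subprocess.


try:
  multi_process_runner.run(
      proc_func_expected_to_seg_fault,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      list_stdout=True)
except multi_process_runner.UnexpectedSubprocessExitError as e:
  # e.mpr_result.stdout is the list of captured subprocess log lines
  # (the SIGSEGV message shows up there).
  print('\n'.join(e.mpr_result.stdout))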
PiperOrigin-RevId: 317888643 Change-Id: Id1c372ec695e8478bbb9c0888516ca9af4bb9799 --- .../python/distribute/multi_process_runner.py | 38 ++----------------- .../distribute/multi_process_runner_test.py | 35 ----------------- 2 files changed, 3 insertions(+), 70 deletions(-) diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index cb460c8fff5..db31b9c4dd4 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -475,28 +475,14 @@ class MultiProcessRunner(object): process_statuses = self._queue_to_list(self._process_status_queue) if not self._all_forced_terminated and len( process_statuses) != self._outstanding_subprocess_count: - raise UnexpectedSubprocessExitError( - 'Missing status(es) from %d subprocess(es). See logs for details.' % - (self._outstanding_subprocess_count - len(process_statuses)), - self._get_mpr_result(process_statuses)) + raise RuntimeError( + 'missing statuses from %d subproceses.' % + (self._outstanding_subprocess_count - len(process_statuses))) for process_status in process_statuses: assert isinstance(process_status, _ProcessStatusInfo) if not process_status.is_successful: six.reraise(*process_status.exc_info) - # Checking all the processes that are expected to exit properly. - for (task_type, task_id), p in self._processes.items(): - if self._dependence_on_chief and task_type != 'chief': - # If _dependence_on_chief, other processes may have been - # forced-terminated, which is expected. - continue - # Successfully exiting process has exit code 0. - if p.exitcode > 0: - raise UnexpectedSubprocessExitError( - 'Subprocess %s-%d exited with exit code %d. See logs for details.' % - (task_type, task_id, p.exitcode), - self._get_mpr_result(process_statuses)) - logging.info('Joining log reading threads.') for thread in self._reading_threads: thread.join() @@ -532,8 +518,6 @@ class MultiProcessRunner(object): for (task_type, task_id), p in self._processes.items(): try: os.kill(p.pid, sig) - logging.info('%s-%d terminated with signal %r.', task_type, task_id, - sig) except ProcessLookupError: logging.info('Attempting to kill %s-%d but it does not exist.', task_type, task_id) @@ -686,9 +670,6 @@ class _ProcFunc(object): self._resources.process_status_queue.put(info) self._close_streaming() - # Exit with code 0 as it's considered successful exit at this point. - sys.exit(0) - class SubprocessTimeoutError(RuntimeError): """An error that indicates there is at least one subprocess timing out. @@ -703,19 +684,6 @@ class SubprocessTimeoutError(RuntimeError): self.mpr_result = mpr_result -class UnexpectedSubprocessExitError(RuntimeError): - """An error indicating there is at least one subprocess with unexpected exit. - - When this is raised, a `MultiProcessRunnerResult` object can be retrieved by - `UnexpectedSubprocessExitError`'s mpr_result attribute. See - `MultiProcessRunner.join()` for more information. 
- """ - - def __init__(self, msg, mpr_result): - super(UnexpectedSubprocessExitError, self).__init__(msg) - self.mpr_result = mpr_result - - def _set_tf_config(task_type, task_id, cluster_spec, rpc_layer=None): """Set TF_CONFIG environment variable.""" tf_config_dict = { diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index 529d7fd91a5..d6e04010e34 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import ctypes import json import os import threading @@ -299,40 +298,6 @@ class MultiProcessRunnerTest(test.TestCase): self.assertTrue( any('something printed' in line for line in list_to_assert)) - def test_seg_fault_raises_error(self): - - def proc_func_expected_to_seg_fault(): - ctypes.string_at(0) # Intentionally made seg fault. - - with self.assertRaises( - multi_process_runner.UnexpectedSubprocessExitError) as cm: - multi_process_runner.run( - proc_func_expected_to_seg_fault, - multi_worker_test_base.create_cluster_spec(num_workers=1), - list_stdout=True) - self.assertIn('Missing status(es) from 1 subprocess(es).', - str(cm.exception)) - list_to_assert = cm.exception.mpr_result.stdout - self.assertTrue(any('SIGSEGV' in line for line in list_to_assert)) - - def test_seg_fault_in_chief_raises_error(self): - - def proc_func_expected_to_seg_fault(): - if multi_worker_test_base.get_task_type() == 'worker': - time.sleep(10000) - ctypes.string_at(0) # Intentionally made seg fault. - - with self.assertRaises( - multi_process_runner.UnexpectedSubprocessExitError) as cm: - multi_process_runner.run( - proc_func_expected_to_seg_fault, - multi_worker_test_base.create_cluster_spec( - has_chief=True, num_workers=1), - list_stdout=True) - self.assertIn('Subprocess chief-0 exited with exit code', - str(cm.exception)) - list_to_assert = cm.exception.mpr_result.stdout - self.assertTrue(any('SIGSEGV' in line for line in list_to_assert)) if __name__ == '__main__': multi_process_runner.test_main() From e4d6335bcb7a73cd8967c2c12339380aa1ae284f Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Tue, 23 Jun 2020 10:21:31 -0700 Subject: [PATCH 0893/1390] Refactor DecodeImageOp for the purpose of removing redundant data parsing and format checks from python wrapper and having them take place only in kernels. Remove security concerns. This change: - Creates new op kernel (`DecodeImageV2Op`) that can decode all four image formats (jpg, png, gif, bmp). `DecodeImage` is the op name. `DecodeBmpOp` is moved into `DecodeImageV2Op`. (Now we have `gen_image_ops.decode_image` as opposed to previous `decode_image` which was a pure python implementation.) - Updates GIF decoder to take in `expand_animation` flag for decoding just one frame. - Removes data parsing and format checking logic from python layer entirely. - Updates magic bytes for detecting image formats. - Replicates portions of `convert_image_dtype` functionality in kernel (for optionally converting uint8/uint16 -> float32). 
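For orientation only (not part of the patch): the behavior listed above is reached through the public `tf.io.decode_image` entry point, which per this description now defers format detection and decoding to the new kernel. A minimal usage sketch follows; the file name is a placeholder, everything else is the existing public API.

# 'example.gif' is a placeholder path; dtype=tf.float32 exercises the in-kernel
# uint8 -> float32 conversion noted above, and expand_animations=False decodes
# only the first frame of an animated GIF.
import tensorflow as tf

contents = tf.io.read_file('example.gif')
image = tf.io.decode_image(
    contents, channels=3, dtype=tf.float32, expand_animations=False)
print(image.shape, image.dtype)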
PiperOrigin-RevId: 317891936 Change-Id: I84f18e053f6dad845d9f2a61e1119f4de131c85d --- .../base_api/api_def_DecodeImage.pbtxt | 51 ++ .../python_api/api_def_DecodeImage.pbtxt | 4 + tensorflow/core/kernels/decode_image_op.cc | 459 +++++++++++++++++- tensorflow/core/lib/gif/gif_io.cc | 13 +- tensorflow/core/lib/gif/gif_io.h | 2 +- tensorflow/core/ops/image_ops.cc | 45 ++ tensorflow/core/ops/image_ops_test.cc | 33 +- tensorflow/python/ops/image_ops_impl.py | 17 + tensorflow/python/ops/image_ops_test.py | 201 ++++---- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 11 files changed, 738 insertions(+), 95 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_DecodeImage.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt new file mode 100644 index 00000000000..c534425eb24 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt @@ -0,0 +1,51 @@ +op { + graph_op_name: "DecodeImage" + in_arg { + name: "contents" + description: <

A legal label file is the plain text file whose contents are split into lines, and each line - * is an individual value. The file should be in assets of the context. - * - * @param context The context holds assets. - * @param filePath The path of the label file, relative with assets directory. - * @return a list of labels. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadLabels(@NonNull Context context, @NonNull String filePath) - throws IOException { - return loadLabels(context, filePath, Charset.defaultCharset()); - } - - /** - * Loads labels from the label file into a list of strings. - * - *
A legal label file is the plain text file whose contents are split into lines, and each line - * is an individual value. The empty lines will be ignored. The file should be in assets of the - * context. - * - * @param context The context holds assets. - * @param filePath The path of the label file, relative with assets directory. - * @param cs {@code Charset} to use when decoding content of label file. - * @return a list of labels. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadLabels( - @NonNull Context context, @NonNull String filePath, Charset cs) throws IOException { - SupportPreconditions.checkNotNull(context, "Context cannot be null."); - SupportPreconditions.checkNotNull(filePath, "File path cannot be null."); - try (InputStream inputStream = context.getAssets().open(filePath)) { - return loadLabels(inputStream, cs); - } - } - - /** - * Loads labels from an input stream of an opened label file. See details for label files in - * {@link FileUtil#loadLabels(Context, String)}. - * - * @param inputStream the input stream of an opened label file. - * @return a list of labels. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadLabels(@NonNull InputStream inputStream) throws IOException { - return loadLabels(inputStream, Charset.defaultCharset()); - } - - /** - * Loads labels from an input stream of an opened label file. See details for label files in - * {@link FileUtil#loadLabels(Context, String)}. - * - * @param inputStream the input stream of an opened label file. - * @param cs {@code Charset} to use when decoding content of label file. - * @return a list of labels. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadLabels(@NonNull InputStream inputStream, Charset cs) - throws IOException { - List labels = new ArrayList<>(); - try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, cs))) { - String line; - while ((line = reader.readLine()) != null) { - if (line.trim().length() > 0) { - labels.add(line); - } - } - return labels; - } - } - - /** - * Loads a vocabulary file (a single-column text file) into a list of strings. - * - *
A vocabulary file is a single-column plain text file whose contents are split into lines, - * and each line is an individual value. The file should be in assets of the context. - * - * @param context The context holds assets. - * @param filePath The path of the vocabulary file, relative with assets directory. - * @return a list of vocabulary words. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadSingleColumnTextFile( - @NonNull Context context, @NonNull String filePath, Charset cs) throws IOException { - return loadLabels(context, filePath, cs); - } - - /** - * Loads vocabulary from an input stream of an opened vocabulary file (which is a single-column - * text file). See details for vocabulary files in {@link FileUtil#loadVocabularyFile(Context, - * String)}. - * - * @param inputStream the input stream of an opened vocabulary file. - * @return a list of vocabulary words. - * @throws IOException if error occurs to open or read the file. - */ - @NonNull - public static List loadSingleColumnTextFile(@NonNull InputStream inputStream, Charset cs) - throws IOException { - return loadLabels(inputStream, cs); - } - - /** - * Loads a file from the asset folder through memory mapping. - * - * @param context Application context to access assets. - * @param filePath Asset path of the file. - * @return the loaded memory mapped file. - * @throws IOException if an I/O error occurs when loading the tflite model. - */ - @NonNull - public static MappedByteBuffer loadMappedFile(@NonNull Context context, @NonNull String filePath) - throws IOException { - SupportPreconditions.checkNotNull(context, "Context should not be null."); - SupportPreconditions.checkNotNull(filePath, "File path cannot be null."); - try (AssetFileDescriptor fileDescriptor = context.getAssets().openFd(filePath); - FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor())) { - FileChannel fileChannel = inputStream.getChannel(); - long startOffset = fileDescriptor.getStartOffset(); - long declaredLength = fileDescriptor.getDeclaredLength(); - return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength); - } - } - - /** - * Loads a binary file from the asset folder. - * - * @param context Application context to access assets. - * @param filePath Asset path of the file. - * @return the byte array for the binary file. - * @throws IOException if an I/O error occurs when loading file. - */ - @NonNull - public static byte[] loadByteFromFile(@NonNull Context context, @NonNull String filePath) - throws IOException { - ByteBuffer buffer = loadMappedFile(context, filePath); - byte[] byteArray = new byte[buffer.remaining()]; - buffer.get(byteArray); - return byteArray; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Operator.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Operator.java deleted file mode 100644 index 38dfe8818cb..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Operator.java +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common; - -/** - * The common interface for classes that carries an "apply" method, which converts T to another one. - * @param The class which Operator handles. - */ -public interface Operator { - - /** - * Applies an operation on a T object, returning a T object. - * - *
Note: The returned object could probably be the same one with given input, and given input - * could probably be changed. - */ - T apply(T x); -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Processor.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Processor.java deleted file mode 100644 index 07d7e2bda43..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/Processor.java +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common; - -/** - * Processes T object with prepared {@link Operator}. - */ -public interface Processor { - T process(T input); -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SequentialProcessor.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SequentialProcessor.java deleted file mode 100644 index ff0c6406f03..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SequentialProcessor.java +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.checkerframework.checker.nullness.qual.NonNull; - -/** - * A processor base class that chains a serial of {@link Operator} and executes them. - * - *

Typically, users could use its subclasses, e.g. {@link - * org.tensorflow.lite.support.image.ImageProcessor} rather than directly use this one. - * - * @param The type that the Operator is handling. - */ -public class SequentialProcessor implements Processor { - - /** List of operators added to this {@link SequentialProcessor}. */ - protected final List> operatorList; - /** - * The {@link Map} between the operator name and the corresponding op indexes in {@code - * operatorList}. An operator may be added multiple times into this {@link SequentialProcessor}. - */ - protected final Map> operatorIndex; - - protected SequentialProcessor(Builder builder) { - operatorList = builder.operatorList; - operatorIndex = Collections.unmodifiableMap(builder.operatorIndex); - } - - @Override - public T process(T x) { - for (Operator op : operatorList) { - x = op.apply(x); - } - return x; - } - - /** The inner builder class to build a Sequential Processor. */ - protected static class Builder { - - private final List> operatorList; - private final Map> operatorIndex; - - protected Builder() { - operatorList = new ArrayList<>(); - operatorIndex = new HashMap<>(); - } - - public Builder add(@NonNull Operator op) { - SupportPreconditions.checkNotNull(op, "Adding null Op is illegal."); - operatorList.add(op); - String operatorName = op.getClass().getName(); - if (!operatorIndex.containsKey(operatorName)) { - operatorIndex.put(operatorName, new ArrayList()); - } - operatorIndex.get(operatorName).add(operatorList.size() - 1); - return this; - } - - public SequentialProcessor build() { - return new SequentialProcessor(this); - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java deleted file mode 100644 index 8620e13eec7..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common; - -import org.checkerframework.checker.nullness.qual.Nullable; - -/** Static error checking util methods. */ -public final class SupportPreconditions { - /** - * Ensures that an object reference passed as a parameter to the calling method is not null. - * - * @param reference an object reference - * @return the non-null reference that was validated - * @throws NullPointerException if {@code reference} is null - */ - public static T checkNotNull(T reference) { - if (reference == null) { - throw new NullPointerException("The object reference is null."); - } - return reference; - } - - /** - * Ensures that an object reference passed as a parameter to the calling method is not null. 
- * - * @param reference an object reference - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @return the non-null reference that was validated - * @throws NullPointerException if {@code reference} is null - */ - public static T checkNotNull(T reference, @Nullable Object errorMessage) { - if (reference == null) { - throw new NullPointerException(String.valueOf(errorMessage)); - } - return reference; - } - - /** - * Ensures that the given String is not empty and not null. - * - * @param string the String to test - * @return the non-null non-empty String that was validated - * @throws IllegalArgumentException if {@code string} is null or empty - */ - public static String checkNotEmpty(String string) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException("Given String is empty or null."); - } - return string; - } - - /** - * Ensures that the given String is not empty and not null. - * - * @param string the String to test - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @return the non-null non-empty String that was validated - * @throws IllegalArgumentException if {@code string} is null or empty - */ - public static String checkNotEmpty(String string, Object errorMessage) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException(String.valueOf(errorMessage)); - } - return string; - } - - /** - * Ensures the truth of an expression involving one or more parameters to the calling method. - * - * @param expression a boolean expression. - * @throws IllegalArgumentException if {@code expression} is false. - */ - public static void checkArgument(boolean expression) { - if (!expression) { - throw new IllegalArgumentException(); - } - } - - /** - * Ensures the truth of an expression involving one or more parameters to the calling method. - * - * @param expression a boolean expression. - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)}. - * @throws IllegalArgumentException if {@code expression} is false. - */ - public static void checkArgument(boolean expression, @Nullable Object errorMessage) { - if (!expression) { - throw new IllegalArgumentException(String.valueOf(errorMessage)); - } - } - - /** - * Ensures that {@code index} specifies a valid element in an array, list or string of size - * {@code size}. An element index may range from zero, inclusive, to {@code size}, exclusive. - * - * @param index a user-supplied index identifying an element of an array, list or string - * @param size the size of that array, list or string - * @return the value of {@code index} - * @throws IndexOutOfBoundsException if {@code index} is negative or is not less than {@code size} - * @throws IllegalArgumentException if {@code size} is negative - */ - public static int checkElementIndex(int index, int size) { - return checkElementIndex(index, size, "index"); - } - - /** - * Ensures that {@code index} specifies a valid element in an array, list or string of size - * {@code size}. An element index may range from zero, inclusive, to {@code size}, exclusive. 
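A minimal sketch of how these precondition helpers are typically called at the top of a method (illustrative only, not from the deleted files):

```java
// Sketch: null/emptiness/index checks using the SupportPreconditions utilities above.
import org.tensorflow.lite.support.common.SupportPreconditions;

final class PreconditionsSketch {
  static float pick(float[] values, int index) {
    SupportPreconditions.checkNotNull(values, "values cannot be null");
    SupportPreconditions.checkArgument(values.length > 0, "values cannot be empty");
    // Throws IndexOutOfBoundsException with a descriptive message if index is out of range.
    SupportPreconditions.checkElementIndex(index, values.length, "index");
    return values[index];
  }

  public static void main(String[] args) {
    System.out.println(pick(new float[] {1f, 2f, 3f}, 1)); // prints 2.0
  }
}
```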
- * - * @param index a user-supplied index identifying an element of an array, list or string - * @param size the size of that array, list or string - * @param desc the text to use to describe this index in an error message - * @return the value of {@code index} - * @throws IndexOutOfBoundsException if {@code index} is negative or is not less than {@code size} - * @throws IllegalArgumentException if {@code size} is negative - */ - public static int checkElementIndex(int index, int size, @Nullable String desc) { - // Carefully optimized for execution by hotspot (explanatory comment above) - if (index < 0 || index >= size) { - throw new IndexOutOfBoundsException(badElementIndex(index, size, desc)); - } - return index; - } - - /** - * Ensures the truth of an expression involving the state of the calling instance, but not - * involving any parameters to the calling method. - * - * @param expression a boolean expression - * @throws IllegalStateException if {@code expression} is false - * @see Verify#verify Verify.verify() - */ - public static void checkState(boolean expression) { - if (!expression) { - throw new IllegalStateException(); - } - } - - /** - * Ensures the truth of an expression involving the state of the calling instance, but not - * involving any parameters to the calling method. - * - * @param expression a boolean expression - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @throws IllegalStateException if {@code expression} is false - * @see Verify#verify Verify.verify() - */ - public static void checkState(boolean expression, @Nullable Object errorMessage) { - if (!expression) { - throw new IllegalStateException(String.valueOf(errorMessage)); - } - } - - private static String badElementIndex(int index, int size, @Nullable String desc) { - if (index < 0) { - return String.format("%s (%s) must not be negative", desc, index); - } else if (size < 0) { - throw new IllegalArgumentException("negative size: " + size); - } else { // index >= size - return String.format("%s (%s) must be less than size (%s)", desc, index, size); - } - } - - private SupportPreconditions() { - throw new AssertionError("SupportPreconditions is Uninstantiable."); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorOperator.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorOperator.java deleted file mode 100644 index d1b7021df25..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorOperator.java +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -package org.tensorflow.lite.support.common; - -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Applies some operation on TensorBuffers. - */ -public interface TensorOperator extends Operator { - /** @see Operator#apply(Object) . */ - @Override - TensorBuffer apply(TensorBuffer input); -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorProcessor.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorProcessor.java deleted file mode 100644 index 31531b2eb6a..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/TensorProcessor.java +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common; - -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * TensorProcessor is a helper class for preprocessing and postprocessing tensors. It could - * transform a {@link TensorBuffer} to another by executing a chain of {@link TensorOperator}. - * - *

Example Usage: - * - *

- *   TensorProcessor processor = new TensorProcessor.Builder().add(new NormalizeOp(1, 2)).build();
- *   TensorBuffer anotherTensorBuffer = processor.process(tensorBuffer);
- * 
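A slightly fuller sketch of the usage shown above, assuming TensorBuffer#createFixedSize and #loadArray from the tensorbuffer package (illustrative, not part of the patch):

```java
// Sketch: build a TensorProcessor with one NormalizeOp (mean=1, stddev=2) and run it on a
// small FLOAT32 buffer, i.e. output = (input - 1) / 2 for every element.
import org.tensorflow.lite.DataType;
import org.tensorflow.lite.support.common.TensorProcessor;
import org.tensorflow.lite.support.common.ops.NormalizeOp;
import org.tensorflow.lite.support.tensorbuffer.TensorBuffer;

final class TensorProcessorSketch {
  static TensorBuffer normalize(TensorBuffer input) {
    TensorProcessor processor =
        new TensorProcessor.Builder().add(new NormalizeOp(1f, 2f)).build();
    return processor.process(input);
  }

  public static void main(String[] args) {
    TensorBuffer input = TensorBuffer.createFixedSize(new int[] {1, 3}, DataType.FLOAT32);
    input.loadArray(new float[] {1f, 3f, 5f}, new int[] {1, 3});
    // Expected output values: {0.0, 1.0, 2.0}.
    float[] out = normalize(input).getFloatArray();
    System.out.println(java.util.Arrays.toString(out));
  }
}
```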
- * - * @see TensorProcessor.Builder to build a {@link TensorProcessor} instance. - * @see TensorProcessor#process(TensorBuffer) to apply the processor on a {@link TensorBuffer}. - */ -public class TensorProcessor extends SequentialProcessor { - private TensorProcessor(Builder builder) { - super(builder); - } - - /** The Builder to create an {@link TensorProcessor}, which could be executed later. */ - public static class Builder extends SequentialProcessor.Builder { - - /** - * Creates a Builder to build {@link TensorProcessor}. - * - * @see #add(TensorOperator) to add an Op. - * @see #build() to complete the building process and get a built Processor. - */ - public Builder() { - super(); - } - - /** - * Adds an {@link TensorOperator} into the Operator chain. - * - * @param op the Operator instance to be executed then. - */ - public TensorProcessor.Builder add(TensorOperator op) { - super.add(op); - return this; - } - - /** Completes the building process and gets the {@link TensorProcessor} instance. */ - @Override - public TensorProcessor build() { - return new TensorProcessor(this); - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/CastOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/CastOp.java deleted file mode 100644 index 3355b185655..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/CastOp.java +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common.ops; - -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** Casts a {@link TensorBuffer} to a specified data type. */ -public class CastOp implements TensorOperator { - - private final DataType destinationType; - - /** - * Constructs a CastOp. - * - *

Note: To convert the type of a single {@link TensorBuffer} on the fly rather than in - * a processor, use {@link TensorBuffer#createFrom(TensorBuffer, DataType)} directly. - * - *

When this Op is executed, if the original {@link TensorBuffer} is already in {@code - * destinationType}, the original buffer will be directly returned. - * - * @param destinationType: The type of the casted {@link TensorBuffer}. - * @throws IllegalArgumentException if {@code destinationType} is neither {@link DataType#UINT8} - * nor {@link DataType#FLOAT32}. - */ - public CastOp(DataType destinationType) { - SupportPreconditions.checkArgument( - destinationType == DataType.UINT8 || destinationType == DataType.FLOAT32, - "Destination type " + destinationType + " is not supported."); - this.destinationType = destinationType; - } - - @Override - public TensorBuffer apply(TensorBuffer input) { - if (input.getDataType() == destinationType) { - return input; - } - return TensorBuffer.createFrom(input, destinationType); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/DequantizeOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/DequantizeOp.java deleted file mode 100644 index 1881747870b..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/DequantizeOp.java +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common.ops; - -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Dequantizes a {@link TensorBuffer} with given {@code zeroPoint} and {@code scale}. - * - *

Note: The data type of the output tensor is always {@code FLOAT32}, except when the DequantizeOp is - * effectively an identity Op, e.g. when {@code zeroPoint} is 0 and {@code scale} is - * 1 (in this case, the output tensor is the same instance as the input). - * - *

If both {@code zeroPoint} and {@code scale} are 0, the {@link DequantizeOp} will be bypassed, - * which is equivalent to setting {@code zeroPoint} to 0 and {@code scale} to 1. This can be useful - * when passing in the quantization parameters that are extracted directly from the TFLite model - * flatbuffer. If the tensor is not quantized, both {@code zeroPoint} and {@code scale} will be read - * as 0. - */ -public class DequantizeOp extends NormalizeOp implements TensorOperator { - - public DequantizeOp(float zeroPoint, float scale) { - // Quantization: f = (q - z) * s - super(zeroPoint, 1 / scale); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/NormalizeOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/NormalizeOp.java deleted file mode 100644 index 8ac57eed286..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/NormalizeOp.java +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common.ops; - -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; -import org.tensorflow.lite.support.tensorbuffer.TensorBufferFloat; - -/** - * Normalizes a {@link TensorBuffer} with given mean and stddev: output = (input - mean) / stddev. - */ -public class NormalizeOp implements TensorOperator { - - // mean.length should always be equal to stddev.length and always >= 1. - private final float[] mean; - private final float[] stddev; - private final int numChannels; - private final boolean isIdentityOp; - - /** - * Initializes a NormalizeOp. When being called, it creates a new {@link TensorBuffer}, which - * satisfies: - * - *

-   *   output = (input - mean) / stddev
-   * 
- * - *

In the following two cases, reset {@code mean} to 0 and {@code stddev} to 1 to bypass the - * normalization.
- * 1. Both {@code mean} and {@code stddev} are 0.
- * 2. {@code mean} is 0 and {@code stddev} is Infinity. - * - *

Note: If {@code mean} is set to 0 and {@code stddev} is set to 1, no computation will - * happen, and the original input will be returned directly during execution. - * - *
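A worked sketch of the scalar constructor: with mean = 127.5 and stddev = 127.5 the op maps [0, 255] pixel values onto [-1, 1], while NormalizeOp(0, 1) would be bypassed as described above (illustrative, not part of the patch):

```java
// Sketch: output = (input - 127.5) / 127.5 applied element-wise.
import org.tensorflow.lite.DataType;
import org.tensorflow.lite.support.common.ops.NormalizeOp;
import org.tensorflow.lite.support.tensorbuffer.TensorBuffer;

final class NormalizeOpSketch {
  public static void main(String[] args) {
    TensorBuffer pixels = TensorBuffer.createFixedSize(new int[] {3}, DataType.FLOAT32);
    pixels.loadArray(new float[] {0f, 127.5f, 255f}, new int[] {3});
    TensorBuffer normalized = new NormalizeOp(127.5f, 127.5f).apply(pixels);
    // Expected: [-1.0, 0.0, 1.0]
    System.out.println(java.util.Arrays.toString(normalized.getFloatArray()));
  }
}
```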

Note: The returned {@link TensorBuffer} is always a {@link DataType#FLOAT32} tensor at - * present, except that the input is a {@link DataType#UINT8} tensor, {@code mean} is set to 0 and - * {@code stddev} is set to 1. - * - * @param mean the mean value to be subtracted first. - * @param stddev the standard deviation value to divide then. - * @throws IllegalArgumentException if {@code stddev} is zero. - */ - public NormalizeOp(float mean, float stddev) { - // Make exceptions to the cases that - // 1. Both mean and stddev are 0.0f. This may happen when reading the normalization parameters - // from a tensor which does not have the values populated in the metadata. The same situation - // may also happen to the quantization parameters. - // 2. mean is 0.0f and stddev is Infinity. This may happen when reading the quantization - // parameters from a tensor which does not have the values populated in the metadata, and then - // passing the parameters into the DequantizeOp. - // Bypass both of the two cases, by reseting stddev to 1.0f. - if (mean == 0.0f && (stddev == 0.0f || Float.isInfinite(stddev))) { - stddev = 1.0f; - } - - SupportPreconditions.checkArgument(stddev != 0.0f, "Stddev cannot be zero."); - boolean meansIsZeroAndDevsIs1 = false; - if (mean == 0.0f && stddev == 1.0f) { - meansIsZeroAndDevsIs1 = true; - } - - this.isIdentityOp = meansIsZeroAndDevsIs1; - this.mean = new float[] {mean}; - this.stddev = new float[] {stddev}; - this.numChannels = 1; - } - - /** - * Initializes a NormalizeOp. When being called, it creates a new {@link TensorBuffer}, which - * satisfies: - * - *

-   *   // Pseudo code. [...][i] means a certain element whose channel id is i.
-   *   output[...][i] = (input[...][i] - mean[i]) / stddev[i]
-   * 
- * - *

Note: If all values in {@code mean} are set to 0 and all {@code stddev} are set to 1, no - * computation will happen, and the original input will be returned directly during execution. - * - *

Note: The returned {@link TensorBuffer} is always a {@link DataType#FLOAT32} tensor at - * present, except that the input is a {@link DataType#UINT8} tensor, all {@code mean} are set to - * 0 and all {@code stddev} are set to 1. - * - * @param mean the mean values to be subtracted first for each channel. - * @param stddev the standard deviation values to divide then for each channel. - * @throws IllegalArgumentException if any {@code stddev} is zero, or {@code mean} has different - * number of elements with {@code stddev}, or any of them is empty. - */ - public NormalizeOp(@NonNull float[] mean, @NonNull float[] stddev) { - SupportPreconditions.checkNotNull(mean, "Mean cannot be null"); - SupportPreconditions.checkNotNull(stddev, "Stddev cannot be null"); - SupportPreconditions.checkArgument( - mean.length == stddev.length, - "Per channel normalization requires same number of means and stddevs"); - SupportPreconditions.checkArgument(mean.length > 0, "Means and stddevs are empty."); - this.mean = mean.clone(); - this.stddev = stddev.clone(); - boolean allMeansAreZeroAndAllDevsAre1 = true; - this.numChannels = mean.length; - for (int i = 0; i < numChannels; i++) { - SupportPreconditions.checkArgument(this.stddev[i] != 0, "Stddev cannot be zero."); - if (this.stddev[i] != 1 || this.mean[i] != 0) { - allMeansAreZeroAndAllDevsAre1 = false; - } - } - this.isIdentityOp = allMeansAreZeroAndAllDevsAre1; - } - - /** - * Applies the defined normalization on given tensor and returns the result. - * - *

Note: {@code input} is possibly the same instance with the output. - * - * @param input input tensor. It may be the same instance with the output. - * @return output tensor. - */ - @Override - @NonNull - public TensorBuffer apply(@NonNull TensorBuffer input) { - if (isIdentityOp) { - return input; - } - int[] shape = input.getShape(); - SupportPreconditions.checkArgument( - numChannels == 1 || (shape.length != 0 && shape[shape.length - 1] == numChannels), - "Number of means (stddevs) is not same with number of channels (size of last axis)."); - // TODO(136750944): Eliminate the array copy here. - float[] values = input.getFloatArray(); - int j = 0; - for (int i = 0; i < values.length; i++) { - values[i] = (values[i] - mean[j]) / stddev[j]; - j = (j + 1) % numChannels; - } - TensorBuffer output; - if (input.isDynamic()) { - output = TensorBufferFloat.createDynamic(DataType.FLOAT32); - } else { - output = TensorBufferFloat.createFixedSize(shape, DataType.FLOAT32); - } - output.loadArray(values, shape); - return output; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/QuantizeOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/QuantizeOp.java deleted file mode 100644 index 8b3e82aee13..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/ops/QuantizeOp.java +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.common.ops; - -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Quantizes a {@link TensorBuffer} with given {@code zeroPoint} and {@code scale}. - * - *

Note: {@link QuantizeOp} does not cast the output to UINT8, but only performs the quantization - * math on top of the input. The data type of the output tensor is always {@code FLOAT32}, except when the Op - * is effectively an identity Op (in this case, the output tensor is the same instance as the - * input). To connect with a quantized model, a {@link CastOp} is probably needed. - * - *
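A worked sketch of the round trip with zeroPoint = 128 and scale = 1/128: QuantizeOp yields q = f / scale + zeroPoint as FLOAT32, CastOp converts to UINT8, and DequantizeOp recovers f = (q - zeroPoint) * scale (illustrative, not part of the patch):

```java
// Sketch: quantize 0.5 to 192, cast to UINT8, then dequantize back to 0.5.
import org.tensorflow.lite.DataType;
import org.tensorflow.lite.support.common.ops.CastOp;
import org.tensorflow.lite.support.common.ops.DequantizeOp;
import org.tensorflow.lite.support.common.ops.QuantizeOp;
import org.tensorflow.lite.support.tensorbuffer.TensorBuffer;

final class QuantizationSketch {
  public static void main(String[] args) {
    float zeroPoint = 128f;
    float scale = 1f / 128f;

    TensorBuffer f = TensorBuffer.createFixedSize(new int[] {1}, DataType.FLOAT32);
    f.loadArray(new float[] {0.5f}, new int[] {1});

    // 0.5 / (1/128) + 128 = 192.0 (FLOAT32), then cast to UINT8 for a quantized model input.
    TensorBuffer q = new CastOp(DataType.UINT8).apply(new QuantizeOp(zeroPoint, scale).apply(f));

    // (192 - 128) * (1/128) = 0.5 again.
    TensorBuffer back = new DequantizeOp(zeroPoint, scale).apply(q);
    System.out.println(back.getFloatArray()[0]); // 0.5
  }
}
```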

If both {@code zeroPoint} and {@code scale} are 0, the {@link QuantizeOp} will be bypassed, - * which is equivalent to setting {@code zeroPoint} to 0 and {@code scale} to 1. This can be useful - * when passing in the quantization parameters that are extracted directly from the TFLite model - * flatbuffer. If the tensor is not quantized, both {@code zeroPoint} and {@code scale} will be read - * as 0. - */ -public class QuantizeOp extends NormalizeOp implements TensorOperator { - - public QuantizeOp(float zeroPoint, float scale) { - // Quantization: f = (q - z) * s, i.e. q = f / s + z = (f - (-z * s)) / s - super(-zeroPoint * scale, scale); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/BoundingBoxUtil.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/BoundingBoxUtil.java deleted file mode 100644 index 30f562063f3..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/BoundingBoxUtil.java +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image; - -import static org.tensorflow.lite.support.common.SupportPreconditions.checkArgument; - -import android.graphics.RectF; -import java.nio.ByteBuffer; -import java.nio.FloatBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Helper class for converting values that represents bounding boxes into rectangles. - * - *

The class provides a static function to create bounding boxes as {@link RectF} from different - * types of configurations. - * - *

Generally, a bounding box could be represented by 4 float values, but the values could be - * interpreted in many ways. We now support 3 {@link Type} of configurations, and the order of - * elements in each type is configurable as well. - */ -public final class BoundingBoxUtil { - - /** Denotes how a bounding box is represented. */ - public enum Type { - /** - * Represents the bounding box by using the combination of boundaries, {left, top, right, - * bottom}. The default order is {left, top, right, bottom}. Other orders can be indicated by an - * index array. - */ - BOUNDARIES, - /** - * Represents the bounding box by using the upper_left corner, width and height. The default - * order is {upper_left_x, upper_left_y, width, height}. Other orders can be indicated by an - * index array. - */ - UPPER_LEFT, - /** - * Represents the bounding box by using the center of the box, width and height. The default - * order is {center_x, center_y, width, height}. Other orders can be indicated by an index - * array. - */ - CENTER, - } - - /** Denotes if the coordinates are actual pixels or relative ratios. */ - public enum CoordinateType { - /** The coordinates are relative ratios in range [0, 1]. */ - RATIO, - /** The coordinates are actual pixel values. */ - PIXEL - } - - /** - * Creates a list of bounding boxes from a {@link TensorBuffer} which represents bounding boxes. - * - * @param tensor holds the data representing some boxes. - * @param valueIndex denotes the order of the elements defined in each bounding box type. An empty - * index array represent the default order of each bounding box type. For example, to denote - * the default order of BOUNDARIES, {left, top, right, bottom}, the index should be {0, 1, 2, - * 3}. To denote the order {left, right, top, bottom}, the order should be {0, 2, 1, 3}. - *

The index array can be applied to all bounding box types to adjust the order of their - * corresponding underlying elements. - * @param boundingBoxAxis specifies the index of the dimension that represents bounding box. The - * size of that dimension is required to be 4. Index here starts from 0. For example, if the - * tensor has shape 4x10, the axis for bounding boxes is likely to be 0. For shape 10x4, the - * axis is likely to be 1 (or -1, equivalently). - * @param type defines how values should be converted into boxes. See {@link Type} - * @param coordinateType defines how values are interpreted to coordinates. See {@link - * CoordinateType} - * @param height the height of the image which the boxes belong to. Only has effects when {@code - * coordinateType} is {@link CoordinateType#RATIO} - * @param width the width of the image which the boxes belong to. Only has effects when {@code - * coordinateType} is {@link CoordinateType#RATIO} - * @return A list of bounding boxes that the {@code tensor} represents. All dimensions except - * {@code boundingBoxAxis} will be collapsed with order kept. For example, given {@code - * tensor} with shape {1, 4, 10, 2} and {@code boundingBoxAxis = 1}, The result will be a list - * of 20 bounding boxes. - * @throws IllegalArgumentException if size of bounding box dimension (set by {@code - * boundingBoxAxis}) is not 4. - * @throws IllegalArgumentException if {@code boundingBoxAxis} is not in {@code (-(D+1), D)} where - * {@code D} is the number of dimensions of the {@code tensor}. - * @throws IllegalArgumentException if {@code tensor} has data type other than {@link - * DataType#FLOAT32}. - */ - public static List convert( - TensorBuffer tensor, - int[] valueIndex, - int boundingBoxAxis, - Type type, - CoordinateType coordinateType, - int height, - int width) { - int[] shape = tensor.getShape(); - checkArgument( - boundingBoxAxis >= -shape.length && boundingBoxAxis < shape.length, - String.format( - "Axis %d is not in range (-(D+1), D), where D is the number of dimensions of input" - + " tensor (shape=%s)", - boundingBoxAxis, Arrays.toString(shape))); - if (boundingBoxAxis < 0) { - boundingBoxAxis = shape.length + boundingBoxAxis; - } - checkArgument( - shape[boundingBoxAxis] == 4, - String.format( - "Size of bounding box dimension %d is not 4. Got %d in shape %s", - boundingBoxAxis, shape[boundingBoxAxis], Arrays.toString(shape))); - checkArgument( - valueIndex.length == 4, - String.format( - "Bounding box index array length %d is not 4. Got index array %s", - valueIndex.length, Arrays.toString(valueIndex))); - checkArgument( - tensor.getDataType() == DataType.FLOAT32, - "Bounding Boxes only create from FLOAT32 buffers. Got: " + tensor.getDataType().name()); - List boundingBoxList = new ArrayList<>(); - // Collapse dimensions to {a, 4, b}. So each bounding box could be represent as (i, j), and its - // four values are (i, k, j), where 0 <= k < 4. We can compute the 4 flattened index by - // i * 4b + k * b + j. 
- int a = 1; - for (int i = 0; i < boundingBoxAxis; i++) { - a *= shape[i]; - } - int b = 1; - for (int i = boundingBoxAxis + 1; i < shape.length; i++) { - b *= shape[i]; - } - float[] values = new float[4]; - ByteBuffer byteBuffer = tensor.getBuffer(); - byteBuffer.rewind(); - FloatBuffer floatBuffer = byteBuffer.asFloatBuffer(); - for (int i = 0; i < a; i++) { - for (int j = 0; j < b; j++) { - for (int k = 0; k < 4; k++) { - values[k] = floatBuffer.get((i * 4 + k) * b + j); - } - boundingBoxList.add( - convertOneBoundingBox(values, valueIndex, type, coordinateType, height, width)); - } - } - byteBuffer.rewind(); - return boundingBoxList; - } - - private static RectF convertOneBoundingBox( - float[] values, - int[] valueIndex, - Type type, - CoordinateType coordinateType, - int height, - int width) { - float[] orderedValues = new float[4]; - for (int i = 0; i < 4; i++) { - orderedValues[i] = values[valueIndex[i]]; - } - return convertOneBoundingBox(orderedValues, type, coordinateType, height, width); - } - - private static RectF convertOneBoundingBox( - float[] values, Type type, CoordinateType coordinateType, int height, int width) { - switch (type) { - case BOUNDARIES: - return convertFromBoundaries(values, coordinateType, height, width); - case UPPER_LEFT: - case CENTER: - // TODO(b/150824448): convertFrom{UpperLeft, Center} - throw new IllegalArgumentException("BoundingBox.Type " + type + " is not yet supported."); - } - throw new IllegalArgumentException("Cannot recognize BoundingBox.Type " + type); - } - - private static RectF convertFromBoundaries( - float[] values, CoordinateType coordinateType, int height, int width) { - if (coordinateType == CoordinateType.RATIO) { - return new RectF( - values[0] * width, values[1] * height, values[2] * width, values[3] * height); - } else { - return new RectF(values[0], values[1], values[2], values[3]); - } - } - - // Private constructor to prevent initialization. - private BoundingBoxUtil() {} -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageConversions.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageConversions.java deleted file mode 100644 index b2b7a339a75..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageConversions.java +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image; - -import android.graphics.Bitmap; -import android.graphics.Color; -import java.util.Arrays; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Implements some stateless image conversion methods. - * - * This class is an internal helper for {@link org.tensorflow.lite.support.image}. 
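A sketch of converting a {1, 4} FLOAT32 tensor of ratio boundaries into pixel-space RectF boxes for a 640x480 image (illustrative, Android-only, not part of the patch):

```java
// Sketch: BOUNDARIES boxes in {left, top, right, bottom} order, given as ratios in [0, 1].
import android.graphics.RectF;
import java.util.List;
import org.tensorflow.lite.DataType;
import org.tensorflow.lite.support.image.BoundingBoxUtil;
import org.tensorflow.lite.support.tensorbuffer.TensorBuffer;

final class BoundingBoxSketch {
  static List<RectF> toPixelBoxes(TensorBuffer boxes) {
    return BoundingBoxUtil.convert(
        boxes,
        new int[] {0, 1, 2, 3},            // default {left, top, right, bottom} order
        -1,                                // bounding box values live on the last axis
        BoundingBoxUtil.Type.BOUNDARIES,
        BoundingBoxUtil.CoordinateType.RATIO,
        /* height= */ 480,
        /* width= */ 640);
  }

  public static void main(String[] args) {
    TensorBuffer boxes = TensorBuffer.createFixedSize(new int[] {1, 4}, DataType.FLOAT32);
    boxes.loadArray(new float[] {0.25f, 0.25f, 0.75f, 0.75f}, new int[] {1, 4});
    // Expected single box: RectF(160.0, 120.0, 480.0, 360.0)
    System.out.println(toPixelBoxes(boxes));
  }
}
```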
- */ -class ImageConversions { - - /** - * Converts an Image in a TensorBuffer to a Bitmap, whose memory is already allocated. - * - *

Notice: We only support ARGB_8888 at this point. - * - * @param buffer The TensorBuffer object representing the image. It should be an UInt8 buffer with - * 3 dimensions: width, height, channel. Size of each dimension should be positive and the - * size of channels should be 3 (representing R, G, B). An optional 4th dimension "batch" is - * acceptable, and dimensions look like: batch, width, height, channel. In this case, size of - * batches should be 1. - * @param bitmap The destination of the conversion. Needs to be created in advance, needs to be - * mutable, and needs to have the same width and height with the buffer. - * @throws IllegalArgumentException 1) if the {@code buffer} is not uint8 (e.g. a float buffer), - * or has an invalid shape. 2) if the {@code bitmap} is not mutable. 3) if the {@code bitmap} - * has different height or width with the buffer. - */ - static void convertTensorBufferToBitmap(TensorBuffer buffer, Bitmap bitmap) { - if (buffer.getDataType() != DataType.UINT8) { - // We will add support to FLOAT format conversion in the future, as it may need other configs. - throw new UnsupportedOperationException( - String.format( - "Converting TensorBuffer of type %s to ARGB_8888 Bitmap is not supported yet.", - buffer.getDataType())); - } - int[] shape = buffer.getShape(); - TensorImage.checkImageTensorShape(shape); - int h = shape[shape.length - 3]; - int w = shape[shape.length - 2]; - if (bitmap.getWidth() != w || bitmap.getHeight() != h) { - throw new IllegalArgumentException(String.format( - "Given bitmap has different width or height %s with the expected ones %s.", - Arrays.toString(new int[]{bitmap.getWidth(), bitmap.getHeight()}), - Arrays.toString(new int[]{w, h}))); - } - if (!bitmap.isMutable()) { - throw new IllegalArgumentException("Given bitmap is not mutable"); - } - // TODO(b/138904567): Find a way to avoid creating multiple intermediate buffers every time. - int[] intValues = new int[w * h]; - int[] rgbValues = buffer.getIntArray(); - for (int i = 0, j = 0; i < intValues.length; i++) { - int r = rgbValues[j++]; - int g = rgbValues[j++]; - int b = rgbValues[j++]; - intValues[i] = Color.rgb(r, g, b); - } - bitmap.setPixels(intValues, 0, w, 0, 0, w, h); - } - - /** - * Converts an Image in a Bitmap to a TensorBuffer (3D Tensor: Width-Height-Channel) whose memory - * is already allocated, or could be dynamically allocated. - * - * @param bitmap The Bitmap object representing the image. Currently we only support ARGB_8888 - * config. - * @param buffer The destination of the conversion. Needs to be created in advance. If it's - * fixed-size, its flat size should be w*h*3. - * @throws IllegalArgumentException if the buffer is fixed-size, but the size doesn't match. - */ - static void convertBitmapToTensorBuffer(Bitmap bitmap, TensorBuffer buffer) { - int w = bitmap.getWidth(); - int h = bitmap.getHeight(); - int[] intValues = new int[w * h]; - bitmap.getPixels(intValues, 0, w, 0, 0, w, h); - // TODO(b/138904567): Find a way to avoid creating multiple intermediate buffers every time. - int[] rgbValues = new int[w * h * 3]; - for (int i = 0, j = 0; i < intValues.length; i++) { - rgbValues[j++] = ((intValues[i] >> 16) & 0xFF); - rgbValues[j++] = ((intValues[i] >> 8) & 0xFF); - rgbValues[j++] = (intValues[i] & 0xFF); - } - int[] shape = new int[] {h, w, 3}; - buffer.loadArray(rgbValues, shape); - } - - // Hide the constructor as the class is static. 
- private ImageConversions() {} -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageOperator.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageOperator.java deleted file mode 100644 index 1e546634e90..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageOperator.java +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image; - -import android.graphics.PointF; -import org.tensorflow.lite.support.common.Operator; - -/** Operates a TensorImage object. Used in ImageProcessor. */ -public interface ImageOperator extends Operator { - /** @see org.tensorflow.lite.support.common.Operator#apply(java.lang.Object) */ - @Override - TensorImage apply(TensorImage image); - - /** Computes the width of the expected output image when input image size is given. */ - int getOutputImageWidth(int inputImageHeight, int inputImageWidth); - - /** Computes the height of the expected output image when input image size is given. */ - int getOutputImageHeight(int inputImageHeight, int inputImageWidth); - - /** - * Transforms a point from coordinates system of the result image back to the one of the input - * image. - * - * @param point the point from the result coordinates system. - * @param inputImageHeight the height of input image. - * @param inputImageWidth the width of input image. - * @return the point with the coordinates from the coordinates system of the input image. - */ - PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth); -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageProcessor.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageProcessor.java deleted file mode 100644 index e1ef1309bbe..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ImageProcessor.java +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -package org.tensorflow.lite.support.image; - -import android.graphics.PointF; -import android.graphics.RectF; -import java.util.ArrayList; -import java.util.List; -import java.util.ListIterator; -import org.tensorflow.lite.support.common.Operator; -import org.tensorflow.lite.support.common.SequentialProcessor; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.image.ops.Rot90Op; -import org.tensorflow.lite.support.image.ops.TensorOperatorWrapper; - -/** - * ImageProcessor is a helper class for preprocessing and postprocessing {@link TensorImage}. It - * could transform a {@link TensorImage} to another by executing a chain of {@link ImageOperator}. - * - *

Example Usage: - * - *

- *   ImageProcessor processor = new ImageProcessor.Builder()
- *       .add(new ResizeOp(224, 224, ResizeMethod.NEAREST_NEIGHBOR))
- *       .add(new Rot90Op())
- *       .add(new NormalizeOp(127.5f, 127.5f))
- *       .build();
- *   TensorImage anotherTensorImage = processor.process(tensorImage);
- * 
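A sketch of the example above with the surrounding plumbing filled in; ResizeOp and ResizeMethod live in org.tensorflow.lite.support.image.ops and are assumed here (illustrative, not part of the patch):

```java
// Sketch: preprocess an ARGB_8888 Bitmap into a TensorImage for a 224x224 model input.
import android.graphics.Bitmap;
import org.tensorflow.lite.support.common.ops.NormalizeOp;
import org.tensorflow.lite.support.image.ImageProcessor;
import org.tensorflow.lite.support.image.TensorImage;
import org.tensorflow.lite.support.image.ops.ResizeOp;
import org.tensorflow.lite.support.image.ops.ResizeOp.ResizeMethod; // assumed nested enum
import org.tensorflow.lite.support.image.ops.Rot90Op;

final class ImageProcessorSketch {
  static TensorImage preprocess(Bitmap bitmap) {
    ImageProcessor processor =
        new ImageProcessor.Builder()
            .add(new ResizeOp(224, 224, ResizeMethod.NEAREST_NEIGHBOR))
            .add(new Rot90Op(1))                  // rotate by 90 degrees once
            .add(new NormalizeOp(127.5f, 127.5f)) // map [0, 255] to [-1, 1]
            .build();
    return processor.process(TensorImage.fromBitmap(bitmap));
  }
}
```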
- * - *

WARNING: Instances of an {@code ImageProcessor} are not thread-safe with {@link - * #updateNumberOfRotations}. Updating the number of rotations and then processing images (using - * {@link #process}) must be protected from concurrent access. It is recommended to create separate - * {@code ImageProcessor} instances for each thread. If multiple threads access a {@code - * ImageProcessor} concurrently, it must be synchronized externally. - * - * @see ImageProcessor.Builder to build a {@link ImageProcessor} instance - * @see ImageProcessor#process(TensorImage) to apply the processor on a {@link TensorImage} - */ -public class ImageProcessor extends SequentialProcessor { - private ImageProcessor(Builder builder) { - super(builder); - } - - /** - * Transforms a point from coordinates system of the result image back to the one of the input - * image. - * - * @param point the point from the result coordinates system. - * @param inputImageHeight the height of input image. - * @param inputImageWidth the width of input image. - * @return the point with the coordinates from the coordinates system of the input image. - */ - public PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth) { - List widths = new ArrayList<>(); - List heights = new ArrayList<>(); - int currentWidth = inputImageWidth; - int currentHeight = inputImageHeight; - for (Operator op : operatorList) { - widths.add(currentWidth); - heights.add(currentHeight); - ImageOperator imageOperator = (ImageOperator) op; - int newHeight = imageOperator.getOutputImageHeight(currentHeight, currentWidth); - int newWidth = imageOperator.getOutputImageWidth(currentHeight, currentWidth); - currentHeight = newHeight; - currentWidth = newWidth; - } - ListIterator> opIterator = operatorList.listIterator(operatorList.size()); - ListIterator widthIterator = widths.listIterator(widths.size()); - ListIterator heightIterator = heights.listIterator(heights.size()); - while (opIterator.hasPrevious()) { - ImageOperator imageOperator = (ImageOperator) opIterator.previous(); - int height = heightIterator.previous(); - int width = widthIterator.previous(); - point = imageOperator.inverseTransform(point, height, width); - } - return point; - } - - /** - * Transforms a rectangle from coordinates system of the result image back to the one of the input - * image. - * - * @param rect the rectangle from the result coordinates system. - * @param inputImageHeight the height of input image. - * @param inputImageWidth the width of input image. - * @return the rectangle with the coordinates from the coordinates system of the input image. - */ - public RectF inverseTransform(RectF rect, int inputImageHeight, int inputImageWidth) { - // when rotation is involved, corner order may change - top left changes to bottom right, .etc - PointF p1 = - inverseTransform(new PointF(rect.left, rect.top), inputImageHeight, inputImageWidth); - PointF p2 = - inverseTransform(new PointF(rect.right, rect.bottom), inputImageHeight, inputImageWidth); - return new RectF( - Math.min(p1.x, p2.x), Math.min(p1.y, p2.y), Math.max(p1.x, p2.x), Math.max(p1.y, p2.y)); - } - - /** - * The Builder to create an ImageProcessor, which could be executed later. 
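A sketch of mapping a detection box from the processed image back to the original image's coordinate system with the inverseTransform overloads above (illustrative, not part of the patch):

```java
// Sketch: map a box produced on the processed (e.g. 224x224) image back onto the input image.
import android.graphics.RectF;
import org.tensorflow.lite.support.image.ImageProcessor;

final class InverseTransformSketch {
  // boxOnProcessedImage comes from a detector that ran on processor.process(...)'s output;
  // originalHeight/originalWidth describe the image that was fed into the processor.
  static RectF mapBackToInput(
      ImageProcessor processor, RectF boxOnProcessedImage, int originalHeight, int originalWidth) {
    return processor.inverseTransform(boxOnProcessedImage, originalHeight, originalWidth);
  }
}
```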
- * - * @see #add(TensorOperator) to add a general TensorOperator - * @see #add(ImageOperator) to add an ImageOperator - * @see #build() complete the building process and get a built Processor - */ - public static class Builder extends SequentialProcessor.Builder { - public Builder() { - super(); - } - - /** - * Adds an {@link ImageOperator} into the Operator chain. - * - * @param op the Operator instance to be executed then - */ - public Builder add(ImageOperator op) { - super.add(op); - return this; - } - - /** - * Adds a {@link TensorOperator} into the Operator chain. In execution, the processor calls - * {@link TensorImage#getTensorBuffer()} to transform the {@link TensorImage} by transforming - * the underlying {@link org.tensorflow.lite.support.tensorbuffer.TensorBuffer}. - * - * @param op the Operator instance to be executed then - */ - public Builder add(TensorOperator op) { - return add(new TensorOperatorWrapper(op)); - } - - /** Completes the building process and gets the {@link ImageProcessor} instance. */ - @Override - public ImageProcessor build() { - return new ImageProcessor(this); - } - } - - /** - * Updates the number of rotations for the first {@link Rot90Op} in this {@link ImageProcessor}. - * - *

WARNING: this method is not thread-safe. Updating the number of rotations and - * then processing images (using {@link #process}) must be protected from concurrent access with - * additional synchronization. - * - * @param k the number of rotations - * @throws IllegalStateException if {@link Rot90Op} has not been added to this {@link - * ImageProcessor} - */ - public void updateNumberOfRotations(int k) { - updateNumberOfRotations(k, /*occurrence=*/ 0); - } - - /** - * Updates the number of rotations for the {@link Rot90Op} specified by {@code occurrence} in this - * {@link ImageProcessor}. - * - *

WARNING:this method is not thread-safe. Updating the number of rotations and - * then processing images (using {@link #process}) must be protected from concurrent access with - * additional synchronization. - * - * @param k the number of rotations - * @param occurrence the index of perticular {@link Rot90Op} in this {@link ImageProcessor}. For - * example, if the second {@link Rot90Op} needs to be updated, {@code occurrence} should be - * set to 1. - * @throws IndexOutOfBoundsException if {@code occurrence} is negative or is not less than the - * number of {@link Rot90Op} in this {@link ImageProcessor} - * @throws IllegalStateException if {@link Rot90Op} has not been added to this {@link - * ImageProcessor} - */ - public synchronized void updateNumberOfRotations(int k, int occurrence) { - SupportPreconditions.checkState( - operatorIndex.containsKey(Rot90Op.class.getName()), - "The Rot90Op has not been added to the ImageProcessor."); - - List indexes = operatorIndex.get(Rot90Op.class.getName()); - SupportPreconditions.checkElementIndex(occurrence, indexes.size(), "occurrence"); - - // The index of the Rot90Op to be replaced in operatorList. - int index = indexes.get(occurrence); - Rot90Op newRot = new Rot90Op(k); - operatorList.set(index, newRot); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java deleted file mode 100644 index bced23e6f67..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java +++ /dev/null @@ -1,381 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image; - -import android.graphics.Bitmap; -import android.graphics.Bitmap.Config; -import java.nio.ByteBuffer; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * TensorImage is the wrapper class for Image object. When using image processing utils in - * TFLite.support library, it's common to convert image objects in variant types to TensorImage at - * first. - * - *

At present, only RGB images are supported, and the A channel is always ignored. - * - *

Details of data storage: a {@link TensorImage} object may have 2 potential sources of truth: a - * {@link Bitmap} or a {@link TensorBuffer}. {@link TensorImage} maintains the state and only - * converts one to the other when needed. - * - *

IMPORTANT: The container doesn't own its data. Callers should not modify data objects that - * are passed to {@link ImageContainer#set(Bitmap)} or {@link ImageContainer#set(TensorBuffer)}. - * - *

IMPORTANT: All methods are not proved thread-safe. - * - * @see ImageProcessor which is often used for transforming a {@link TensorImage}. - */ -// TODO(b/138906681): Support basic Image properties (ColorType, DataType) -// TODO(b/138907116): Support loading images from TensorBuffer with properties. -// TODO(b/138905544): Support directly loading RGBBytes, YUVBytes and other types if necessary. -public class TensorImage { - - private final ImageContainer container; - - /** - * Initialize a TensorImage object. - * - * Note: The data type of this TensorImage is UINT8, which means it could naturally accept Bitmaps - * whose pixel value range is [0, 255]. However, any image with float value pixels will not be - * loaded correctly. In those cases, please use {@link TensorImage(DataType)}. - */ - public TensorImage() { - this(DataType.UINT8); - } - - /** - * Initializes a TensorImage object with data type specified. - * - *

Note: The shape of a TensorImage is not fixed. It is determined when {@code load} methods - * called, and could be change later. - * - * @param dataType the expected internal data type of underlying tensor. The type is always fixed - * during the lifetime of the {@link TensorImage}. To convert the data type, use {@link - * TensorImage#createFrom(TensorImage, DataType)} to create a copy and convert data type at - * the same time. - * @throws IllegalArgumentException if {@code dataType} is neither {@link DataType#UINT8} nor - * {@link DataType#FLOAT32}. - */ - public TensorImage(DataType dataType) { - SupportPreconditions.checkArgument( - dataType == DataType.UINT8 || dataType == DataType.FLOAT32, - "Illegal data type for TensorImage: Only FLOAT32 and UINT8 are accepted"); - container = new ImageContainer(dataType); - } - - /** - * Initializes a {@link TensorImage} object with a {@link Bitmap}. - * - * @see TensorImage#load(Bitmap) for reusing the object when it's expensive to create objects - * frequently, because every call of {@code fromBitmap} creates a new {@link TensorImage}. - */ - public static TensorImage fromBitmap(Bitmap bitmap) { - TensorImage image = new TensorImage(); - image.load(bitmap); - return image; - } - - /** - * Creates a deep-copy of a given {@link TensorImage} and converts internal tensor data type. - * - *

If the given {@code dataType} is different with {@code src.getDataType()}, an implicit data - * conversion will be applied. Converting data from {@link DataType#FLOAT32} to {@link - * DataType#UINT8} may involve default float->int conversion and value clamping, because {@link - * DataType#UINT8} stores value from 0 to 255 (inclusively). - * - * @param src the TensorImage to copy from. - * @param dataType the expected data type of newly created {@link TensorImage}. - * @return a TensorImage whose data is copied from {@code src} and data type is {@code dataType}. - */ - @NonNull - public static TensorImage createFrom(@NonNull TensorImage src, DataType dataType) { - TensorImage dst = new TensorImage(dataType); - if (src.container.isBufferUpdated) { - dst.container.set(TensorBuffer.createFrom(src.getTensorBuffer(), dataType)); - } else if (src.container.isBitmapUpdated) { - Bitmap srcBitmap = src.getBitmap(); - dst.container.set(srcBitmap.copy(srcBitmap.getConfig(), srcBitmap.isMutable())); - } - return dst; - } - - /** - * Loads a Bitmap image object into TensorImage. - * - * Important: When loading a bitmap, DO NOT MODIFY the bitmap from the caller side anymore. The - * {@code TensorImage} object will rely on the bitmap. It will probably modify the bitmap as well. - * In this method, we perform a zero-copy approach for that bitmap, by simply holding its - * reference. Use {@code bitmap.copy(bitmap.getConfig(), true)} to create a copy if necessary. - * - * Note: To get the best performance, please load images in the same shape to avoid memory - * re-allocation. - * - * @throws IllegalArgumentException if {@code bitmap} is not in ARGB_8888. - */ - public void load(@NonNull Bitmap bitmap) { - SupportPreconditions.checkNotNull(bitmap, "Cannot load null bitmap."); - SupportPreconditions.checkArgument( - bitmap.getConfig().equals(Config.ARGB_8888), "Only supports loading ARGB_8888 bitmaps."); - container.set(bitmap); - } - - /** - * Loads a float array as RGB pixels into TensorImage, representing the pixels inside. - * - *
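A sketch of the loading paths described above: wrapping a Bitmap without a copy, deep-copying to FLOAT32 with createFrom, and loading raw float pixels directly (illustrative, Android-only, not part of the patch):

```java
// Sketch: the common ways to get data into a TensorImage.
import android.graphics.Bitmap;
import org.tensorflow.lite.DataType;
import org.tensorflow.lite.support.image.TensorImage;

final class TensorImageSketch {
  static TensorImage fromCamera(Bitmap frame) {
    TensorImage uint8Image = TensorImage.fromBitmap(frame);       // zero-copy wrapper
    return TensorImage.createFrom(uint8Image, DataType.FLOAT32);  // deep copy as float
  }

  static TensorImage fromFloatPixels(float[] rgbPixels, int height, int width) {
    TensorImage image = new TensorImage(DataType.FLOAT32);
    image.load(rgbPixels, new int[] {height, width, 3});          // shape (h, w, 3)
    return image;
  }
}
```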
Note: If the TensorImage has data type {@link DataType#UINT8}, numeric casting and clamping - * will be applied. - * - * @param pixels The RGB pixels representing the image. - * @param shape The shape of the image, should be either in form (h, w, 3) or in form (1, h, w, 3). - */ - public void load(@NonNull float[] pixels, @NonNull int[] shape) { - checkImageTensorShape(shape); - TensorBuffer buffer = TensorBuffer.createDynamic(getDataType()); - buffer.loadArray(pixels, shape); - load(buffer); - } - - /** - * Loads a uint8 array as RGB pixels into TensorImage, representing the pixels inside. - * - *
Note: If the TensorImage has data type {@link DataType#UINT8}, all pixel values will be clamped - * into [0, 255]. - * - * @param pixels The RGB pixels representing the image. - * @param shape The shape of the image, should be either in form (h, w, 3) or in form (1, h, w, 3). - */ - public void load(@NonNull int[] pixels, @NonNull int[] shape) { - checkImageTensorShape(shape); - TensorBuffer buffer = TensorBuffer.createDynamic(getDataType()); - buffer.loadArray(pixels, shape); - load(buffer); - } - - /** - * Loads a TensorBuffer containing pixel values. The color layout should be RGB. - * - * @param buffer The TensorBuffer to load. Its shape should be either (h, w, 3) or (1, h, w, 3). - */ - public void load(TensorBuffer buffer) { - checkImageTensorShape(buffer.getShape()); - container.set(buffer); - } - - /** - * Returns a bitmap representation of this TensorImage. - * - *
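As a concrete sketch of the array-loading methods above (the pixel values and the 1x2 image size are made up for illustration):

    // A 1x2 RGB image as a flat int array in [0, 255], with shape (h, w, 3) = (1, 2, 3).
    int[] pixels = {255, 0, 0, 0, 255, 0};
    TensorImage image = new TensorImage(DataType.UINT8);
    image.load(pixels, new int[] {1, 2, 3});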
Important: It's only a reference. DO NOT MODIFY. We don't create a copy here for performance - * reasons, but if modification is necessary, please make a copy. - * - * @return a reference to a Bitmap in ARGB_8888 config. "A" channel is always opaque. - * @throws IllegalStateException if the TensorImage never loads data, or if the TensorImage is - * holding a float-value image in {@code TensorBuffer}. - */ - @NonNull - public Bitmap getBitmap() { - return container.getBitmap(); - } - - /** - * Returns a ByteBuffer representation of this TensorImage. - * - *
Important: It's only a reference. DO NOT MODIFY. We don't create a copy here for performance - * reasons, but if modification is necessary, please make a copy. - * - *
It's essentially a shortcut for {@code getTensorBuffer().getBuffer()}. - * - * @return a reference to a ByteBuffer which holds the image data. - * @throws IllegalStateException if the TensorImage never loads data. - */ - @NonNull - public ByteBuffer getBuffer() { - return container.getTensorBuffer().getBuffer(); - } - - /** - * Returns a TensorBuffer representation of this TensorImage. - * - *
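Continuing the earlier sketches, where image already holds pixel data, the two accessors relate as follows:

    TensorBuffer tensor = image.getTensorBuffer();  // shared reference, do not modify
    ByteBuffer raw = image.getBuffer();             // same as tensor.getBuffer()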
Important: It's only a reference. DO NOT MODIFY. We don't create a copy here for performance - * concern, but if modification is necessary, please make a copy. - * - * @return a reference to a TensorBuffer which holds the image data. - * @throws IllegalStateException if the TensorImage never loads data. - */ - @NonNull - public TensorBuffer getTensorBuffer() { - return container.getTensorBuffer(); - } - - /** - * Gets the current data type. - * - * @return a data type. Currently only UINT8 and FLOAT32 are possible. - */ - public DataType getDataType() { - return container.getDataType(); - } - - /** - * Gets the image width. - * - * @throws IllegalStateException if the TensorImage never loads data. - * @throws IllegalArgumentException if the container data is corrupted. - */ - public int getWidth() { - return container.getWidth(); - } - - /** - * Gets the image height. - * - * @throws IllegalStateException if the TensorImage never loads data. - * @throws IllegalArgumentException if the container data is corrupted. - */ - public int getHeight() { - return container.getHeight(); - } - - // Requires tensor shape [h, w, 3] or [1, h, w, 3]. - static void checkImageTensorShape(int[] shape) { - SupportPreconditions.checkArgument( - (shape.length == 3 || (shape.length == 4 && shape[0] == 1)) - && shape[shape.length - 3] > 0 - && shape[shape.length - 2] > 0 - && shape[shape.length - 1] == 3, - "Only supports image shape in (h, w, c) or (1, h, w, c), and channels representing R, G, B" - + " in order."); - } - - // Handles RGB image data storage strategy of TensorBuffer. - private static class ImageContainer { - - private TensorBuffer bufferImage; - private boolean isBufferUpdated; - private Bitmap bitmapImage; - private boolean isBitmapUpdated; - - private final DataType dataType; - - private static final int ARGB_8888_ELEMENT_BYTES = 4; - - ImageContainer(DataType dataType) { - this.dataType = dataType; - } - - // Internal method to set the image source-of-truth with a bitmap. The bitmap has to be - // ARGB_8888. - void set(Bitmap bitmap) { - bitmapImage = bitmap; - isBufferUpdated = false; - isBitmapUpdated = true; - } - - // Internal method to set the image source-of-truth with a TensorBuffer. - void set(TensorBuffer buffer) { - bufferImage = buffer; - isBitmapUpdated = false; - isBufferUpdated = true; - } - - int getWidth() { - SupportPreconditions.checkState( - isBitmapUpdated || isBufferUpdated, - "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); - if (isBitmapUpdated) { - return bitmapImage.getWidth(); - } - return getBufferDimensionSize(-2); - } - - int getHeight() { - SupportPreconditions.checkState( - isBitmapUpdated || isBufferUpdated, - "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); - if (isBitmapUpdated) { - return bitmapImage.getHeight(); - } - return getBufferDimensionSize(-3); - } - - // Internal helper method to get the size of one dimension in the shape of the `bufferImage`. - // Requires `isBufferUpdated` is true. - // Throws `IllegalArgumentException` if data is corrupted. 
- private int getBufferDimensionSize(int dim) { - int[] shape = bufferImage.getShape(); - // The defensive check is needed because bufferImage might be invalidly changed by user - // (a.k.a internal data is corrupted) - TensorImage.checkImageTensorShape(shape); - dim = dim % shape.length; - if (dim < 0) { - dim += shape.length; - } - return shape[dim]; - } - - public DataType getDataType() { - return dataType; - } - - // Internal method to update the internal Bitmap data by TensorBuffer data. - @NonNull - Bitmap getBitmap() { - if (isBitmapUpdated) { - return bitmapImage; - } - if (!isBufferUpdated) { - throw new IllegalStateException( - "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); - } - if (bufferImage.getDataType() != DataType.UINT8) { - throw new IllegalStateException( - "TensorImage is holding a float-value image which is not able to convert a Bitmap."); - } - int requiredAllocation = bufferImage.getFlatSize() * ARGB_8888_ELEMENT_BYTES; - // Create a new bitmap and reallocate memory for it. - if (bitmapImage == null || bitmapImage.getAllocationByteCount() < requiredAllocation) { - int[] shape = bufferImage.getShape(); - int h = shape[shape.length - 3]; - int w = shape[shape.length - 2]; - bitmapImage = Bitmap.createBitmap(w, h, Config.ARGB_8888); - } - ImageConversions.convertTensorBufferToBitmap(bufferImage, bitmapImage); - isBitmapUpdated = true; - return bitmapImage; - } - - // Internal method to update the internal TensorBuffer data by Bitmap data. - @NonNull - TensorBuffer getTensorBuffer() { - if (isBufferUpdated) { - return bufferImage; - } - SupportPreconditions.checkArgument( - isBitmapUpdated, - "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); - int requiredFlatSize = bitmapImage.getWidth() * bitmapImage.getHeight() * 3; - if (bufferImage == null - || (!bufferImage.isDynamic() && bufferImage.getFlatSize() != requiredFlatSize)) { - bufferImage = TensorBuffer.createDynamic(dataType); - } - ImageConversions.convertBitmapToTensorBuffer(bitmapImage, bufferImage); - isBufferUpdated = true; - return bufferImage; - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeOp.java deleted file mode 100644 index 35606dd66d3..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeOp.java +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -package org.tensorflow.lite.support.image.ops; - -import android.graphics.Bitmap; -import android.graphics.PointF; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.image.ImageOperator; -import org.tensorflow.lite.support.image.TensorImage; - -/** - * As a computation unit for processing images, it can resize an image to user-specified size. - * - *
It interpolates pixels when image is stretched, and discards pixels when image is compressed. - * - * @see ResizeWithCropOrPadOp for resizing without content distortion. - */ -public class ResizeOp implements ImageOperator { - - /** Algorithms for resizing. */ - public enum ResizeMethod { - BILINEAR, - NEAREST_NEIGHBOR - } - - private final int targetHeight; - private final int targetWidth; - private final boolean useBilinear; - - /** - * Creates a ResizeOp which can resize images to specified size in specified method. - * - * @param targetHeight: The expected height of resized image. - * @param targetWidth: The expected width of resized image. - * @param resizeMethod: The algorithm to use for resizing. Options: {@link ResizeMethod} - */ - public ResizeOp(int targetHeight, int targetWidth, ResizeMethod resizeMethod) { - this.targetHeight = targetHeight; - this.targetWidth = targetWidth; - useBilinear = (resizeMethod == ResizeMethod.BILINEAR); - } - - /** - * Applies the defined resizing on given image and returns the result. - * - *
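For example, a hedged usage sketch of ResizeOp; the 224x224 target size is only a common model input size, not something this class prescribes, and image is the TensorImage from the earlier sketches:

    ImageOperator resize = new ResizeOp(224, 224, ResizeOp.ResizeMethod.BILINEAR);
    TensorImage resized = resize.apply(image);  // modifies and returns the same instance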
Note: the content of input {@code image} will change, and {@code image} is the same instance - * with the output. - * - * @param image input image. - * @return output image. - */ - @Override - @NonNull - public TensorImage apply(@NonNull TensorImage image) { - Bitmap scaled = - Bitmap.createScaledBitmap(image.getBitmap(), targetWidth, targetHeight, useBilinear); - image.load(scaled); - return image; - } - - @Override - public int getOutputImageHeight(int inputImageHeight, int inputImageWidth) { - return targetHeight; - } - - @Override - public int getOutputImageWidth(int inputImageHeight, int inputImageWidth) { - return targetWidth; - } - - @Override - public PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth) { - return new PointF( - point.x * inputImageWidth / targetWidth, point.y * inputImageHeight / targetHeight); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeWithCropOrPadOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeWithCropOrPadOp.java deleted file mode 100644 index 404429efa06..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/ResizeWithCropOrPadOp.java +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image.ops; - -import android.graphics.Bitmap; -import android.graphics.Bitmap.Config; -import android.graphics.Canvas; -import android.graphics.PointF; -import android.graphics.Rect; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.image.ImageOperator; -import org.tensorflow.lite.support.image.TensorImage; - -/** - * As a computation unit for processing images, it could resize image to predefined size. - * - *
It will not stretch or compress the content of the image. However, to fit the new size, it crops - * or pads pixels. When it crops an image, it performs a center-crop; when it pads pixels, it performs - * a zero-padding. - * - * @see ResizeOp for resizing images while stretching / compressing the content. - */ -public class ResizeWithCropOrPadOp implements ImageOperator { - private final int targetHeight; - private final int targetWidth; - private final Bitmap output; - - /** - * Creates a ResizeWithCropOrPadOp which could crop/pad images to a specified size. It adopts - * center-crop and zero-padding. - * - * @param targetHeight: The expected height of cropped/padded image. - * @param targetWidth: The expected width of cropped/padded image. - */ - public ResizeWithCropOrPadOp(int targetHeight, int targetWidth) { - this.targetHeight = targetHeight; - this.targetWidth = targetWidth; - output = Bitmap.createBitmap(this.targetWidth, this.targetHeight, Config.ARGB_8888); - } - - /** - * Applies the defined resizing with cropping and/or padding on given image and returns the - * result. - * - *
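A small sketch of the center-crop/zero-pad behavior just described (the input and target sizes are hypothetical):

    // A 640x480 input is center-cropped to 320x320; a 200x200 input is zero-padded instead.
    ImageOperator cropOrPad = new ResizeWithCropOrPadOp(320, 320);
    TensorImage squared = cropOrPad.apply(image);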
Note: the content of input {@code image} will change, and {@code image} is the same instance - * with the output. - * - * @param image input image. - * @return output image. - */ - @Override - @NonNull - public TensorImage apply(@NonNull TensorImage image) { - Bitmap input = image.getBitmap(); - int srcL; - int srcR; - int srcT; - int srcB; - int dstL; - int dstR; - int dstT; - int dstB; - int w = input.getWidth(); - int h = input.getHeight(); - if (targetWidth > w) { // padding - srcL = 0; - srcR = w; - dstL = (targetWidth - w) / 2; - dstR = dstL + w; - } else { // cropping - dstL = 0; - dstR = targetWidth; - srcL = (w - targetWidth) / 2; - srcR = srcL + targetWidth; - } - if (targetHeight > h) { // padding - srcT = 0; - srcB = h; - dstT = (targetHeight - h) / 2; - dstB = dstT + h; - } else { // cropping - dstT = 0; - dstB = targetHeight; - srcT = (h - targetHeight) / 2; - srcB = srcT + targetHeight; - } - Rect src = new Rect(srcL, srcT, srcR, srcB); - Rect dst = new Rect(dstL, dstT, dstR, dstB); - new Canvas(output).drawBitmap(input, src, dst, null); - image.load(output); - return image; - } - - @Override - public int getOutputImageHeight(int inputImageHeight, int inputImageWidth) { - return targetHeight; - } - - @Override - public int getOutputImageWidth(int inputImageHeight, int inputImageWidth) { - return targetWidth; - } - - @Override - public PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth) { - return transformImpl(point, targetHeight, targetWidth, inputImageHeight, inputImageWidth); - } - - private static PointF transformImpl(PointF point, int srcH, int srcW, int dstH, int dstW) { - return new PointF(point.x + (dstW - srcW) / 2, point.y + (dstH - srcH) / 2); - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/Rot90Op.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/Rot90Op.java deleted file mode 100644 index 2fa2293763c..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/Rot90Op.java +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image.ops; - -import android.graphics.Bitmap; -import android.graphics.Matrix; -import android.graphics.PointF; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.image.ImageOperator; -import org.tensorflow.lite.support.image.TensorImage; - -/** Rotates image counter-clockwise. */ -public class Rot90Op implements ImageOperator { - - private final int numRotation; - - /** Creates a Rot90 Op which will rotate image by 90 degree counter-clockwise. */ - public Rot90Op() { - this(1); - } - - /** - * Creates a Rot90 Op which will rotate image by 90 degree for {@code k} times counter-clockwise. 
- * - * @param k: The number of times the image is rotated by 90 degrees. If it's positive, the image - * will be rotated counter-clockwise. If it's negative, the op will rotate image clockwise. - */ - public Rot90Op(int k) { - numRotation = k % 4; - } - - /** - * Applies the defined rotation on given image and returns the result. - * - *
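For instance, assuming a TensorImage named image as in the earlier sketches:

    // One 90-degree counter-clockwise turn; Rot90Op(3) would amount to one clockwise turn.
    TensorImage rotated = new Rot90Op(1).apply(image);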
Note: the content of input {@code image} will change, and {@code image} is the same instance - * with the output. - * - * @param image input image. - * @return output image. - */ - @NonNull - @Override - public TensorImage apply(@NonNull TensorImage image) { - Bitmap input = image.getBitmap(); - if (numRotation == 0) { - return image; - } - int w = input.getWidth(); - int h = input.getHeight(); - Matrix matrix = new Matrix(); - matrix.postTranslate(w * 0.5f, h * 0.5f); - matrix.postRotate(-90 * numRotation); - int newW = (numRotation % 2 == 0) ? w : h; - int newH = (numRotation % 2 == 0) ? h : w; - matrix.postTranslate(newW * 0.5f, newH * 0.5f); - Bitmap output = Bitmap.createBitmap(input, 0, 0, w, h, matrix, false); - image.load(output); - return image; - } - - @Override - public int getOutputImageHeight(int inputImageHeight, int inputImageWidth) { - return (numRotation % 2 == 0) ? inputImageHeight : inputImageWidth; - } - - @Override - public int getOutputImageWidth(int inputImageHeight, int inputImageWidth) { - return (numRotation % 2 == 0) ? inputImageWidth : inputImageHeight; - } - - @Override - public PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth) { - int inverseNumRotation = (4 - numRotation) % 4; - int height = getOutputImageHeight(inputImageHeight, inputImageWidth); - int width = getOutputImageWidth(inputImageHeight, inputImageWidth); - return transformImpl(point, height, width, inverseNumRotation); - } - - private static PointF transformImpl(PointF point, int height, int width, int numRotation) { - if (numRotation == 0) { - return point; - } else if (numRotation == 1) { - return new PointF(point.y, width - point.x); - } else if (numRotation == 2) { - return new PointF(width - point.x, height - point.y); - } else { // numRotation == 3 - return new PointF(height - point.y, point.x); - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/TensorOperatorWrapper.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/TensorOperatorWrapper.java deleted file mode 100644 index 75ccdac9b83..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/ops/TensorOperatorWrapper.java +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.image.ops; - -import android.graphics.PointF; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.common.TensorOperator; -import org.tensorflow.lite.support.image.ImageOperator; -import org.tensorflow.lite.support.image.TensorImage; - -/** - * The adapter that makes a TensorOperator able to run with TensorImage. 
- * - * @see org.tensorflow.lite.support.common.TensorOperator - * @see org.tensorflow.lite.support.image.TensorImage - */ -public class TensorOperatorWrapper implements ImageOperator { - - private final TensorOperator tensorOp; - - /** - * Wraps a {@link TensorOperator} object as an {@link ImageOperator}, so that the {@link - * TensorOperator} could handle {@link TensorImage} objects by handling its underlying {@link - * org.tensorflow.lite.support.tensorbuffer.TensorBuffer}. - * - *
Requirement: The {@code op} should not change coordinate system when applied on an image. - * - * @param op The created operator. - */ - public TensorOperatorWrapper(TensorOperator op) { - tensorOp = op; - } - - @Override - @NonNull - public TensorImage apply(@NonNull TensorImage image) { - SupportPreconditions.checkNotNull(image, "Op cannot apply on null image."); - image.load(tensorOp.apply(image.getTensorBuffer())); - return image; - } - - @Override - public int getOutputImageHeight(int inputImageHeight, int inputImageWidth) { - return inputImageHeight; - } - - @Override - public int getOutputImageWidth(int inputImageHeight, int inputImageWidth) { - return inputImageWidth; - } - - @Override - public PointF inverseTransform(PointF point, int inputImageHeight, int inputImageWidth) { - return point; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/Category.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/Category.java deleted file mode 100644 index ea369c3ac12..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/Category.java +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.label; - -import java.util.Objects; - -/** - * Category is a util class, contains a label and a float value. Typically it's used as result of - * classification tasks. - */ -public final class Category { - private final String label; - private final float score; - - /** Constructs a Category. */ - public Category(String label, float score) { - this.label = label; - this.score = score; - } - - /** Gets the reference of category's label. */ - public String getLabel() { - return label; - } - - /** Gets the score of the category. */ - public float getScore() { - return score; - } - - @Override - public boolean equals(Object o) { - if (o instanceof Category) { - Category other = (Category) o; - return (other.getLabel().equals(this.label) && other.getScore() == this.score); - } - return false; - } - - @Override - public int hashCode() { - return Objects.hash(label, score); - } - - @Override - public String toString() { - return ""; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/LabelUtil.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/LabelUtil.java deleted file mode 100644 index 840ed5fb77d..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/LabelUtil.java +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.label; - -import android.util.Log; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** Label operation utils. */ -public class LabelUtil { - /** - * Maps an int value tensor to a list of string labels. It takes an array of strings as the - * dictionary. Example: if the given tensor is [3, 1, 0], and given labels is ["background", - * "apple", "banana", "cherry", "date"], the result will be ["date", "banana", "apple"]. - * - * @param tensorBuffer: A tensor with index values. The values should be non-negative integers, - * and each value {@code x} will be converted to {@code labels[x + offset]}. If the tensor is - * given as a float {@link TensorBuffer}, values will be cast to integers. All values that are - * out of bound will map to empty string. - * @param labels: A list of strings, used as a dictionary to look up. The index of the array - * element will be used as the key. To get better performance, use an object that implements - * RandomAccess, such as {@link ArrayList}. - * @param offset: The offset value when look up int values in the {@code labels}. - * @return the mapped strings. The length of the list is {@link TensorBuffer#getFlatSize}. - * @throws IllegalArgumentException if {@code tensorBuffer} or {@code labels} is null. - */ - public static List mapValueToLabels( - @NonNull TensorBuffer tensorBuffer, @NonNull List labels, int offset) { - SupportPreconditions.checkNotNull(tensorBuffer, "Given tensor should not be null"); - SupportPreconditions.checkNotNull(labels, "Given labels should not be null"); - int[] values = tensorBuffer.getIntArray(); - Log.d("values", Arrays.toString(values)); - List result = new ArrayList<>(); - for (int v : values) { - int index = v + offset; - if (index < 0 || index >= labels.size()) { - result.add(""); - } else { - result.add(labels.get(index)); - } - } - return result; - } - - // Private constructor to prevent initialization. - private LabelUtil() {} -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/TensorLabel.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/TensorLabel.java deleted file mode 100644 index 10763a1a065..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/TensorLabel.java +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.label; - -import android.content.Context; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * TensorLabel is an util wrapper for TensorBuffers with meaningful labels on an axis. - * - *
For example, an image classification model may have an output tensor with shape {1, 10}, - * where 1 is the batch size and 10 is the number of categories. In fact, on the 2nd axis, we could - * label each sub-tensor with the name or description of each corresponding category. {@link - * TensorLabel} could help convert the plain Tensor in {@link TensorBuffer} into a map from - * predefined labels to sub-tensors. In this case, if 10 labels are provided for the 2nd axis, {@link - * TensorLabel} could convert the original {1, 10} Tensor to a 10-element map, each value of which - * is a Tensor in shape {} (scalar). Usage example: - *
- *   TensorBuffer outputTensor = ...;
- *   {@literal List<String>} labels = FileUtil.loadLabels(context, labelFilePath);
- *   // labels the first axis with size greater than one
- *   TensorLabel labeled = new TensorLabel(labels, outputTensor);
- *   // If each sub-tensor has effectively size 1, we can directly get a float value
- *   {@literal Map<String, Float>} probabilities = labeled.getMapWithFloatValue();
- *   // Or get sub-tensors, when each sub-tensor has elements more than 1
- *   {@literal Map<String, TensorBuffer>} subTensors = labeled.getMapWithTensorBuffer();
- * 
- * - *
Note: currently we only support tensor-to-map conversion for the first label with size greater - * than 1. - * - * @see org.tensorflow.lite.support.common.FileUtil#loadLabels(Context, String) to load labels from - * a label file (plain text file whose each line is a label) in assets simply. - */ -public class TensorLabel { - private final Map> axisLabels; - private final TensorBuffer tensorBuffer; - private final int[] shape; - - /** - * Creates a TensorLabel object which is able to label on the axes of multi-dimensional tensors. - * - * @param axisLabels A map, whose key is axis id (starting from 0) and value is corresponding - * labels. Note: The size of labels should be same with the size of the tensor on that axis. - * @param tensorBuffer The TensorBuffer to be labeled. - * @throws NullPointerException if {@code axisLabels} or {@code tensorBuffer} is null, or any - * value in {@code axisLabels} is null. - * @throws IllegalArgumentException if any key in {@code axisLabels} is out of range (compared to - * the shape of {@code tensorBuffer}, or any value (labels) has different size with the {@code - * tensorBuffer} on the given dimension. - */ - public TensorLabel( - @NonNull Map> axisLabels, @NonNull TensorBuffer tensorBuffer) { - SupportPreconditions.checkNotNull(axisLabels, "Axis labels cannot be null."); - SupportPreconditions.checkNotNull(tensorBuffer, "Tensor Buffer cannot be null."); - this.axisLabels = axisLabels; - this.tensorBuffer = tensorBuffer; - this.shape = tensorBuffer.getShape(); - for (Map.Entry> entry : axisLabels.entrySet()) { - int axis = entry.getKey(); - SupportPreconditions.checkArgument( - axis >= 0 && axis < shape.length, "Invalid axis id: " + axis); - SupportPreconditions.checkNotNull(entry.getValue(), "Label list is null on axis " + axis); - SupportPreconditions.checkArgument( - shape[axis] == entry.getValue().size(), - "Label number " + entry.getValue().size() + " mismatch the shape on axis " + axis); - } - } - - /** - * Creates a TensorLabel object which is able to label on one axis of multi-dimensional tensors. - * - *
Note: The labels are applied on the first axis whose size is larger than 1. For example, if - * the shape of the tensor is [1, 10, 3], the labels will be applied on axis 1 (id starting from - * 0), and size of {@code axisLabels} should be 10 as well. - * - * @param axisLabels A list of labels, whose size should be same with the size of the tensor on - * the to-be-labeled axis. - * @param tensorBuffer The TensorBuffer to be labeled. - */ - public TensorLabel(@NonNull List axisLabels, @NonNull TensorBuffer tensorBuffer) { - this(makeMap(getFirstAxisWithSizeGreaterThanOne(tensorBuffer), axisLabels), tensorBuffer); - } - - /** - * Gets the map with a pair of the label and the corresponding TensorBuffer. Only allow the - * mapping on the first axis with size greater than 1 currently. - */ - @NonNull - public Map getMapWithTensorBuffer() { - int labeledAxis = getFirstAxisWithSizeGreaterThanOne(tensorBuffer); - - Map labelToTensorMap = new LinkedHashMap<>(); - SupportPreconditions.checkArgument( - axisLabels.containsKey(labeledAxis), - "get a map requires the labels are set on the first non-1 axis."); - List labels = axisLabels.get(labeledAxis); - - DataType dataType = tensorBuffer.getDataType(); - int typeSize = tensorBuffer.getTypeSize(); - int flatSize = tensorBuffer.getFlatSize(); - - // Gets the underlying bytes that could be used to generate the sub-array later. - ByteBuffer byteBuffer = tensorBuffer.getBuffer(); - byteBuffer.rewind(); - - // Note: computation below is only correct when labeledAxis is the first axis with size greater - // than 1. - int subArrayLength = flatSize / shape[labeledAxis] * typeSize; - int i = 0; - SupportPreconditions.checkNotNull(labels, "Label list should never be null"); - for (String label : labels) { - // Gets the corresponding TensorBuffer. - byteBuffer.position(i * subArrayLength); - ByteBuffer subBuffer = byteBuffer.slice(); - // ByteBuffer.slice doesn't keep order. Modify it to align with the original one. - subBuffer.order(byteBuffer.order()).limit(subArrayLength); - TensorBuffer labelBuffer = TensorBuffer.createDynamic(dataType); - labelBuffer.loadBuffer(subBuffer, Arrays.copyOfRange(shape, labeledAxis + 1, shape.length)); - labelToTensorMap.put(label, labelBuffer); - i += 1; - } - return labelToTensorMap; - } - - /** - * Gets a map that maps label to float. Only allow the mapping on the first axis with size greater - * than 1, and the axis should be effectively the last axis (which means every sub tensor - * specified by this axis should have a flat size of 1). - * - *
{@link TensorLabel#getCategoryList()} is an alternative API to get the result. - * - * @throws IllegalStateException if size of a sub tensor on each label is not 1. - */ - @NonNull - public Map getMapWithFloatValue() { - int labeledAxis = getFirstAxisWithSizeGreaterThanOne(tensorBuffer); - SupportPreconditions.checkState( - labeledAxis == shape.length - 1, - "get a map is only valid when the only labeled axis is the last one."); - List labels = axisLabels.get(labeledAxis); - float[] data = tensorBuffer.getFloatArray(); - SupportPreconditions.checkState(labels.size() == data.length); - Map result = new LinkedHashMap<>(); - int i = 0; - for (String label : labels) { - result.put(label, data[i]); - i += 1; - } - return result; - } - - /** - * Gets a list of {@link Category} from the {@link TensorLabel} object. - * - *
The axis of label should be effectively the last axis (which means every sub tensor - * specified by this axis should have a flat size of 1), so that each labelled sub tensor could be - * converted into a float value score. Example: A {@link TensorLabel} with shape {@code {2, 5, 3}} - * and axis 2 is valid. If axis is 1 or 0, it cannot be converted into a {@link Category}. - * - *
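As an illustration of this constraint, a small sketch with a {1, 3} classification output (the labels, scores, and java.util imports are assumed for illustration):

    TensorBuffer output = TensorBuffer.createFixedSize(new int[] {1, 3}, DataType.FLOAT32);
    output.loadArray(new float[] {0.1f, 0.7f, 0.2f}, new int[] {1, 3});
    TensorLabel labeled = new TensorLabel(Arrays.asList("cat", "dog", "bird"), output);
    List<Category> categories = labeled.getCategoryList();  // pairs each label with its score, e.g. ("dog", 0.7f)

Here axis 1 is the first axis with size greater than 1 and is also the last axis, so the conversion is valid.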
{@link TensorLabel#getMapWithFloatValue()} is an alternative but returns a {@link Map} as - * the result. - * - * @throws IllegalStateException if size of a sub tensor on each label is not 1. - */ - @NonNull - public List getCategoryList() { - int labeledAxis = getFirstAxisWithSizeGreaterThanOne(tensorBuffer); - SupportPreconditions.checkState( - labeledAxis == shape.length - 1, - "get a Category list is only valid when the only labeled axis is the last one."); - List labels = axisLabels.get(labeledAxis); - float[] data = tensorBuffer.getFloatArray(); - SupportPreconditions.checkState(labels.size() == data.length); - List result = new ArrayList<>(); - int i = 0; - for (String label : labels) { - result.add(new Category(label, data[i])); - i += 1; - } - return result; - } - - private static int getFirstAxisWithSizeGreaterThanOne(@NonNull TensorBuffer tensorBuffer) { - int[] shape = tensorBuffer.getShape(); - for (int i = 0; i < shape.length; i++) { - if (shape[i] > 1) { - return i; - } - } - throw new IllegalArgumentException( - "Cannot find an axis to label. A valid axis to label should have size larger than 1."); - } - - // Helper function to wrap the List to a one-entry map. - private static Map> makeMap(int axis, List labels) { - Map> map = new LinkedHashMap<>(); - map.put(axis, labels); - return map; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/ops/LabelAxisOp.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/ops/LabelAxisOp.java deleted file mode 100644 index c2de8c0baad..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/label/ops/LabelAxisOp.java +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.label.ops; - -import android.content.Context; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.support.common.FileUtil; -import org.tensorflow.lite.support.common.SupportPreconditions; -import org.tensorflow.lite.support.label.TensorLabel; -import org.tensorflow.lite.support.tensorbuffer.TensorBuffer; - -/** - * Labels TensorBuffer with axisLabels for outputs. - * - *
Apply on a {@code TensorBuffer} to get a {@code TensorLabel} that could output a Map, which is - * a pair of the label name and the corresponding TensorBuffer value. - */ -public class LabelAxisOp { - // Axis and its corresponding label names. - private final Map> axisLabels; - - protected LabelAxisOp(Builder builder) { - axisLabels = builder.axisLabels; - } - - public TensorLabel apply(@NonNull TensorBuffer buffer) { - SupportPreconditions.checkNotNull(buffer, "Tensor buffer cannot be null."); - return new TensorLabel(axisLabels, buffer); - } - - /** The inner builder class to build a LabelTensor Operator. */ - public static class Builder { - private final Map> axisLabels; - - protected Builder() { - axisLabels = new HashMap<>(); - } - - public Builder addAxisLabel(@NonNull Context context, int axis, @NonNull String filePath) - throws IOException { - SupportPreconditions.checkNotNull(context, "Context cannot be null."); - SupportPreconditions.checkNotNull(filePath, "File path cannot be null."); - List labels = FileUtil.loadLabels(context, filePath); - axisLabels.put(axis, labels); - return this; - } - - public Builder addAxisLabel(int axis, @NonNull List labels) { - axisLabels.put(axis, labels); - return this; - } - - public LabelAxisOp build() { - return new LabelAxisOp(this); - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java deleted file mode 100644 index 9cfcf923ded..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.model; - -import android.util.Log; -import java.io.Closeable; -import java.io.IOException; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.Delegate; - -/** - * Helper class to create and call necessary methods of {@code GpuDelegate} which is not a strict - * dependency. - */ -class GpuDelegateProxy implements Delegate, Closeable { - - private static final String TAG = "GpuDelegateProxy"; - - private final Delegate proxiedDelegate; - private final Closeable proxiedCloseable; - - @Nullable - public static GpuDelegateProxy maybeNewInstance() { - try { - Class clazz = Class.forName("org.tensorflow.lite.gpu.GpuDelegate"); - Object instance = clazz.getDeclaredConstructor().newInstance(); - return new GpuDelegateProxy(instance); - } catch (ReflectiveOperationException e) { - Log.e(TAG, "Failed to create the GpuDelegate dynamically.", e); - return null; - } - } - - /** Calls {@code close()} method of the delegate. 
*/ - @Override - public void close() { - try { - proxiedCloseable.close(); - } catch (IOException e) { - // Should not trigger, because GpuDelegate#close never throws. The catch is required because - // of Closeable#close. - Log.e(TAG, "Failed to close the GpuDelegate.", e); - } - } - - /** Calls {@code getNativeHandle()} method of the delegate. */ - @Override - public long getNativeHandle() { - return proxiedDelegate.getNativeHandle(); - } - - private GpuDelegateProxy(Object instance) { - this.proxiedCloseable = (Closeable) instance; - this.proxiedDelegate = (Delegate) instance; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java deleted file mode 100644 index 8062d68d7b9..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java +++ /dev/null @@ -1,285 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.model; - -import android.content.Context; -import java.io.IOException; -import java.nio.MappedByteBuffer; -import java.util.Map; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.Interpreter; -import org.tensorflow.lite.Tensor; -import org.tensorflow.lite.support.common.FileUtil; -import org.tensorflow.lite.support.common.SupportPreconditions; - -/** - * The wrapper class for a TFLite model and a TFLite interpreter. - * - *
Note: A {@link Model} can only hold one TFLite model at a time, and always holds a TFLite - * interpreter instance to run it. - */ -public class Model { - - /** The runtime device type used for executing classification. */ - public enum Device { - CPU, - NNAPI, - GPU - } - - /** - * Options for running the model. Configurable parameters include: - * - *
    - *
  • {@code device} {@link Builder#setDevice(Device)} specifies the hardware to run the model. - * The default value is {@link Device#CPU}. - *
  • {@code numThreads} {@link Builder#setNumThreads(int)} specifies the number of threads - * used by TFLite inference. It's only effective when device is set to {@link Device#CPU} - * and default value is 1. - *
- */ - public static class Options { - private final Device device; - private final int numThreads; - - /** Builder of {@link Options}. See its doc for details. */ - public static class Builder { - private Device device = Device.CPU; - private int numThreads = 1; - - public Builder setDevice(Device device) { - this.device = device; - return this; - } - - public Builder setNumThreads(int numThreads) { - this.numThreads = numThreads; - return this; - } - - public Options build() { - return new Options(this); - } - } - - private Options(Builder builder) { - device = builder.device; - numThreads = builder.numThreads; - } - } - - /** An instance of the driver class to run model inference with Tensorflow Lite. */ - private final Interpreter interpreter; - - /** Path to tflite model file in asset folder. */ - private final String modelPath; - - /** The memory-mapped model data. */ - private final MappedByteBuffer byteModel; - - private final GpuDelegateProxy gpuDelegateProxy; - - /** - * Builder for {@link Model}. - * - * @deprecated Please use {@link Model#createModel(Context, String, Options)}. - */ - @Deprecated - public static class Builder { - private Device device = Device.CPU; - private int numThreads = 1; - private final String modelPath; - private final MappedByteBuffer byteModel; - - /** - * Creates a builder which loads tflite model from asset folder using memory-mapped files. - * - * @param context: Application context to access assets. - * @param modelPath: Asset path of the model (.tflite file). - * @throws IOException if an I/O error occurs when loading the tflite model. - */ - @NonNull - public Builder(@NonNull Context context, @NonNull String modelPath) throws IOException { - this.modelPath = modelPath; - byteModel = FileUtil.loadMappedFile(context, modelPath); - } - - /** Sets running device. By default, TFLite will run on CPU. */ - @NonNull - public Builder setDevice(Device device) { - this.device = device; - return this; - } - - /** Sets number of threads. By default it's 1. */ - @NonNull - public Builder setNumThreads(int numThreads) { - this.numThreads = numThreads; - return this; - } - - // Note: The implementation is copied from `Model#createModel`. As the builder is going to be - // deprecated, this function is also to be removed. - @NonNull - public Model build() { - Options options = new Options.Builder().setNumThreads(numThreads).setDevice(device).build(); - return createModel(byteModel, modelPath, options); - } - } - - /** - * Loads a model from assets and initialize TFLite interpreter. - * - *
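A hedged sketch of configuring and loading a model with the Options builder above; the asset name mobilenet_v1.tflite is purely illustrative, and context is an Android Context:

    Model.Options options =
        new Model.Options.Builder().setDevice(Model.Device.NNAPI).setNumThreads(2).build();
    // Throws IOException if the asset cannot be read.
    Model model = Model.createModel(context, "mobilenet_v1.tflite", options);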
The default options are: (1) CPU device; (2) one thread. - * - * @param context The App Context. - * @param modelPath The path of the model file. - * @throws IOException if any exception occurs when open the model file. - */ - public static Model createModel(@NonNull Context context, @NonNull String modelPath) - throws IOException { - return createModel(context, modelPath, new Options.Builder().build()); - } - - /** - * Loads a model from assets and initialize TFLite interpreter with given options. - * - * @see Options for details. - * @param context The App Context. - * @param modelPath The path of the model file. - * @param options The options for running the model. - * @throws IOException if any exception occurs when open the model file. - */ - public static Model createModel( - @NonNull Context context, @NonNull String modelPath, @NonNull Options options) - throws IOException { - SupportPreconditions.checkNotEmpty( - modelPath, "Model path in the asset folder cannot be empty."); - MappedByteBuffer byteModel = FileUtil.loadMappedFile(context, modelPath); - return createModel(byteModel, modelPath, options); - } - - /** - * Creates a model with loaded {@link MappedByteBuffer}. - * - * @see Options for details. - * @param byteModel The loaded TFLite model. - * @param modelPath The original path of the model. It can be fetched later by {@link - * Model#getPath()}. - * @param options The options for running the model. - * @throws IllegalArgumentException if {@code options.device} is {@link Device#GPU} but - * "tensorflow-lite-gpu" is not linked to the project. - */ - public static Model createModel( - @NonNull MappedByteBuffer byteModel, @NonNull String modelPath, @NonNull Options options) { - Interpreter.Options interpreterOptions = new Interpreter.Options(); - GpuDelegateProxy gpuDelegateProxy = null; - switch (options.device) { - case NNAPI: - interpreterOptions.setUseNNAPI(true); - break; - case GPU: - gpuDelegateProxy = GpuDelegateProxy.maybeNewInstance(); - SupportPreconditions.checkArgument( - gpuDelegateProxy != null, - "Cannot inference with GPU. Did you add \"tensorflow-lite-gpu\" as dependency?"); - interpreterOptions.addDelegate(gpuDelegateProxy); - break; - case CPU: - break; - } - interpreterOptions.setNumThreads(options.numThreads); - Interpreter interpreter = new Interpreter(byteModel, interpreterOptions); - return new Model(modelPath, byteModel, interpreter, gpuDelegateProxy); - } - - /** Returns the memory-mapped model data. */ - @NonNull - public MappedByteBuffer getData() { - return byteModel; - } - - /** Returns the path of the model file stored in Assets. */ - @NonNull - public String getPath() { - return modelPath; - } - - /** - * Gets the Tensor associated with the provdied input index. - * - * @throws IllegalStateException if the interpreter is closed. - */ - public Tensor getInputTensor(int inputIndex) { - return interpreter.getInputTensor(inputIndex); - } - - /** - * Gets the Tensor associated with the provdied output index. - * - * @throws IllegalStateException if the interpreter is closed. - */ - public Tensor getOutputTensor(int outputIndex) { - return interpreter.getOutputTensor(outputIndex); - } - - /** - * Returns the output shape. Useful if output shape is only determined when graph is created. - * - * @throws IllegalStateException if the interpreter is closed. 
- */ - public int[] getOutputTensorShape(int outputIndex) { - return interpreter.getOutputTensor(outputIndex).shape(); - } - - /** - * Runs model inference on multiple inputs, and returns multiple outputs. - * - * @param inputs an array of input data. The inputs should be in the same order as inputs of the - * model. Each input can be an array or multidimensional array, or a {@link - * java.nio.ByteBuffer} of primitive types including int, float, long, and byte. {@link - * java.nio.ByteBuffer} is the preferred way to pass large input data, whereas string types - * require using the (multi-dimensional) array input path. When {@link java.nio.ByteBuffer} is - * used, its content should remain unchanged until model inference is done. - * @param outputs a map mapping output indices to multidimensional arrays of output data or {@link - * java.nio.ByteBuffer}s of primitive types including int, float, long, and byte. It only - * needs to keep entries for the outputs to be used. - */ - public void run(@NonNull Object[] inputs, @NonNull Map outputs) { - interpreter.runForMultipleInputsOutputs(inputs, outputs); - } - - public void close() { - if (interpreter != null) { - interpreter.close(); - } - if (gpuDelegateProxy != null) { - gpuDelegateProxy.close(); - } - } - - private Model( - @NonNull String modelPath, - @NonNull MappedByteBuffer byteModel, - @NonNull Interpreter interpreter, - @Nullable GpuDelegateProxy gpuDelegateProxy) { - this.modelPath = modelPath; - this.byteModel = byteModel; - this.interpreter = interpreter; - this.gpuDelegateProxy = gpuDelegateProxy; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java deleted file mode 100644 index fa05be363a6..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java +++ /dev/null @@ -1,412 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.tensorbuffer; - -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; - -/** Represents the data buffer for either a model's input or its output. */ -public abstract class TensorBuffer { - /** Where the data is stored. */ - protected ByteBuffer buffer; - - /** Shape of the tensor stored in this buffer. */ - protected int[] shape; - - /** Number of elements in the buffer. It will be changed to a proper value in the constructor. */ - protected int flatSize = -1; - - /** - * Indicator of whether this buffer is dynamic or fixed-size. Fixed-size buffers will have - * pre-allocated memory and fixed size. 
The size of dynamic buffers, in contrast, can be changed. - */ - protected final boolean isDynamic; - - /** - * Creates a {@link TensorBuffer} with specified {@code shape} and {@link DataType}. Here are some - * examples: - * - *
-   * Creating a float TensorBuffer with shape {2, 3}:
-   * int[] shape = new int[] {2, 3};
-   * TensorBuffer tensorBuffer = TensorBuffer.createFixedSize(shape, DataType.FLOAT32);
-   * 
- * - *
-   * Creating an uint8 TensorBuffer of a scalar:
-   * int[] shape = new int[] {};
-   * TensorBuffer tensorBuffer = TensorBuffer.createFixedSize(shape, DataType.UINT8);
-   * 
- * - *
-   * Creating an empty uint8 TensorBuffer:
-   * int[] shape = new int[] {0};
-   * TensorBuffer tensorBuffer = TensorBuffer.createFixedSize(shape, DataType.UINT8);
-   * 
- * - *
The size of a fixed-size TensorBuffer cannot be changed once it is created. - * - * @param shape The shape of the {@link TensorBuffer} to be created. - * @param dataType The dataType of the {@link TensorBuffer} to be created. - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if {@code shape} has non-positive elements. - */ - @NonNull - public static TensorBuffer createFixedSize(@NonNull int[] shape, DataType dataType) { - switch (dataType) { - case FLOAT32: - return new TensorBufferFloat(shape); - case UINT8: - return new TensorBufferUint8(shape); - default: - throw new AssertionError("TensorBuffer does not support data type: " + dataType); - } - } - - /** - * Creates an empty dynamic {@link TensorBuffer} with specified {@link DataType}. The shape of the - * created {@link TensorBuffer} is {0}. - * - *

Dynamic TensorBuffers will reallocate memory when loading arrays or data buffers of - * different buffer sizes. - * - * @param dataType The dataType of the {@link TensorBuffer} to be created. - */ - @NonNull - public static TensorBuffer createDynamic(DataType dataType) { - switch (dataType) { - case FLOAT32: - return new TensorBufferFloat(); - case UINT8: - return new TensorBufferUint8(); - default: - throw new AssertionError("TensorBuffer does not support data type: " + dataType); - } - } - - /** - * Creates a {@link TensorBuffer} deep-copying data from another, with specified {@link DataType}. - * - * @param buffer the source {@link TensorBuffer} to copy from. - * @param dataType the expected {@link DataType} of newly created {@link TensorBuffer}. - * @throws NullPointerException if {@code buffer} is null. - */ - @NonNull - public static TensorBuffer createFrom(@NonNull TensorBuffer buffer, DataType dataType) { - SupportPreconditions.checkNotNull(buffer, "Cannot create a buffer from null"); - TensorBuffer result; - if (buffer.isDynamic()) { - result = createDynamic(dataType); - } else { - result = createFixedSize(buffer.shape, dataType); - } - // The only scenario we need float array is FLOAT32->FLOAT32, or we can always use INT as - // intermediate container. - // The assumption is not true when we support other data types. - if (buffer.getDataType() == DataType.FLOAT32 && dataType == DataType.FLOAT32) { - float[] data = buffer.getFloatArray(); - result.loadArray(data, buffer.shape); - } else { - int[] data = buffer.getIntArray(); - result.loadArray(data, buffer.shape); - } - return result; - } - - /** Returns the data buffer. */ - @NonNull - public ByteBuffer getBuffer() { - return buffer; - } - - /** Gets the {@link TensorBuffer#flatSize} of the buffer. */ - public int getFlatSize() { - return flatSize; - } - - /** Gets the current shape. (returning a copy here to avoid unexpected modification.) */ - @NonNull - public int[] getShape() { - return Arrays.copyOf(shape, shape.length); - } - - /** Returns the data type of this buffer. */ - public abstract DataType getDataType(); - - /** - * Returns a float array of the values stored in this buffer. If the buffer is of different types - * than float, the values will be converted into float. For example, values in {@link - * TensorBufferUint8} will be converted from uint8 to float. - */ - @NonNull - public abstract float[] getFloatArray(); - - /** - * Returns a float value at a given index. If the buffer is of different types than float, the - * value will be converted into float. For example, when reading a value from {@link - * TensorBufferUint8}, the value will be first read out as uint8, and then will be converted from - * uint8 to float. - * - *

-   * For example, a TensorBuffer with shape {2, 3} represents the following array,
-   * [[0.0f, 1.0f, 2.0f], [3.0f, 4.0f, 5.0f]].
-   *
-   * The fourth element (whose value is 3.0f) in the TensorBuffer can be retrieved by:
-   * float v = tensorBuffer.getFloatValue(3);
-   * 
- * - * @param absIndex The absolute index of the value to be read. - */ - public abstract float getFloatValue(int absIndex); - - /** - * Returns an int array of the values stored in this buffer. If the buffer is of different type - * than int, the values will be converted into int, and loss of precision may apply. For example, - * getting an int array from a {@link TensorBufferFloat} with values {400.32f, 23.04f}, the output - * is {400, 23}. - */ - @NonNull - public abstract int[] getIntArray(); - - /** - * Returns an int value at a given index. If the buffer is of different types than int, the value - * will be converted into int. For example, when reading a value from {@link TensorBufferFloat}, - * the value will be first read out as float, and then will be converted from float to int. Loss - * of precision may apply. - * - *
-   * For example, a TensorBuffer with shape {2, 3} represents the following array,
-   * [[0.0f, 1.0f, 2.0f], [3.0f, 4.0f, 5.0f]].
-   *
-   * The fourth element (whose value is 3.0f) in the TensorBuffer can be retrieved by:
-   * int v = tensorBuffer.getIntValue(3);
-   * Note that v is converted from 3.0f to 3 as a result of type conversion.
-   * 
- * - * @param absIndex The absolute index of the value to be read. - */ - public abstract int getIntValue(int absIndex); - - /** - * Returns the number of bytes of a single element in the array. For example, a float buffer will - * return 4, and a byte buffer will return 1. - */ - public abstract int getTypeSize(); - - /** Returns if the {@link TensorBuffer} is dynamic sized (could resize arbitrarily). */ - public boolean isDynamic() { - return isDynamic; - } - - /** - * Loads an int array into this buffer with specific shape. If the buffer is of different types - * than int, the values will be converted into the buffer's type before being loaded into the - * buffer, and loss of precision may apply. For example, loading an int array with values {400, - * -23} into a {@link TensorBufferUint8} , the values will be clamped to [0, 255] and then be - * casted to uint8 by {255, 0}. - * - * @param src The source array to be loaded. - * @param shape Shape of the tensor that {@code src} represents. - * @throws NullPointerException if {@code src} is null. - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if the size of the array to be loaded does not match the - * specified shape. - */ - public abstract void loadArray(@NonNull int[] src, @NonNull int[] shape); - - /** - * Loads an int array into this buffer. If the buffer is of different types than int, the values - * will be converted into the buffer's type before being loaded into the buffer, and loss of - * precision may apply. For example, loading an int array with values {400, -23} into a {@link - * TensorBufferUint8} , the values will be clamped to [0, 255] and then be casted to uint8 by - * {255, 0}. - * - *

Size of {@code src} should always match the flat size of this {@link TensorBuffer}, for both - * fixed-size and dynamic {@link TensorBuffer}. - * - * @param src The source array to be loaded. - */ - public void loadArray(@NonNull int[] src) { - loadArray(src, shape); - } - - /** - * Loads a float array into this buffer with specific shape. If the buffer is of different types - * than float, the values will be converted into the buffer's type before being loaded into the - * buffer, and loss of precision may apply. For example, loading a float array into a {@link - * TensorBufferUint8} with values {400.32f, -23.04f}, the values will be clamped to [0, 255] and - * then be casted to uint8 by {255, 0}. - * - * @param src The source array to be loaded. - * @param shape Shape of the tensor that {@code src} represents. - * @throws NullPointerException if {@code src} is null. - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if the size of the array to be loaded does not match the - * specified shape. - */ - public abstract void loadArray(@NonNull float[] src, @NonNull int[] shape); - - /** - * Loads a float array into this buffer. If the buffer is of different types than float, the - * values will be converted into the buffer's type before being loaded into the buffer, and loss - * of precision may apply. For example, loading a float array into a {@link TensorBufferUint8} - * with values {400.32f, -23.04f}, the values will be clamped to [0, 255] and then be casted to - * uint8 by {255, 0}. - * - *

Size of {@code src} should always match the flat size of this {@link TensorBuffer}, for both - * fixed-size and dynamic {@link TensorBuffer}. - * - * @param src The source array to be loaded. - */ - public void loadArray(@NonNull float[] src) { - loadArray(src, shape); - } - - /** - * Loads a byte buffer into this {@link TensorBuffer} with specific shape. - * - *

Important: The loaded buffer is a reference. DO NOT MODIFY. We don't create a copy here for - * performance concern, but if modification is necessary, please make a copy. - * - * @param buffer The byte buffer to load. - * @throws NullPointerException if {@code buffer} is null. - * @throws IllegalArgumentException if the size of {@code buffer} and {@code typeSize} do not - * match or the size of {@code buffer} and {@code flatSize} do not match. - */ - public void loadBuffer(@NonNull ByteBuffer buffer, @NonNull int[] shape) { - SupportPreconditions.checkNotNull(buffer, "Byte buffer cannot be null."); - int flatSize = computeFlatSize(shape); - SupportPreconditions.checkArgument( - (buffer.limit() == getTypeSize() * flatSize), - "The size of byte buffer and the shape do not match."); - - if (!isDynamic) { - SupportPreconditions.checkArgument( - flatSize == this.flatSize, - "The size of byte buffer and the size of the tensor buffer do not match."); - } else { - this.flatSize = flatSize; - } - - this.shape = shape.clone(); - buffer.rewind(); - this.buffer = buffer; - } - - /** - * Loads a byte buffer into this {@link TensorBuffer}. Buffer size must match the flat size of - * this {@link TensorBuffer}. - * - *

Important: The loaded buffer is a reference. DO NOT MODIFY. We don't create a copy here for - * performance concern, but if modification is necessary, please make a copy. - * - * @param buffer The byte buffer to load. - */ - public void loadBuffer(@NonNull ByteBuffer buffer) { - loadBuffer(buffer, shape); - } - - /** - * Constructs a fixed size {@link TensorBuffer} with specified {@code shape}. - * - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if {@code shape} has non-positive elements. - */ - protected TensorBuffer(@NonNull int[] shape) { - isDynamic = false; - allocateMemory(shape); - } - - /** Constructs a dynamic {@link TensorBuffer} which can be resized. */ - protected TensorBuffer() { - isDynamic = true; - // Initialize the dynamic TensorBuffer with an empty ByteBuffer. - allocateMemory(new int[] {0}); - } - - /** Calculates number of elements in the buffer. */ - protected static int computeFlatSize(@NonNull int[] shape) { - SupportPreconditions.checkNotNull(shape, "Shape cannot be null."); - int prod = 1; - for (int s : shape) { - prod = prod * s; - } - return prod; - } - - /** - * For dynamic buffer, resize the memory if needed. For fixed-size buffer, check if the {@code - * shape} of src fits the buffer size. - */ - protected void resize(@NonNull int[] shape) { - if (isDynamic) { - allocateMemory(shape); - } else { - // Make sure the new shape fits the buffer size when TensorBuffer has fixed size. - SupportPreconditions.checkArgument(Arrays.equals(shape, this.shape)); - this.shape = shape.clone(); - } - } - - /** - * Allocates buffer with corresponding size of the {@code shape}. If shape is an empty array, this - * {@link TensorBuffer} will be created as a scalar and its flatSize will be 1. - * - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if {@code shape} has negative elements. - */ - private void allocateMemory(@NonNull int[] shape) { - SupportPreconditions.checkNotNull(shape, "TensorBuffer shape cannot be null."); - SupportPreconditions.checkArgument( - isShapeValid(shape), "Values in TensorBuffer shape should be non-negative."); - - // Check if the new shape is the same as current shape. - int newFlatSize = computeFlatSize(shape); - this.shape = shape.clone(); - if (flatSize == newFlatSize) { - return; - } - - // Update to the new shape. - flatSize = newFlatSize; - buffer = ByteBuffer.allocateDirect(flatSize * getTypeSize()); - buffer.order(ByteOrder.nativeOrder()); - } - - /** - * Checks if {@code shape} meets one of following two requirements: 1. Elements in {@code shape} - * are all non-negative numbers. 2. {@code shape} is an empty array, which corresponds to scalar. - */ - private static boolean isShapeValid(@NonNull int[] shape) { - if (shape.length == 0) { - // This shape refers to a scalar. - return true; - } - - // This shape refers to a multidimensional array. - for (int s : shape) { - // All elements in shape should be non-negative. 
- if (s < 0) { - return false; - } - } - return true; - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferFloat.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferFloat.java deleted file mode 100644 index c5b46b19f29..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferFloat.java +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.tensorbuffer; - -import java.nio.FloatBuffer; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; - -/** Represents data buffer with float values. */ -public final class TensorBufferFloat extends TensorBuffer { - private static final DataType DATA_TYPE = DataType.FLOAT32; - - /** - * Creates a {@link TensorBufferFloat} with specified {@code shape}. - * - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if {@code shape} has non-positive elements. 
- */ - TensorBufferFloat(@NonNull int[] shape) { - super(shape); - } - - TensorBufferFloat() { - super(); - } - - @Override - public DataType getDataType() { - return DATA_TYPE; - } - - @Override - @NonNull - public float[] getFloatArray() { - buffer.rewind(); - float[] arr = new float[flatSize]; - - FloatBuffer floatBuffer = buffer.asFloatBuffer(); - floatBuffer.get(arr); - return arr; - } - - @Override - public float getFloatValue(int absIndex) { - return buffer.getFloat(absIndex << 2); - } - - @Override - @NonNull - public int[] getIntArray() { - buffer.rewind(); - int[] arr = new int[flatSize]; - - for (int i = 0; i < flatSize; i++) { - arr[i] = (int) buffer.getFloat(); - } - return arr; - } - - @Override - public int getIntValue(int absIndex) { - return (int) buffer.getFloat(absIndex << 2); - } - - @Override - public int getTypeSize() { - return DATA_TYPE.byteSize(); - } - - @Override - public void loadArray(@NonNull float[] src, @NonNull int[] shape) { - SupportPreconditions.checkNotNull(src, "The array to be loaded cannot be null."); - SupportPreconditions.checkArgument( - src.length == computeFlatSize(shape), - "The size of the array to be loaded does not match the specified shape."); - resize(shape); - buffer.rewind(); - - FloatBuffer floatBuffer = buffer.asFloatBuffer(); - floatBuffer.put(src); - } - - @Override - public void loadArray(@NonNull int[] src, @NonNull int[] shape) { - SupportPreconditions.checkNotNull(src, "The array to be loaded cannot be null."); - SupportPreconditions.checkArgument( - src.length == computeFlatSize(shape), - "The size of the array to be loaded does not match the specified shape."); - resize(shape); - buffer.rewind(); - - for (int a : src) { - buffer.putFloat((float) a); - } - } -} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferUint8.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferUint8.java deleted file mode 100644 index dce63ef2d51..00000000000 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBufferUint8.java +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.tensorbuffer; - -import org.checkerframework.checker.nullness.qual.NonNull; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.support.common.SupportPreconditions; - -/** Represents data buffer with 8-bit unsigned integer values. */ -public final class TensorBufferUint8 extends TensorBuffer { - private static final DataType DATA_TYPE = DataType.UINT8; - - /** - * Creates a {@link TensorBufferUint8} with specified {@code shape}. - * - * @throws NullPointerException if {@code shape} is null. - * @throws IllegalArgumentException if {@code shape} has non-positive elements. 
- */ - TensorBufferUint8(@NonNull int[] shape) { - super(shape); - } - - TensorBufferUint8() { - super(); - } - - @Override - public DataType getDataType() { - return DATA_TYPE; - } - - @Override - @NonNull - public float[] getFloatArray() { - buffer.rewind(); - float[] arr = new float[flatSize]; - - for (int i = 0; i < flatSize; i++) { - arr[i] = (float) (buffer.get() & 0xff); - } - return arr; - } - - @Override - public float getFloatValue(int index) { - return (float) (buffer.get(index) & 0xff); - } - - @Override - @NonNull - public int[] getIntArray() { - buffer.rewind(); - int[] arr = new int[flatSize]; - - for (int i = 0; i < flatSize; i++) { - arr[i] = buffer.get() & 0xff; - } - return arr; - } - - @Override - public int getIntValue(int index) { - return buffer.get(index) & 0xff; - } - - @Override - public int getTypeSize() { - return DATA_TYPE.byteSize(); - } - - @Override - public void loadArray(@NonNull float[] src, @NonNull int[] shape) { - SupportPreconditions.checkNotNull(src, "The array to be loaded cannot be null."); - SupportPreconditions.checkArgument( - src.length == computeFlatSize(shape), - "The size of the array to be loaded does not match the specified shape."); - resize(shape); - buffer.rewind(); - - for (float a : src) { - buffer.put((byte) Math.max(Math.min(a, 255.0), 0.0)); - } - } - - @Override - public void loadArray(@NonNull int[] src, @NonNull int[] shape) { - SupportPreconditions.checkNotNull(src, "The array to be loaded cannot be null."); - SupportPreconditions.checkArgument( - src.length == computeFlatSize(shape), - "The size of the array to be loaded does not match the specified shape."); - resize(shape); - buffer.rewind(); - - for (int a : src) { - buffer.put((byte) Math.max(Math.min(a, 255), 0)); - } - } -} diff --git a/tensorflow/lite/experimental/support/metadata/BUILD b/tensorflow/lite/experimental/support/metadata/BUILD deleted file mode 100644 index ba410d914c7..00000000000 --- a/tensorflow/lite/experimental/support/metadata/BUILD +++ /dev/null @@ -1,113 +0,0 @@ -load("//tensorflow:tensorflow.bzl", "py_test") -load("@flatbuffers//:build_defs.bzl", "flatbuffer_android_library", "flatbuffer_cc_library", "flatbuffer_java_library", "flatbuffer_py_library") -load("//tensorflow/lite/experimental/support/metadata:build_defs.bzl", "stamp_metadata_parser_version") - -package( - default_visibility = [ - "//visibility:public", - ], - licenses = ["notice"], # Apache 2.0 -) - -exports_files(["metadata_schema.fbs"]) - -flatbuffer_py_library( - name = "schema_py", - srcs = ["//tensorflow/lite/schema:schema.fbs"], -) - -# Generic schema for inference on device. -flatbuffer_android_library( - name = "schema_fbs_android", - srcs = ["//tensorflow/lite/schema:schema.fbs"], - custom_package = "org.tensorflow.lite.schema", -) - -flatbuffer_java_library( - name = "schema_fbs_java", - srcs = ["//tensorflow/lite/schema:schema.fbs"], - custom_package = "org.tensorflow.lite.schema", -) - -# Generic schema for model metadata. 
-flatbuffer_cc_library( - name = "metadata_schema_cc", - srcs = ["metadata_schema.fbs"], -) - -flatbuffer_py_library( - name = "metadata_schema_py", - srcs = ["metadata_schema.fbs"], -) - -flatbuffer_java_library( - name = "metadata_schema_java", - srcs = ["metadata_schema.fbs"], - custom_package = "org.tensorflow.lite.support.metadata.schema", -) - -flatbuffer_android_library( - name = "metadata_schema_fbs_android", - srcs = ["metadata_schema.fbs"], - custom_package = "org.tensorflow.lite.support.metadata.schema", -) - -# TODO(b/157813075): move the metadata python library to metadata/python/ when migrating to the new repo. -stamp_metadata_parser_version( - name = "metadata_parser_py", - srcs = ["metadata_parser.py.template"], - outs = ["metadata_parser.py"], -) - -py_library( - name = "metadata", - srcs = [ - "metadata.py", - ":metadata_parser_py", - ], - data = [ - "//tensorflow/lite/experimental/support/metadata:metadata_schema.fbs", - ], - srcs_version = "PY2AND3", - visibility = ["//visibility:public"], - deps = [ - ":metadata_schema_py", - ":schema_py", - "//tensorflow/lite/experimental/support/metadata/cc/python:_pywrap_metadata_version", - "//tensorflow/lite/experimental/support/metadata/flatbuffers_lib:_pywrap_flatbuffers", - "//tensorflow/python:platform", - "@flatbuffers//:runtime_py", - ], -) - -py_test( - name = "metadata_test", - srcs = ["metadata_test.py"], - data = ["testdata/golden_json.json"], - python_version = "PY3", - srcs_version = "PY2AND3", - tags = [ - "no_mac", # TODO(b/148247402): flatbuffers import broken on Mac OS. - ], - deps = [ - ":metadata", - ":metadata_schema_py", - ":schema_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:platform", - "//tensorflow/python:platform_test", - "@flatbuffers//:runtime_py", - "@six_archive//:six", - ], -) - -py_test( - name = "metadata_parser_test", - srcs = ["metadata_parser_test.py"], - python_version = "PY3", - srcs_version = "PY2AND3", - deps = [ - ":metadata", - "//tensorflow/python:client_testlib", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/README.md b/tensorflow/lite/experimental/support/metadata/README.md deleted file mode 100644 index ff7d25f27cb..00000000000 --- a/tensorflow/lite/experimental/support/metadata/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# TensorFlow Lite Metadata and Android wrapper code generator - -Note: Both TensorFlow Lite Metadata and the Android wrapper code generator are -in experimental (beta) phase. - -TensorFlow Lite metadata provides a structured framework for storing metadata -to convey information for both the developer that will utilitised the model and -code generators which can create wrapper around the model. For information on -how to populate model metadata, please refer to the [TensorFlow Lite Metadata -documentation](https://www.tensorflow.org/lite/convert/metadata). - -The first code generator which takes advantage of this metadata format is the -TensorFlow Lite Android Code Generator. For more information on how to use this -generator, please refer to the [TensorFlow Lite Android wrapper code generator -documentation](https://www.tensorflow.org/lite/guide/codegen). 
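For reference, the TensorBuffer class removed earlier in this patch is the support library's core tensor container, and its Javadoc only demonstrates construction in isolation. Below is a minimal usage sketch assembled solely from the method signatures and Javadoc of that deleted class (createFixedSize, createDynamic, loadArray, getFloatValue, getIntArray); it is illustrative only, not code introduced by this change:

    import org.tensorflow.lite.DataType;
    import org.tensorflow.lite.support.tensorbuffer.TensorBuffer;

    final class TensorBufferSketch {
      public static void main(String[] args) {
        // Fixed-size float buffer with shape {2, 3}; its flat size (6) cannot change.
        TensorBuffer input = TensorBuffer.createFixedSize(new int[] {2, 3}, DataType.FLOAT32);
        input.loadArray(new float[] {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f});
        float fourth = input.getFloatValue(3); // 3.0f, read by absolute (flat) index.

        // Dynamic uint8 buffer; it reallocates to fit whatever shape is loaded.
        TensorBuffer output = TensorBuffer.createDynamic(DataType.UINT8);
        output.loadArray(new int[] {400, -23}, new int[] {2}); // Clamped and cast to {255, 0}.
        int[] values = output.getIntArray();

        System.out.println(fourth + " " + java.util.Arrays.toString(values));
      }
    }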
diff --git a/tensorflow/lite/experimental/support/metadata/build_defs.bzl b/tensorflow/lite/experimental/support/metadata/build_defs.bzl deleted file mode 100644 index 3ea945770e0..00000000000 --- a/tensorflow/lite/experimental/support/metadata/build_defs.bzl +++ /dev/null @@ -1,43 +0,0 @@ -"""Build rules to generate metadata schema versions.""" - -METADATA_SCHEMA_FILE = "//tensorflow/lite/experimental/support/metadata:metadata_schema.fbs" - -def stamp_metadata_parser_version( - name, - srcs, - outs): - """Stamps the latest metadata parser version into the srcs files. - - Replaces all the occurrences of "{LATEST_METADATA_PARSER_VERSION}" in the - srcs files with the metadata schema version extracted from - METADATA_SCHEMA_FILE and then outputs the generated file into outs, - respectively. The number of srcs files needs to match the number of outs - files. - - Args: - name: Rule name. (required) - srcs: List of source files. (required) - outs: List of output files. (required) - """ - if len(srcs) != len(outs): - fail(("The number of srcs files (%d) does not match that of the outs" + - " files (%d).") % - (len(srcs), len(outs))) - - for i in range(0, len(srcs)): - native.genrule( - name = "%s_file%d" % (name, i), - srcs = [srcs[i]], - outs = [outs[i]], - tools = [METADATA_SCHEMA_FILE], - # Gets the metadata schema version from the file, and stamps it - # into the srcs file. - cmd = "version=$$(sed -n -e '/Schema Semantic version/ s/.*\\: *//p' $(location %s));" % - METADATA_SCHEMA_FILE + - 'sed "s/{LATEST_METADATA_PARSER_VERSION}/$$version/" $< > $@', - ) - - native.filegroup( - name = name, - srcs = outs, - ) diff --git a/tensorflow/lite/experimental/support/metadata/cc/BUILD b/tensorflow/lite/experimental/support/metadata/cc/BUILD deleted file mode 100644 index 8febc7a2237..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tensorflow/lite/experimental/support/metadata:build_defs.bzl", "stamp_metadata_parser_version") - -package( - default_visibility = ["//tensorflow/lite/experimental/support:users"], - licenses = ["notice"], # Apache 2.0 -) - -stamp_metadata_parser_version( - name = "metadata_parser_h", - srcs = ["metadata_parser.h.template"], - outs = ["metadata_parser.h"], -) - -cc_library( - name = "metadata_version", - srcs = ["metadata_version.cc"], - hdrs = [ - "metadata_version.h", - ":metadata_parser_h", - ], - deps = [ - "//tensorflow/lite/c:common", - "//tensorflow/lite/experimental/support/metadata:metadata_schema_cc", - "//tensorflow/lite/kernels/internal:compatibility", - "//tensorflow/lite/tools:logging", - "@com_google_absl//absl/strings", - "@flatbuffers", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template b/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template deleted file mode 100644 index dfb62d0de81..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h.template +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_ - -namespace tflite { -namespace metadata { - -// The version of the metadata parser that this metadata versioning library is -// depending on. -inline constexpr char kMatadataParserVersion[] = "{LATEST_METADATA_PARSER_VERSION}"; - -} // namespace metadata -} // namespace tflite - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_PARSER_H_ diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc deleted file mode 100644 index 971465f7747..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" - -#include -#include - -#include -#include -#include -#include - -#include "absl/strings/str_join.h" -#include "absl/strings/str_split.h" -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/experimental/support/metadata/metadata_schema_generated.h" -#include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/tools/logging.h" - -namespace tflite { -namespace metadata { -namespace { - -// Members that are added to the metadata schema after the initial version -// of 1.0.0. -enum class SchemaMembers { - kAssociatedFileTypeVocabulary = 0, -}; - -// Helper class to compare semantic versions in terms of three integers, major, -// minor, and patch. -class Version { - public: - explicit Version(int major, int minor = 0, int patch = 0) - : version_({major, minor, patch}) {} - - explicit Version(const std::string& version) { - const std::vector vec = absl::StrSplit(version, '.'); - // The version string should always be less than four numbers. - TFLITE_DCHECK(vec.size() <= kElementNumber && !vec.empty()); - version_[0] = std::stoi(vec[0]); - version_[1] = vec.size() > 1 ? std::stoi(vec[1]) : 0; - version_[2] = vec.size() > 2 ? std::stoi(vec[2]) : 0; - } - - // Compares two semantic version numbers. - // - // Example results when comparing two versions strings: - // "1.9" precedes "1.14"; - // "1.14" precedes "1.14.1"; - // "1.14" and "1.14.0" are equal. - // - // Returns the value 0 if the two versions are equal; a value less than 0 if - // *this precedes v; a value greater than 0 if v precedes *this. - int Compare(const Version& v) { - for (int i = 0; i < kElementNumber; ++i) { - if (version_[i] != v.version_[i]) { - return version_[i] < v.version_[i] ? 
-1 : 1; - } - } - return 0; - } - - // Converts version_ into a version string. - std::string ToString() { return absl::StrJoin(version_, "."); } - - private: - static constexpr int kElementNumber = 3; - std::array version_; -}; - -Version GetMemberVersion(SchemaMembers member) { - switch (member) { - case SchemaMembers::kAssociatedFileTypeVocabulary: - return Version(1, 0, 1); - default: - TFLITE_LOG(FATAL) << "Unsupported schema member: " - << static_cast(member); - } -} - -// Updates min_version if it precedes the new_version. -inline void UpdateMinimumVersion(const Version& new_version, - Version* min_version) { - if (min_version->Compare(new_version) < 0) { - *min_version = new_version; - } -} - -void UpdateMinimumVersionForAssociatedFile( - const tflite::AssociatedFile* associated_file, Version* min_version) { - if (associated_file == nullptr) return; - - if (associated_file->type() == AssociatedFileType_VOCABULARY) { - UpdateMinimumVersion( - GetMemberVersion(SchemaMembers::kAssociatedFileTypeVocabulary), - min_version); - } -} - -void UpdateMinimumVersionForAssociatedFileArray( - const flatbuffers::Vector>* - associated_files, - Version* min_version) { - if (associated_files == nullptr) return; - - for (int i = 0; i < associated_files->size(); ++i) { - UpdateMinimumVersionForAssociatedFile(associated_files->Get(i), - min_version); - } -} - -void UpdateMinimumVersionForTensorMetadata( - const tflite::TensorMetadata* tensor_metadata, Version* min_version) { - if (tensor_metadata == nullptr) return; - - // Checks the associated_files field. - UpdateMinimumVersionForAssociatedFileArray( - tensor_metadata->associated_files(), min_version); -} - -void UpdateMinimumVersionForTensorMetadataArray( - const flatbuffers::Vector>* - tensor_metadata_array, - Version* min_version) { - if (tensor_metadata_array == nullptr) return; - - for (int i = 0; i < tensor_metadata_array->size(); ++i) { - UpdateMinimumVersionForTensorMetadata(tensor_metadata_array->Get(i), - min_version); - } -} - -void UpdateMinimumVersionForSubGraphMetadata( - const tflite::SubGraphMetadata* subgraph_metadata, Version* min_version) { - if (subgraph_metadata == nullptr) return; - - // Checks in the input/output metadata arrays. - UpdateMinimumVersionForTensorMetadataArray( - subgraph_metadata->input_tensor_metadata(), min_version); - UpdateMinimumVersionForTensorMetadataArray( - subgraph_metadata->output_tensor_metadata(), min_version); - - // Checks the associated_files field. - UpdateMinimumVersionForAssociatedFileArray( - subgraph_metadata->associated_files(), min_version); -} - -void UpdateMinimumVersionForModelMetadata( - const tflite::ModelMetadata& model_metadata, Version* min_version) { - // Checks the subgraph_metadata field. - if (model_metadata.subgraph_metadata() != nullptr) { - for (int i = 0; i < model_metadata.subgraph_metadata()->size(); ++i) { - UpdateMinimumVersionForSubGraphMetadata( - model_metadata.subgraph_metadata()->Get(i), min_version); - } - } - - // Checks the associated_files field. 
- UpdateMinimumVersionForAssociatedFileArray(model_metadata.associated_files(), - min_version); -} - -} // namespace - -TfLiteStatus GetMinimumMetadataParserVersion(const uint8_t* buffer_data, - size_t buffer_size, - std::string* min_version_str) { - flatbuffers::Verifier verifier = - flatbuffers::Verifier(buffer_data, buffer_size); - if (!tflite::VerifyModelMetadataBuffer(verifier)) { - TFLITE_LOG(ERROR) << "The model metadata is not a valid FlatBuffer buffer."; - return kTfLiteError; - } - - static constexpr char kDefaultVersion[] = "1.0.0"; - Version min_version = Version(kDefaultVersion); - - // Checks if any member declared after 1.0.0 (such as those in - // SchemaMembers) exists, and updates min_version accordingly. The minimum - // metadata parser version will be the largest version number of all fields - // that has been added to a metadata flatbuffer - const tflite::ModelMetadata* model_metadata = GetModelMetadata(buffer_data); - - // All tables in the metadata schema should have their dedicated - // UpdateMinimumVersionFor**() methods, respectively. We'll gradually add - // these methods when new fields show up in later schema versions. - // - // UpdateMinimumVersionFor() takes a const pointer of Foo. The pointer - // can be a nullptr if Foo is not populated into the corresponding table of - // the Flatbuffer object. In this case, UpdateMinimumVersionFor() will be - // skipped. An exception is UpdateMinimumVersionForModelMetadata(), where - // ModelMetadata is the root table, and it won't be null. - UpdateMinimumVersionForModelMetadata(*model_metadata, &min_version); - - *min_version_str = min_version.ToString(); - return kTfLiteOk; -} - -} // namespace metadata -} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h deleted file mode 100644 index c4127118bc7..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ - -#include -#include - -#include - -#include "tensorflow/lite/c/common.h" - -namespace tflite { -namespace metadata { - -// Gets the minimum metadata parser version that can fully understand all fields -// in a given metadata flatbuffer. TFLite Metadata follows Semantic Versioning -// 2.0. Each release version has the form MAJOR.MINOR.PATCH. 
-TfLiteStatus GetMinimumMetadataParserVersion(const uint8_t* buffer_data, - size_t buffer_size, - std::string* min_version); - -} // namespace metadata -} // namespace tflite - -#endif // TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ diff --git a/tensorflow/lite/experimental/support/metadata/cc/python/BUILD b/tensorflow/lite/experimental/support/metadata/cc/python/BUILD deleted file mode 100644 index 4128f0ac9d1..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/python/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -load("//tensorflow:tensorflow.bzl", "pybind_extension") - -package( - default_visibility = [ - "//tensorflow/lite/experimental/support/metadata:__pkg__", - ], - licenses = ["notice"], # Apache 2.0 -) - -pybind_extension( - name = "_pywrap_metadata_version", - srcs = [ - "metadata_version.cc", - ], - features = ["-use_header_modules"], - module_name = "_pywrap_metadata_version", - deps = [ - "//tensorflow/lite/c:common", - "//tensorflow/lite/experimental/support/metadata/cc:metadata_version", - "@pybind11", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc b/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc deleted file mode 100644 index 7d1f9d1e122..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" - -#include "pybind11/pybind11.h" -#include "tensorflow/lite/c/common.h" - -namespace tflite { -namespace metadata { - -PYBIND11_MODULE(_pywrap_metadata_version, m) { - m.doc() = R"pbdoc( - _pywrap_metadata_version - A module that returns the minimum metadata parser version of a given - metadata flatbuffer. - )pbdoc"; - - // Using pybind11 type conversions to convert between Python and native - // C++ types. There are other options to provide access to native Python types - // in C++ and vice versa. See the pybind 11 instrcution [1] for more details. - // Type converstions is recommended by pybind11, though the main downside - // is that a copy of the data must be made on every Python to C++ transition: - // this is needed since the C++ and Python versions of the same type generally - // won’t have the same memory layout. 
- // - // [1]: https://pybind11.readthedocs.io/en/stable/advanced/cast/index.html - m.def("GetMinimumMetadataParserVersion", - [](const std::string& buffer_data) -> std::string { - std::string min_version; - if (GetMinimumMetadataParserVersion( - reinterpret_cast(buffer_data.c_str()), - buffer_data.length(), &min_version) != kTfLiteOk) { - pybind11::value_error( - "Error occurred when getting the minimum metadata parser " - "version of the metadata flatbuffer."); - } - return min_version; - }); -} - -} // namespace metadata -} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/BUILD b/tensorflow/lite/experimental/support/metadata/cc/test/BUILD deleted file mode 100644 index f9d78567d70..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/test/BUILD +++ /dev/null @@ -1,24 +0,0 @@ -package( - default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 -) - -cc_test( - name = "metadata_version_test", - srcs = ["metadata_version_test.cc"], - deps = [ - "//tensorflow/lite/experimental/support/metadata:metadata_schema_cc", - "//tensorflow/lite/experimental/support/metadata/cc:metadata_version", - "@com_google_googletest//:gtest_main", - "@flatbuffers", - ], -) - -cc_test( - name = "metadata_parser_test", - srcs = ["metadata_parser_test.cc"], - deps = [ - "//tensorflow/lite/experimental/support/metadata/cc:metadata_version", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc deleted file mode 100644 index af7b8791fe8..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_parser_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/support/metadata/cc/metadata_parser.h" - -#include -#include - -namespace tflite { -namespace metadata { -namespace { - -using ::testing::MatchesRegex; - -TEST(MetadataParserTest, MatadataParserVersionIsWellFormed) { - // Validates that the version is well-formed (x.y.z). - EXPECT_THAT(kMatadataParserVersion, MatchesRegex("[0-9]+\\.[0-9]+\\.[0-9]+")); -} - -} // namespace -} // namespace metadata -} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc deleted file mode 100644 index 03f4d3bf28b..00000000000 --- a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" - -#include - -#include -#include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/experimental/support/metadata/metadata_schema_generated.h" - -namespace tflite { -namespace metadata { -namespace { - -using ::testing::MatchesRegex; -using ::testing::StrEq; - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionSucceedsWithValidMetadata) { - // Creates a dummy metadata flatbuffer for test. - flatbuffers::FlatBufferBuilder builder(1024); - auto name = builder.CreateString("Foo"); - ModelMetadataBuilder metadata_builder(builder); - metadata_builder.add_name(name); - auto metadata = metadata_builder.Finish(); - FinishModelMetadataBuffer(builder, metadata); - - // Gets the mimimum metadata parser version. - std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteOk); - // Validates that the version is well-formed (x.y.z). - EXPECT_THAT(min_version, MatchesRegex("[0-9]+\\.[0-9]+\\.[0-9]+")); -} - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionFailsWithInvalidIdentifier) { - // Creates a dummy metadata flatbuffer without identifier. - flatbuffers::FlatBufferBuilder builder(1024); - ModelMetadataBuilder metadata_builder(builder); - auto metadata = metadata_builder.Finish(); - builder.Finish(metadata); - - // Gets the mimimum metadata parser version and triggers error. - std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteError); - EXPECT_TRUE(min_version.empty()); -} - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionForModelMetadataVocabAssociatedFiles) { - // Creates a metadata flatbuffer with the field, - // ModelMetadata.associated_fiels, populated with the vocabulary file type. - flatbuffers::FlatBufferBuilder builder(1024); - AssociatedFileBuilder associated_file_builder(builder); - associated_file_builder.add_type(tflite::AssociatedFileType_VOCABULARY); - auto associated_files = - builder.CreateVector(std::vector>{ - associated_file_builder.Finish()}); - ModelMetadataBuilder metadata_builder(builder); - metadata_builder.add_associated_files(associated_files); - FinishModelMetadataBuffer(builder, metadata_builder.Finish()); - - // Gets the mimimum metadata parser version. - std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteOk); - // Validates that the version is exactly 1.0.1. - EXPECT_THAT(min_version, StrEq("1.0.1")); -} - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionForSubGraphMetadataVocabAssociatedFiles) { - // Creates a metadata flatbuffer with the field, - // SubGraphMetadata.associated_fiels, populated with the vocabulary file type. 
- flatbuffers::FlatBufferBuilder builder(1024); - AssociatedFileBuilder associated_file_builder(builder); - associated_file_builder.add_type(tflite::AssociatedFileType_VOCABULARY); - auto associated_files = - builder.CreateVector(std::vector>{ - associated_file_builder.Finish()}); - SubGraphMetadataBuilder subgraph_builder(builder); - subgraph_builder.add_associated_files(associated_files); - auto subgraphs = - builder.CreateVector(std::vector>{ - subgraph_builder.Finish()}); - ModelMetadataBuilder metadata_builder(builder); - metadata_builder.add_subgraph_metadata(subgraphs); - FinishModelMetadataBuffer(builder, metadata_builder.Finish()); - - // Gets the mimimum metadata parser version. - std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteOk); - // Validates that the version is exactly 1.0.1. - EXPECT_THAT(min_version, StrEq("1.0.1")); -} - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionForInputMetadataVocabAssociatedFiles) { - // Creates a metadata flatbuffer with the field, - // SubGraphMetadata.input_tensor_metadata.associated_fiels, populated with the - // vocabulary file type. - flatbuffers::FlatBufferBuilder builder(1024); - AssociatedFileBuilder associated_file_builder(builder); - associated_file_builder.add_type(tflite::AssociatedFileType_VOCABULARY); - auto associated_files = - builder.CreateVector(std::vector>{ - associated_file_builder.Finish()}); - TensorMetadataBuilder tensor_builder(builder); - tensor_builder.add_associated_files(associated_files); - auto tensors = - builder.CreateVector(std::vector>{ - tensor_builder.Finish()}); - SubGraphMetadataBuilder subgraph_builder(builder); - subgraph_builder.add_input_tensor_metadata(tensors); - auto subgraphs = - builder.CreateVector(std::vector>{ - subgraph_builder.Finish()}); - ModelMetadataBuilder metadata_builder(builder); - metadata_builder.add_subgraph_metadata(subgraphs); - FinishModelMetadataBuffer(builder, metadata_builder.Finish()); - - // Gets the mimimum metadata parser version. - std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteOk); - // Validates that the version is exactly 1.0.1. - EXPECT_THAT(min_version, StrEq("1.0.1")); -} - -TEST(MetadataVersionTest, - GetMinimumMetadataParserVersionForOutputMetadataVocabAssociatedFiles) { - // Creates a metadata flatbuffer with the field, - // SubGraphMetadata.output_tensor_metadata.associated_fiels, populated with - // the vocabulary file type. - flatbuffers::FlatBufferBuilder builder(1024); - AssociatedFileBuilder associated_file_builder(builder); - associated_file_builder.add_type(tflite::AssociatedFileType_VOCABULARY); - auto associated_files = - builder.CreateVector(std::vector>{ - associated_file_builder.Finish()}); - TensorMetadataBuilder tensor_builder(builder); - tensor_builder.add_associated_files(associated_files); - auto tensors = - builder.CreateVector(std::vector>{ - tensor_builder.Finish()}); - SubGraphMetadataBuilder subgraph_builder(builder); - subgraph_builder.add_output_tensor_metadata(tensors); - auto subgraphs = - builder.CreateVector(std::vector>{ - subgraph_builder.Finish()}); - ModelMetadataBuilder metadata_builder(builder); - metadata_builder.add_subgraph_metadata(subgraphs); - FinishModelMetadataBuffer(builder, metadata_builder.Finish()); - - // Gets the mimimum metadata parser version. 
- std::string min_version; - EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), - builder.GetSize(), &min_version), - kTfLiteOk); - // Validates that the version is exactly 1.0.1. - EXPECT_EQ(min_version, "1.0.1"); -} - -} // namespace -} // namespace metadata -} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/BUILD b/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/BUILD deleted file mode 100644 index ca9a79b7451..00000000000 --- a/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/BUILD +++ /dev/null @@ -1,23 +0,0 @@ -load("//tensorflow:tensorflow.bzl", "pybind_extension") - -package( - default_visibility = [ - "//visibility:public", - ], - licenses = ["notice"], # Apache 2.0 -) - -pybind_extension( - name = "_pywrap_flatbuffers", - srcs = [ - "flatbuffers_lib.cc", - ], - features = ["-use_header_modules"], - module_name = "_pywrap_flatbuffers", - deps = [ - "//tensorflow/python:pybind11_lib", - "//third_party/python_runtime:headers", - "@flatbuffers", - "@pybind11", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/flatbuffers_lib.cc b/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/flatbuffers_lib.cc deleted file mode 100644 index 6185722504f..00000000000 --- a/tensorflow/lite/experimental/support/metadata/flatbuffers_lib/flatbuffers_lib.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "flatbuffers/idl.h" // from @flatbuffers -#include "pybind11/pybind11.h" -#include "pybind11/pytypes.h" -#include "pybind11/stl.h" - -namespace tflite { -namespace support { - -PYBIND11_MODULE(_pywrap_flatbuffers, m) { - pybind11::class_(m, "IDLOptions") - .def(pybind11::init<>()) - .def_readwrite("strict_json", &flatbuffers::IDLOptions::strict_json); - pybind11::class_(m, "Parser") - .def(pybind11::init()) - .def("parse", - [](flatbuffers::Parser* self, const std::string& source) { - return self->Parse(source.c_str()); - }) - .def_readonly("builder", &flatbuffers::Parser::builder_) - .def_readonly("error", &flatbuffers::Parser::error_); - pybind11::class_(m, "FlatBufferBuilder") - .def("clear", &flatbuffers::FlatBufferBuilder::Clear) - .def("push_flat_buffer", [](flatbuffers::FlatBufferBuilder* self, - const std::string& contents) { - self->PushFlatBuffer(reinterpret_cast(contents.c_str()), - contents.length()); - }); - m.def("generate_text_file", &flatbuffers::GenerateTextFile); - m.def( - "generate_text", - [](const flatbuffers::Parser& parser, - const std::string& buffer) -> std::string { - std::string text; - if (!flatbuffers::GenerateText( - parser, reinterpret_cast(buffer.c_str()), &text)) { - return ""; - } - return text; - }); -} - -} // namespace support -} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/java/AndroidManifest.xml b/tensorflow/lite/experimental/support/metadata/java/AndroidManifest.xml deleted file mode 100644 index b2e22628db6..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/AndroidManifest.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/tensorflow/lite/experimental/support/metadata/java/BUILD b/tensorflow/lite/experimental/support/metadata/java/BUILD deleted file mode 100644 index 00d10bcca56..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/BUILD +++ /dev/null @@ -1,40 +0,0 @@ -# Description: -# TensorFlow Lite Support API in Java for metadata. 
- -load("@build_bazel_rules_android//android:rules.bzl", "android_library") -load("//tensorflow/java:build_defs.bzl", "JAVACOPTS") - -package( - default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 -) - -METADATA_SRCS = glob( - ["src/java/org/tensorflow/lite/support/metadata/**/*.java"], -) - -android_library( - name = "tensorflow-lite-support-metadata", - srcs = METADATA_SRCS, - manifest = "AndroidManifest.xml", - deps = [ - "//tensorflow/lite/experimental/support/metadata:metadata_schema_fbs_android", - "//tensorflow/lite/experimental/support/metadata:schema_fbs_android", - "@org_checkerframework_qual", - ], -) - -java_library( - name = "tensorflow-lite-support-metadata-lib", - srcs = METADATA_SRCS, - javacopts = JAVACOPTS, - resource_jars = [ - "//tensorflow/lite/experimental/support/metadata:libmetadata_schema_java.jar", - "//tensorflow/lite/experimental/support/metadata:libschema_fbs_java.jar", - ], - deps = [ - "//tensorflow/lite/experimental/support/metadata:metadata_schema_java", - "//tensorflow/lite/experimental/support/metadata:schema_fbs_java", - "@org_checkerframework_qual", - ], -) diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/BoundedInputStream.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/BoundedInputStream.java deleted file mode 100644 index 6c3d23270f3..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/BoundedInputStream.java +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; -import static org.tensorflow.lite.support.metadata.Preconditions.checkElementIndex; -import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; - -/** - * An {@link InputStream} that wraps a section of a {@link SeekableByteChannelCompat}. - * - *

WARNING: Similar as {@link InputStream}, instances of an {@link BoundedInputStream} are - * not thread-safe. If multiple threads concurrently reading from the same {@link - * BoundedInputStream}, it must be synchronized externally. Also, if multiple instances of {@link - * BoundedInputStream} are created on the same {@link SeekableByteChannelCompat}, it must be - * synchronized as well. - */ -final class BoundedInputStream extends InputStream { - private final ByteBuffer singleByteBuffer = ByteBuffer.allocate(1); - private final long end; // The valid data for the stream is between [start, end). - private long position; - private final SeekableByteChannelCompat channel; - - /** - * Creates a {@link BoundedInputStream} with a {@link SeekableByteChannelCompat}. - * - * @param channel the {@link SeekableByteChannelCompat} that backs up this {@link - * BoundedInputStream} - * @param start the starting position of this {@link BoundedInputStream} in the given {@link - * SeekableByteChannelCompat} - * @param remaining the length of this {@link BoundedInputStream} - * @throws IllegalArgumentException if {@code start} or {@code remaining} is negative - */ - BoundedInputStream(SeekableByteChannelCompat channel, long start, long remaining) { - checkArgument( - remaining >= 0 && start >= 0, - String.format("Invalid length of stream at offset=%d, length=%d", start, remaining)); - - end = start + remaining; - this.channel = channel; - position = start; - } - - @Override - public int available() throws IOException { - return (int) (Math.min(end, channel.size()) - position); - } - - @Override - public int read() throws IOException { - if (position >= end) { - return -1; - } - - singleByteBuffer.rewind(); - int count = read(position, singleByteBuffer); - if (count < 0) { - return count; - } - - position++; - return singleByteBuffer.get() & 0xff; - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - checkNotNull(b); - checkElementIndex(off, b.length, "The start offset"); - checkElementIndex(len, b.length - off + 1, "The maximumn number of bytes to read"); - - if (len == 0) { - return 0; - } - - if (len > end - position) { - if (position >= end) { - return -1; - } - len = (int) (end - position); - } - - ByteBuffer buf = ByteBuffer.wrap(b, off, len); - int count = read(position, buf); - if (count > 0) { - position += count; - } - return count; - } - - private int read(long position, ByteBuffer buf) throws IOException { - int count; - synchronized (channel) { - channel.position(position); - count = channel.read(buf); - } - buf.flip(); - return count; - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ByteBufferChannel.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ByteBufferChannel.java deleted file mode 100644 index e5d54a415ed..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ByteBufferChannel.java +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static java.lang.Math.min; -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; -import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; - -import java.nio.ByteBuffer; -import java.nio.channels.NonWritableChannelException; - -/** Implements the {@link SeekableByteChannelCompat} on top of {@link ByteBuffer}. */ -final class ByteBufferChannel implements SeekableByteChannelCompat { - - /** The ByteBuffer that holds the data. */ - private final ByteBuffer buffer; - - /** - * Creates a {@link ByteBufferChannel} that wraps a {@link ByteBuffer}. - * - * @param buffer the {@link ByteBuffer} that backs this {@link ByteBufferChannel} - * @throws NullPointerException if {@code buffer} is null - */ - public ByteBufferChannel(ByteBuffer buffer) { - checkNotNull(buffer, "The ByteBuffer cannot be null."); - this.buffer = buffer; - } - - @Override - public void close() {} - - @Override - public boolean isOpen() { - return true; - } - - @Override - public long position() { - return buffer.position(); - } - - /** - * Sets this channel's position. - * - * @param newPosition the new position, a non-negative integer counting the number of bytes from - * the beginning of the entity - * @return this channel - * @throws IllegalArgumentException if the new position is negative, or greater than the size of - * the underlying {@link ByteBuffer}, or greater than Integer.MAX_VALUE - */ - @Override - public synchronized ByteBufferChannel position(long newPosition) { - checkArgument( - (newPosition >= 0 && newPosition <= Integer.MAX_VALUE), - "The new position should be non-negative and be less than Integer.MAX_VALUE."); - buffer.position((int) newPosition); - return this; - } - - /** - * {@inheritDoc} - * - *

Bytes are read starting at this channel's current position, and then the position is updated - * with the number of bytes actually read. Otherwise this method behaves exactly as specified in - * the {@link ReadableByteChannel} interface. - */ - @Override - public synchronized int read(ByteBuffer dst) { - if (buffer.remaining() == 0) { - return -1; - } - - int count = min(dst.remaining(), buffer.remaining()); - if (count > 0) { - ByteBuffer tempBuffer = buffer.slice(); - tempBuffer.order(buffer.order()).limit(count); - dst.put(tempBuffer); - buffer.position(buffer.position() + count); - } - return count; - } - - @Override - public long size() { - return buffer.limit(); - } - - @Override - public synchronized ByteBufferChannel truncate(long size) { - checkArgument( - (size >= 0 && size <= Integer.MAX_VALUE), - "The new size should be non-negative and be less than Integer.MAX_VALUE."); - - if (size < buffer.limit()) { - buffer.limit((int) size); - if (buffer.position() > size) { - buffer.position((int) size); - } - } - return this; - } - - @Override - public synchronized int write(ByteBuffer src) { - if (buffer.isReadOnly()) { - throw new NonWritableChannelException(); - } - - int count = min(src.remaining(), buffer.remaining()); - if (count > 0) { - ByteBuffer tempBuffer = src.slice(); - tempBuffer.order(buffer.order()).limit(count); - buffer.put(tempBuffer); - } - return count; - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java deleted file mode 100644 index 9bf5ae93138..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java +++ /dev/null @@ -1,368 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.zip.ZipException; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.schema.Tensor; -import org.tensorflow.lite.support.metadata.schema.ModelMetadata; -import org.tensorflow.lite.support.metadata.schema.TensorMetadata; - -/** - * Loads metadata from TFLite Model FlatBuffer. - * - *

TFLite Model FlatBuffer can be generated using the TFLite - * Model schema file. - * - *

Some models contain a TFLite Metadata Flatbuffer, which records more information about what
- * the model does and how to interpret the model. TFLite Metadata Flatbuffer can be generated using
- * the TFLite
- * Metadata schema file.
- *
- *

It is allowed to pass in a model FlatBuffer without TFLite metadata. However, invoking methods - * that read from TFLite metadata will cause runtime errors. - * - *

Similarly, it is allowed to pass in a model FlatBuffer without associated files. However, - * invoking methods that read the associated files will cause runtime errors. - * - *
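A minimal usage sketch of the extractor described above, assuming modelBuffer already holds a TFLite model FlatBuffer (for example, a memory-mapped .tflite file); only the metadata-backed getters need the hasMetadata() guard:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import org.tensorflow.lite.support.metadata.MetadataExtractor;

    class MetadataExtractorSketch {
      static void printTensorCounts(ByteBuffer modelBuffer) throws IOException {
        MetadataExtractor extractor = new MetadataExtractor(modelBuffer);
        // Tensor counts come from the model FlatBuffer itself, so no guard is needed here.
        System.out.println("inputs: " + extractor.getInputTensorCount());
        System.out.println("outputs: " + extractor.getOutputTensorCount());
        // The metadata FlatBuffer is optional; guard metadata reads to avoid IllegalStateException.
        if (extractor.hasMetadata()) {
          System.out.println("model metadata is present");
        }
      }
    }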

Though TFLite model FlatBuffer supports multiple subgraphs, TFLite Interpreter only supports a - * single subgraph so far. See the instruction - * of how to specify subgraph during convertion for more information. Therefore, {@link - * MetadataExtractor} omits subgraph index as an input in its methods. - */ -public class MetadataExtractor { - - /** The helper class to load metadata from TFLite model FlatBuffer. */ - private final ModelInfo modelInfo; - - /** The helper class to load metadata from TFLite metadata FlatBuffer. */ - @Nullable private final ModelMetadataInfo metadataInfo; - - /** The handler to load associated files through zip. */ - @Nullable private final ZipFile zipFile; - - /** - * Creates a {@link MetadataExtractor} with TFLite model FlatBuffer. - * - * @param buffer the TFLite model FlatBuffer - * @throws IllegalArgumentException if the number of input or output tensors in the model does not - * match that in the metadata - * @throws IOException if an error occurs while reading the model as a Zip file - */ - public MetadataExtractor(ByteBuffer buffer) throws IOException { - modelInfo = new ModelInfo(buffer); - ByteBuffer metadataBuffer = modelInfo.getMetadataBuffer(); - if (metadataBuffer != null) { - metadataInfo = new ModelMetadataInfo(metadataBuffer); - - // Prints warning message if the minimum parser version is not satisfied. - if (!isMinimumParserVersionSatisfied()) { - System.err.printf( - " Some fields in the metadata belong to a future schema. The minimum parser" - + " version required is %s, but the version of the current metadata parser is %s", - metadataInfo.getMininumParserVersion(), MetadataParser.VERSION); - } - - checkArgument( - modelInfo.getInputTensorCount() == metadataInfo.getInputTensorCount(), - String.format( - "The number of input tensors in the model is %d. The number of input tensors that" - + " recorded in the metadata is %d. These two values does not match.", - modelInfo.getInputTensorCount(), metadataInfo.getInputTensorCount())); - checkArgument( - modelInfo.getOutputTensorCount() == metadataInfo.getOutputTensorCount(), - String.format( - "The number of output tensors in the model is %d. The number of output tensors that" - + " recorded in the metadata is %d. These two values does not match.", - modelInfo.getOutputTensorCount(), metadataInfo.getOutputTensorCount())); - } else { - // It is allowed to pass in a model FlatBuffer without TFLite metadata. However, invoking - // methods that read from TFLite metadata will cause runtime errors. - metadataInfo = null; - } - - zipFile = createZipFile(buffer); - } - - /** - * Quantization parameters that corresponds to the table, {@code QuantizationParameters}, in the - * TFLite - * Model schema file. - * - *

Since per-channel quantization does not apply to input and output tensors, {@code scale} and - * {@code zero_point} are both single values instead of arrays. - * - *

For tensors that are not quantized, the values of scale and zero_point are both 0.
- *
- *

Given a quantized value q, the corresponding float value f should be:
- * f = scale * (q - zero_point)
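A short sketch of this formula in code, assuming an extractor has already been constructed and q is a raw value read from input tensor 0 (getScale() and getZeroPoint() are defined just below):

    // Dequantize a raw quantized value from input tensor 0: f = scale * (q - zero_point).
    static float dequantize(MetadataExtractor extractor, int q) {
      MetadataExtractor.QuantizationParams params = extractor.getInputTensorQuantizationParams(0);
      // For example, scale = 0.0078125f and zeroPoint = 128 map q = 131 to f = 0.0234375f.
      return params.getScale() * (q - params.getZeroPoint());
    }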
- */ - public static class QuantizationParams { - /** The scale value used in quantization. */ - private final float scale; - /** The zero point value used in quantization. */ - private final int zeroPoint; - - /** - * Creates a {@link QuantizationParams} with {@code scale} and {@code zero_point}. - * - * @param scale The scale value used in quantization. - * @param zeroPoint The zero point value used in quantization. - */ - public QuantizationParams(final float scale, final int zeroPoint) { - this.scale = scale; - this.zeroPoint = zeroPoint; - } - - /** Returns the scale value. */ - public float getScale() { - return scale; - } - - /** Returns the zero point value. */ - public int getZeroPoint() { - return zeroPoint; - } - } - - /** Returns {@code true} if the model has metadata. Otherwise, returns {@code false}. */ - public boolean hasMetadata() { - return metadataInfo != null; - } - - /** - * Gets the packed associated file with the specified {@code fileName}. - * - * @param fileName the name of the associated file - * @return the raw input stream containing specified file - * @throws IllegalStateException if the model is not a zip file - * @throws IllegalArgumentException if the specified file does not exist in the model - */ - public InputStream getAssociatedFile(String fileName) { - assertZipFile(); - return zipFile.getRawInputStream(fileName); - } - - /** Gets the count of input tensors in the model. */ - public int getInputTensorCount() { - return modelInfo.getInputTensorCount(); - } - - /** - * Gets the metadata for the input tensor specified by {@code inputIndex}. - * - * @param inputIndex the index of the desired input tensor - * @throws IllegalStateException if this model does not contain model metadata - */ - @Nullable - public TensorMetadata getInputTensorMetadata(int inputIndex) { - assertMetadataInfo(); - return metadataInfo.getInputTensorMetadata(inputIndex); - } - - /** - * Gets the quantization parameters for the input tensor specified by {@code inputIndex}. - * - * @param inputIndex the index of the desired input tensor - */ - public QuantizationParams getInputTensorQuantizationParams(int inputIndex) { - Tensor tensor = modelInfo.getInputTensor(inputIndex); - return modelInfo.getQuantizationParams(tensor); - } - - /** - * Gets the shape of the input tensor with {@code inputIndex}. - * - * @param inputIndex the index of the desired input tensor - */ - public int[] getInputTensorShape(int inputIndex) { - return modelInfo.getInputTensorShape(inputIndex); - } - - /** - * Gets the {@link TensorType} of the input tensor with {@code inputIndex}. - * - * @param inputIndex the index of the desired input tensor - */ - public byte getInputTensorType(int inputIndex) { - return modelInfo.getInputTensorType(inputIndex); - } - - /** - * Gets the root handler for the model metadata. - * - * @throws IllegalStateException if this model does not contain model metadata - */ - public ModelMetadata getModelMetadata() { - assertMetadataInfo(); - return metadataInfo.getModelMetadata(); - } - - /** Gets the count of output tensors in the model. */ - public int getOutputTensorCount() { - return modelInfo.getOutputTensorCount(); - } - - /** - * Gets the metadata for the output tensor specified by {@code outputIndex}. 
- * - * @param outputIndex the index of the desired output tensor - * @throws IllegalStateException if this model does not contain model metadata - */ - @Nullable - public TensorMetadata getOutputTensorMetadata(int outputIndex) { - assertMetadataInfo(); - return metadataInfo.getOutputTensorMetadata(outputIndex); - } - - /** - * Gets the quantization parameters for the output tensor specified by {@code outputIndex}. - * - * @param outputIndex the index of the desired output tensor - */ - public QuantizationParams getOutputTensorQuantizationParams(int outputIndex) { - Tensor tensor = modelInfo.getOutputTensor(outputIndex); - return modelInfo.getQuantizationParams(tensor); - } - - /** - * Gets the shape of the output tensor with {@code outputIndex}. - * - * @param outputIndex the index of the desired output tensor - */ - public int[] getOutputTensorShape(int outputIndex) { - return modelInfo.getOutputTensorShape(outputIndex); - } - - /** - * Gets the {@link TensorType} of the output tensor with {@code outputIndex}. - * - * @param outputIndex the index of the desired output tensor - */ - public byte getOutputTensorType(int outputIndex) { - return modelInfo.getOutputTensorType(outputIndex); - } - - /** - * Returns {@code true} if the minimum parser version required by the given metadata flatbuffer - * precedes or equals to the version of the metadata parser that this MetadataExtractor library is - * relying on. All fields in the metadata can be parsed correctly with this metadata extractor - * library in this case. Otherwise, it returns {@code false}. - * - *

For example, assume the underlying metadata parser version is {@code 1.14.1}, - * - *

    - *
  • it returns {@code true}, if the required minimum parser version is the same or older, - * such as {@code 1.14.1} or {@code 1.14.0}. Null version precedes all numeric versions, - * because some metadata flatbuffers are generated before the first versioned release;
    - *
  • it returns {@code false}, if the required minimum parser version is newer, such as {@code - * 1.14.2}. - *
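A one-method sketch of how a caller might combine this check with hasMetadata() before trusting every metadata field (hasMetadata() guards the case where no metadata FlatBuffer is packed at all):

    // Returns true only when metadata exists and none of its fields require a newer parser.
    static boolean canParseAllMetadataFields(MetadataExtractor extractor) {
      return extractor.hasMetadata() && extractor.isMinimumParserVersionSatisfied();
    }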
- */ - public final boolean isMinimumParserVersionSatisfied() { - String minVersion = metadataInfo.getMininumParserVersion(); - if (minVersion == null) { - return true; - } - return compareVersions(minVersion, MetadataParser.VERSION) <= 0; - } - - /** - * Asserts if {@link #metadataInfo} is not initialized. Some models may not have metadata and this - * is allowed. However, invoking methods that reads the metadata is not allowed. - * - * @throws IllegalStateException if this model does not contain model metadata - */ - private void assertMetadataInfo() { - if (metadataInfo == null) { - throw new IllegalStateException("This model does not contain model metadata."); - } - } - - /** - * Asserts if {@link #zipFile} is not initialized. Some models may not have associated files, thus - * are not Zip files. This is allowed. However, invoking methods that reads those associated files - * is not allowed. - * - * @throws IllegalStateException if this model is not a Zip file - */ - private void assertZipFile() { - if (zipFile == null) { - throw new IllegalStateException( - "This model does not contain associated files, and is not a Zip file."); - } - } - - /** - * Creates a Zip file handler to read the associated files. If the model is not a zip file, i.e. - * it does not have associated files, return a null handler. - * - * @param buffer the TFLite model FlatBuffer - * @throws IOException if an error occurs while reading the model as a Zip file - */ - @Nullable - private static ZipFile createZipFile(ByteBuffer buffer) throws IOException { - try { - // Creates the handler to hold the associated files through the Zip. - ByteBufferChannel byteBufferChannel = new ByteBufferChannel(buffer); - return ZipFile.createFrom(byteBufferChannel); - } catch (ZipException e) { - // Some models may not have associate files. Therefore, Those models are not zip files. - // However, invoking methods that read associated files later will lead into errors. - return null; - } - } - - /** - * Compares two semantic version numbers. - * - *

Examples of comparing two versions:
- * {@code 1.9} precedes {@code 1.14};
- * {@code 1.14} precedes {@code 1.14.1};
- * {@code 1.14} and {@code 1.14.0} are euqal; - * - * @return the value {@code 0} if the two versions are equal; a value less than {@code 0} if - * {@code version1} precedes {@code version2}; a value greater than {@code 0} if {@code - * version2} precedes {@code version1}. - */ - private static int compareVersions(String version1, String version2) { - // Using String.split instead of the recommanded Guava Splitter because we've been avoiding - // depending on other third party libraries in this project. - String[] levels1 = version1.split("\\.", 0); - String[] levels2 = version2.split("\\.", 0); - - int length = Math.max(levels1.length, levels2.length); - for (int i = 0; i < length; i++) { - Integer v1 = i < levels1.length ? Integer.parseInt(levels1[i]) : 0; - Integer v2 = i < levels2.length ? Integer.parseInt(levels2[i]) : 0; - int compare = v1.compareTo(v2); - if (compare != 0) { - return compare; - } - } - - return 0; - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java deleted file mode 100644 index 195a330462b..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataParser.java +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -/** Information about the metadata parser that this metadata extractor library is depending on. */ -public final class MetadataParser { - /** - * The version of the metadata parser that this metadata extractor library is depending on. The - * value should match the value of "Schema Semantic version" in metadata_schema.fbs. - */ - public static final String VERSION = "1.0.1"; - - private MetadataParser() {} -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java deleted file mode 100644 index 309a3dbe774..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; -import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.schema.Buffer; -import org.tensorflow.lite.schema.Metadata; -import org.tensorflow.lite.schema.Model; -import org.tensorflow.lite.schema.QuantizationParameters; -import org.tensorflow.lite.schema.SubGraph; -import org.tensorflow.lite.schema.Tensor; -import org.tensorflow.lite.schema.TensorType; -import org.tensorflow.lite.support.metadata.MetadataExtractor.QuantizationParams; - -/** Extracts model information out of TFLite model FLatBuffer. */ -final class ModelInfo { - /** The model that is loaded from TFLite model FlatBuffer. */ - private final Model model; - - /** A list of input tensors. */ - private final List inputTensors; - - /** A list of output tensors. */ - private final List outputTensors; - - /** Identifier of the TFLite model metadata in the Metadata array. */ - static final String METADATA_FIELD_NAME = "TFLITE_METADATA"; - - /** - * Creates a {@link ModelInfo} with the model FlatBuffer, {@code buffer}. - * - *

Though TFLite model FlatBuffer supports multiple subgraphs, TFLite Interpreter only supports - * single subgraph so far. See the instruction - * of how to specify subgraph during convertion for more information. Therefore, all methods - * in {@link ModelInfo} retrieves metadata of the first subgrpah as default. - * - * @param buffer the TFLite model FlatBuffer - * @throws NullPointerException if {@code buffer} is null - * @throws IllegalArgumentException if the model does not contain any subgraph, or the model does - * not contain the expected identifier - */ - ModelInfo(ByteBuffer buffer) { - assertTFLiteModel(buffer); - - model = Model.getRootAsModel(buffer); - checkArgument(model.subgraphsLength() > 0, "The model does not contain any subgraph."); - - inputTensors = getInputTensors(model); - outputTensors = getOutputTensors(model); - } - - /** - * Gets the input tensor with {@code inputIndex}. - * - * @param inputIndex The index of the desired input tensor. - * @throws IllegalArgumentException if the inputIndex specified is invalid. - */ - @Nullable - Tensor getInputTensor(int inputIndex) { - checkArgument( - inputIndex >= 0 && inputIndex < inputTensors.size(), - "The inputIndex specified is invalid."); - return inputTensors.get(inputIndex); - } - - int getInputTensorCount() { - return inputTensors.size(); - } - - /** - * Gets shape of the input tensor with {@code inputIndex}. - * - * @param inputIndex The index of the desired intput tensor. - */ - int[] getInputTensorShape(int inputIndex) { - Tensor tensor = getInputTensor(inputIndex); - return getShape(tensor); - } - - /** - * Gets the {@link TensorType} in byte of the input tensor with {@code inputIndex}. - * - * @param inputIndex The index of the desired intput tensor. - */ - byte getInputTensorType(int inputIndex) { - return getInputTensor(inputIndex).type(); - } - - /** Gets the metadata FlatBuffer from the model FlatBuffer. */ - @Nullable - ByteBuffer getMetadataBuffer() { - // Some models may not have metadata, and this is allowed. - if (model.metadataLength() == 0) { - return null; - } - - for (int i = 0; i < model.metadataLength(); i++) { - Metadata meta = model.metadata(i); - if (METADATA_FIELD_NAME.equals(meta.name())) { - long bufferIndex = meta.buffer(); - Buffer metadataBuf = model.buffers((int) bufferIndex); - return metadataBuf.dataAsByteBuffer(); - } - } - return null; - } - - /** - * Gets the output tensor with {@code outputIndex}. - * - * @param outputIndex The index of the desired outtput tensor. - * @throws IllegalArgumentException if the outputIndex specified is invalid. - */ - @Nullable - Tensor getOutputTensor(int outputIndex) { - checkArgument( - outputIndex >= 0 && outputIndex < outputTensors.size(), - "The outputIndex specified is invalid."); - return outputTensors.get(outputIndex); - } - - int getOutputTensorCount() { - return outputTensors.size(); - } - - /** - * Gets shape of the output tensor with {@code outputIndex}. - * - * @param outputIndex The index of the desired outtput tensor. - */ - int[] getOutputTensorShape(int outputIndex) { - Tensor tensor = getOutputTensor(outputIndex); - return getShape(tensor); - } - - /** - * Gets the {@link TensorType} in byte of the output tensor {@code outputIndex}. - * - * @param outputIndex The index of the desired outtput tensor. - */ - byte getOutputTensorType(int outputIndex) { - return getOutputTensor(outputIndex).type(); - } - - /** - * Gets the quantization parameters of a tensor. - * - *

Only quantized tensors have valid {@code QuantizationParameters}. For tensor that are not - * quantized, the values of scale and zero_point are both 0. - * - * @param tensor The tensor whoes quantization parameters is desired. - * @throws NullPointerException if the tensor is null. - * @throws IllegalArgumentException if {@code scale} and {@code zeroPoint} of the tensor's {@link - * QuantizationParameters} are not single values. - */ - QuantizationParams getQuantizationParams(Tensor tensor) { - checkNotNull(tensor, "Tensor cannot be null."); - - float scale; - int zeroPoint; - QuantizationParameters quantization = tensor.quantization(); - - // Tensors that are not quantized do not have quantization parameters, which can be null when - // being extracted from the flatbuffer. - if (quantization == null) { - scale = 0.0f; - zeroPoint = 0; - return new QuantizationParams(scale, zeroPoint); - } - - // Tensors that are not quantized do not have quantization parameters. - // quantization.scaleLength() and quantization.zeroPointLength() may both return 0. - checkArgument( - quantization.scaleLength() <= 1, - "Input and output tensors do not support per-channel quantization."); - checkArgument( - quantization.zeroPointLength() <= 1, - "Input and output tensors do not support per-channel quantization."); - - // For tensors that are not quantized, quantization.scale(0) and quantization.zeroPoint(0) will - // both be the default value in flatbuffer, 0. This behavior is consistent with the TFlite C++ - // runtime. - scale = quantization.scale(0); - // zeroPoint is a long value in the schema, but an integer in the C++ runtime. Here we keep it - // consistent with the C++ runtime. - zeroPoint = (int) quantization.zeroPoint(0); - - return new QuantizationParams(scale, zeroPoint); - } - - /** - * Verifies if the buffer is a valid TFLite model. - * - * @param buffer the TFLite model flatbuffer - * @throws NullPointerException if {@code buffer} is null. - * @throws IllegalArgumentException if {@code buffer} does not contain the expected identifier - */ - private static void assertTFLiteModel(ByteBuffer buffer) { - checkNotNull(buffer, "Model flatbuffer cannot be null."); - checkArgument( - Model.ModelBufferHasIdentifier(buffer), - "The identifier of the model is invalid. The buffer may not be a valid TFLite model" - + " flatbuffer."); - } - - /** - * Gets the shape of a tensor. - * - * @param tensor The tensor whoes shape is desired. - * @throws NullPointerException if the tensor is null. - */ - private static int[] getShape(Tensor tensor) { - checkNotNull(tensor, "Tensor cannot be null."); - int shapeDim = tensor.shapeLength(); - int[] tensorShape = new int[shapeDim]; - for (int i = 0; i < shapeDim; i++) { - tensorShape[i] = tensor.shape(i); - } - return tensorShape; - } - - /** Gets input tensors from a model. */ - private static List getInputTensors(Model model) { - // TFLite only support one subgraph currently. - SubGraph subgraph = model.subgraphs(0); - int tensorNum = subgraph.inputsLength(); - ArrayList inputTensors = new ArrayList<>(tensorNum); - for (int i = 0; i < tensorNum; i++) { - inputTensors.add(subgraph.tensors(subgraph.inputs(i))); - } - return Collections.unmodifiableList(inputTensors); - } - - /** Gets output tensors from a model. */ - private static List getOutputTensors(Model model) { - // TFLite only support one subgraph currently. 
- SubGraph subgraph = model.subgraphs(0); - int tensorNum = subgraph.outputsLength(); - ArrayList outputTensors = new ArrayList<>(tensorNum); - for (int i = 0; i < tensorNum; i++) { - outputTensors.add(subgraph.tensors(subgraph.outputs(i))); - } - return Collections.unmodifiableList(outputTensors); - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java deleted file mode 100644 index 751ed500dc2..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; -import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.support.metadata.schema.ModelMetadata; -import org.tensorflow.lite.support.metadata.schema.SubGraphMetadata; -import org.tensorflow.lite.support.metadata.schema.TensorMetadata; - -/** Extracts model metadata information out of TFLite metadata FlatBuffer. */ -final class ModelMetadataInfo { - /** The root handler for the model metadata. */ - private final ModelMetadata modelMetadata; - - /** Metadata array of input tensors. */ - private final List inputsMetadata; - - /** Metadata array of output tensors. */ - private final List outputsMetadata; - - /** The minimum parser version required to fully understand the metadata flatbuffer. */ - private final String /* @Nullable */ minVersion; - - /** - * Creates a {@link ModelMetadataInfo} with the metadata FlatBuffer, {@code buffer}. - * - * @param buffer the TFLite metadata FlatBuffer - * @throws NullPointerException if {@code buffer} is null - * @throws IllegalArgumentException if {@code buffer} does not contain any subgraph metadata, or - * it does not contain the expected identifier - */ - ModelMetadataInfo(ByteBuffer buffer) { - assertTFLiteMetadata(buffer); - - modelMetadata = ModelMetadata.getRootAsModelMetadata(buffer); - checkArgument( - modelMetadata.subgraphMetadataLength() > 0, - "The metadata flatbuffer does not contain any subgraph metadata."); - - inputsMetadata = getInputsMetadata(modelMetadata); - outputsMetadata = getOutputsMetadata(modelMetadata); - minVersion = modelMetadata.minParserVersion(); - } - - /** Gets the count of input tensors with metadata in the metadata FlatBuffer. 
*/ - int getInputTensorCount() { - return inputsMetadata.size(); - } - - /** - * Gets the metadata for the input tensor specified by {@code inputIndex}. - * - * @param inputIndex The index of the desired intput tensor. - * @throws IllegalArgumentException if the inputIndex specified is invalid. - */ - @Nullable - TensorMetadata getInputTensorMetadata(int inputIndex) { - checkArgument( - inputIndex >= 0 && inputIndex < inputsMetadata.size(), - "The inputIndex specified is invalid."); - return inputsMetadata.get(inputIndex); - } - - /** - * Gets the minimum parser version of the metadata. It can be {@code null} if the version is not - * populated. - */ - @Nullable - String getMininumParserVersion() { - return minVersion; - } - - /** Gets the root handler for the model metadata. */ - ModelMetadata getModelMetadata() { - return modelMetadata; - } - - /** Gets the count of output tensors with metadata in the metadata FlatBuffer. */ - int getOutputTensorCount() { - return outputsMetadata.size(); - } - - /** - * Gets the metadata for the output tensor specified by {@code outputIndex}. - * - * @param outputIndex The index of the desired output tensor. - * @throws IllegalArgumentException if the outputIndex specified is invalid. - */ - @Nullable - TensorMetadata getOutputTensorMetadata(int outputIndex) { - checkArgument( - outputIndex >= 0 && outputIndex < outputsMetadata.size(), - "The outputIndex specified is invalid."); - return outputsMetadata.get(outputIndex); - } - - /** - * Verifies if the buffer is a valid TFLite metadata flatbuffer. - * - * @param buffer the TFLite metadata flatbuffer - * @throws NullPointerException if {@code buffer} is null. - * @throws IllegalArgumentException if {@code buffer} does not contain the expected identifier - */ - private static void assertTFLiteMetadata(ByteBuffer buffer) { - checkNotNull(buffer, "Metadata flatbuffer cannot be null."); - checkArgument( - ModelMetadata.ModelMetadataBufferHasIdentifier(buffer), - "The identifier of the metadata is invalid. The buffer may not be a valid TFLite metadata" - + " flatbuffer."); - } - - /** Gets metadata for all input tensors. */ - private static List getInputsMetadata(ModelMetadata modelMetadata) { - SubGraphMetadata subgraphMetadata = modelMetadata.subgraphMetadata(0); - int tensorNum = subgraphMetadata.inputTensorMetadataLength(); - ArrayList inputsMetadata = new ArrayList<>(tensorNum); - for (int i = 0; i < tensorNum; i++) { - inputsMetadata.add(subgraphMetadata.inputTensorMetadata(i)); - } - return Collections.unmodifiableList(inputsMetadata); - } - - /** Gets metadata for all output tensors. 
*/ - private static List getOutputsMetadata(ModelMetadata modelMetadata) { - SubGraphMetadata subgraphMetadata = modelMetadata.subgraphMetadata(0); - int tensorNum = subgraphMetadata.outputTensorMetadataLength(); - ArrayList outputsMetadata = new ArrayList<>(tensorNum); - for (int i = 0; i < tensorNum; i++) { - outputsMetadata.add(subgraphMetadata.outputTensorMetadata(i)); - } - return Collections.unmodifiableList(outputsMetadata); - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/Preconditions.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/Preconditions.java deleted file mode 100644 index c2f20fbaacd..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/Preconditions.java +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import org.checkerframework.checker.nullness.qual.Nullable; - -/** Static error checking util methods. */ -final class Preconditions { - /** - * Ensures that an object reference passed as a parameter to the calling method is not null. - * - * @param reference an object reference - * @return the non-null reference that was validated - * @throws NullPointerException if {@code reference} is null - */ - public static T checkNotNull(T reference) { - if (reference == null) { - throw new NullPointerException("The object reference is null."); - } - return reference; - } - - /** - * Ensures that an object reference passed as a parameter to the calling method is not null. - * - * @param reference an object reference - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @return the non-null reference that was validated - * @throws NullPointerException if {@code reference} is null - */ - public static T checkNotNull(T reference, @Nullable Object errorMessage) { - if (reference == null) { - throw new NullPointerException(String.valueOf(errorMessage)); - } - return reference; - } - - /** - * Ensures that the given String is not empty and not null. - * - * @param string the String to test - * @return the non-null non-empty String that was validated - * @throws IllegalArgumentException if {@code string} is null or empty - */ - public static String checkNotEmpty(String string) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException("Given String is empty or null."); - } - return string; - } - - /** - * Ensures that the given String is not empty and not null. 
- * - * @param string the String to test - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @return the non-null non-empty String that was validated - * @throws IllegalArgumentException if {@code string} is null or empty - */ - public static String checkNotEmpty(String string, Object errorMessage) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException(String.valueOf(errorMessage)); - } - return string; - } - - /** - * Ensures the truth of an expression involving one or more parameters to the calling method. - * - * @param expression a boolean expression. - * @throws IllegalArgumentException if {@code expression} is false. - */ - public static void checkArgument(boolean expression) { - if (!expression) { - throw new IllegalArgumentException(); - } - } - - /** - * Ensures the truth of an expression involving one or more parameters to the calling method. - * - * @param expression a boolean expression. - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)}. - * @throws IllegalArgumentException if {@code expression} is false. - */ - public static void checkArgument(boolean expression, @Nullable Object errorMessage) { - if (!expression) { - throw new IllegalArgumentException(String.valueOf(errorMessage)); - } - } - - /** - * Ensures that {@code index} specifies a valid element in an array, list or string of size - * {@code size}. An element index may range from zero, inclusive, to {@code size}, exclusive. - * - * @param index a user-supplied index identifying an element of an array, list or string - * @param size the size of that array, list or string - * @return the value of {@code index} - * @throws IndexOutOfBoundsException if {@code index} is negative or is not less than {@code size} - * @throws IllegalArgumentException if {@code size} is negative - */ - public static int checkElementIndex(int index, int size) { - return checkElementIndex(index, size, "index"); - } - - /** - * Ensures that {@code index} specifies a valid element in an array, list or string of size - * {@code size}. An element index may range from zero, inclusive, to {@code size}, exclusive. - * - * @param index a user-supplied index identifying an element of an array, list or string - * @param size the size of that array, list or string - * @param desc the text to use to describe this index in an error message - * @return the value of {@code index} - * @throws IndexOutOfBoundsException if {@code index} is negative or is not less than {@code size} - * @throws IllegalArgumentException if {@code size} is negative - */ - public static int checkElementIndex(int index, int size, @Nullable String desc) { - // Carefully optimized for execution by hotspot (explanatory comment above) - if (index < 0 || index >= size) { - throw new IndexOutOfBoundsException(badElementIndex(index, size, desc)); - } - return index; - } - - /** - * Ensures the truth of an expression involving the state of the calling instance, but not - * involving any parameters to the calling method. 
- * - * @param expression a boolean expression - * @throws IllegalStateException if {@code expression} is false - * @see Verify#verify Verify.verify() - */ - public static void checkState(boolean expression) { - if (!expression) { - throw new IllegalStateException(); - } - } - - /** - * Ensures the truth of an expression involving the state of the calling instance, but not - * involving any parameters to the calling method. - * - * @param expression a boolean expression - * @param errorMessage the exception message to use if the check fails; will be converted to a - * string using {@link String#valueOf(Object)} - * @throws IllegalStateException if {@code expression} is false - * @see Verify#verify Verify.verify() - */ - public static void checkState(boolean expression, @Nullable Object errorMessage) { - if (!expression) { - throw new IllegalStateException(String.valueOf(errorMessage)); - } - } - - private static String badElementIndex(int index, int size, @Nullable String desc) { - if (index < 0) { - return String.format("%s (%s) must not be negative", desc, index); - } else if (size < 0) { - throw new IllegalArgumentException("negative size: " + size); - } else { // index >= size - return String.format("%s (%s) must be less than size (%s)", desc, index, size); - } - } - - private Preconditions() { - throw new AssertionError("Preconditions is Uninstantiable."); - } -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/SeekableByteChannelCompat.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/SeekableByteChannelCompat.java deleted file mode 100644 index c655786755b..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/SeekableByteChannelCompat.java +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.Channel; - -/** - * A byte channel that maintains a current position and allows the position to be changed. - * {@link SeekableByteChannelCompat} is compatible with {@link - * java.nio.channels.SeekableByteChannel}. - * - *

{@link java.nio.channels.SeekableByteChannel} is not available in Android API 23 and under. - * Therefore, {@link SeekableByteChannelCompat} is introduced here to make the interfaces used in - * the MetadtaExtractor library consistent with the common used Java libraries. - */ -interface SeekableByteChannelCompat extends Channel { - /** - * Reads a sequence of bytes from this channel into the given buffer. - * - * @param dst The buffer into which bytes are to be transferred - * @return The number of bytes read, possibly zero, or -1 if the channel has reached - * end-of-stream - * @throws NonReadableChannelException If this channel was not opened for reading - * @throws ClosedChannelException If this channel is closed - * @throws AsynchronousCloseException If another thread closes this channel while the read - * operation is in progress - * @throws ClosedByInterruptException If another thread interrupts the current thread while the - * read operation is in progress, thereby closing the channel and setting the current thread's - * interrupt status - * @throws IOException If some other I/O error occurs - */ - int read(ByteBuffer dst) throws IOException; - - /** - * Writes a sequence of bytes to this channel from the given buffer. - * - * @param src The buffer from which bytes are to be retrieved - * @return The number of bytes written, possibly zero - * @throws NonWritableChannelException If this channel was not opened for writing - * @throws ClosedChannelException If this channel is closed - * @throws AsynchronousCloseException If another thread closes this channel while the write - * operation is in progress - * @throws ClosedByInterruptException If another thread interrupts the current thread while the - * write operation is in progress, thereby closing the channel and setting the current - * thread's interrupt status - * @throws IOException If some other I/O error occurs - */ - int write(ByteBuffer src) throws IOException; - - /** - * Returns this channel's position. - * - * @return This channel's position, a non-negative integer counting the number of bytes from the - * beginning of the entity to the current position - * @throws ClosedChannelException If this channel is closed - * @throws IOException If some other I/O error occurs - */ - long position() throws IOException; - - /** - * Sets this channel's position. - * - * @param newPosition The new position, a non-negative integer counting the number of bytes from - * the beginning of the entity - * @return This channel - * @throws ClosedChannelException If this channel is closed - * @throws IllegalArgumentException If the new position is negative - * @throws IOException If some other I/O error occurs - */ - SeekableByteChannelCompat position(long newPosition) throws IOException; - - /** - * Returns the current size of entity to which this channel is connected. - * - * @return The current size, measured in bytes - * @throws ClosedChannelException If this channel is closed - * @throws IOException If some other I/O error occurs - */ - long size() throws IOException; - - /** - * Truncates the entity, to which this channel is connected, to the given size. 
- * - * @param size The new size, a non-negative byte count - * @return This channel - * @throws NonWritableChannelException If this channel was not opened for writing - * @throws ClosedChannelException If this channel is closed - * @throws IllegalArgumentException If the new size is negative - * @throws IOException If some other I/O error occurs - */ - SeekableByteChannelCompat truncate(long size) throws IOException; -} diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ZipFile.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ZipFile.java deleted file mode 100644 index f055d7dcd7e..00000000000 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ZipFile.java +++ /dev/null @@ -1,427 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -package org.tensorflow.lite.support.metadata; - -import static org.tensorflow.lite.support.metadata.Preconditions.checkArgument; -import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; - -import java.io.Closeable; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.zip.ZipException; - -/** - * Reads uncompressed files from the TFLite model, a zip file. - * - *

TODO(b/150237111): add a link to the webpage of MetadataPopulator once it's available. - * - *

A TFLite model file becomes a zip file when it contains associated files. The associated files - * can be packed to a TFLite model file using the MetadataPopulator. The associated files are not - * compressed when being added to the model file. - * - *
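A sketch of reading one such associated file through the public MetadataExtractor API shown earlier; the file name "labels.txt" is only an assumed example of what a model author might pack:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;
    import org.tensorflow.lite.support.metadata.MetadataExtractor;

    class AssociatedFileSketch {
      static List<String> readLabels(MetadataExtractor extractor) throws IOException {
        List<String> labels = new ArrayList<>();
        // getAssociatedFile() throws IllegalStateException when the model packs no files at all.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            extractor.getAssociatedFile("labels.txt"), StandardCharsets.UTF_8))) {
          String line;
          while ((line = reader.readLine()) != null) {
            labels.add(line);
          }
        }
        return labels;
      }
    }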

{@link ZipFile} does not support Zip64 format, because TFLite models are much smaller than the - * size limit for Zip64, which is 4GB. - */ -final class ZipFile implements Closeable { - /** Maps String to list of ZipEntrys, name -> actual entries. */ - private final Map> nameMap; - - /** The actual data source. */ - private final ByteBufferChannel archive; - - /** - * Opens the given {@link ByteBufferChannel} for reading, assuming "UTF8" for file names. {@link - * ZipFile} does not synchronized over the buffer that is passed into it. - * - * @param channel the archive - * @throws IOException if an error occurs while creating this {@link ZipFile} - * @throws ZipException if the channel is not a zip archive - * @throws NullPointerException if the archive is null - */ - public static ZipFile createFrom(ByteBufferChannel channel) throws IOException { - checkNotNull(channel); - ZipParser zipParser = new ZipParser(channel); - Map> nameMap = zipParser.parseEntries(); - return new ZipFile(channel, nameMap); - } - - @Override - public void close() { - archive.close(); - } - - /** - * Exposes the raw stream of the archive entry. - * - *

Since the associated files will not be compressed when being packed to the zip file, the raw - * stream represents the non-compressed files. - * - *

WARNING: The returned {@link InputStream}, is not thread-safe. If multiple - * threads concurrently reading from the returned {@link InputStream}, it must be synchronized - * externally. - * - * @param name name of the entry to get the stream for - * @return the raw input stream containing data - * @throws IllegalArgumentException if the specified file does not exist in the zip file - */ - public InputStream getRawInputStream(String name) { - checkArgument( - nameMap.containsKey(name), - String.format("The file, %s, does not exist in the zip file.", name)); - - List entriesWithTheSameName = nameMap.get(name); - ZipEntry entry = entriesWithTheSameName.get(0); - long start = entry.getDataOffset(); - long remaining = entry.getSize(); - return new BoundedInputStream(archive, start, remaining); - } - - private ZipFile(ByteBufferChannel channel, Map> nameMap) { - archive = channel; - this.nameMap = nameMap; - } - - /* Parses a Zip archive and gets the information for each {@link ZipEntry}. */ - private static class ZipParser { - private final ByteBufferChannel archive; - - // Cached buffers that will only be used locally in the class to reduce garbage collection. - private final ByteBuffer longBuffer = - ByteBuffer.allocate(ZipConstants.LONG_BYTE_SIZE).order(ByteOrder.LITTLE_ENDIAN); - private final ByteBuffer intBuffer = - ByteBuffer.allocate(ZipConstants.INT_BYTE_SIZE).order(ByteOrder.LITTLE_ENDIAN); - private final ByteBuffer shortBuffer = - ByteBuffer.allocate(ZipConstants.SHORT_BYTE_SIZE).order(ByteOrder.LITTLE_ENDIAN); - - private ZipParser(ByteBufferChannel archive) { - this.archive = archive; - } - - /** - * Parses the underlying {@code archive} and returns the information as a list of {@link - * ZipEntry}. - */ - private Map> parseEntries() throws IOException { - List entries = parseCentralDirectory(); - return parseLocalFileHeaderData(entries); - } - - /** - * Checks if the current position contains a central file header signature, {@link - * ZipConstants#CENSIG}. - */ - private boolean foundCentralFileheaderSignature() { - long signature = (long) getInt(); - return signature == ZipConstants.CENSIG; - } - - /** - * Gets the value as a Java int from two bytes starting at the current position of the archive. - */ - private int getShort() { - shortBuffer.rewind(); - archive.read(shortBuffer); - shortBuffer.flip(); - return (int) shortBuffer.getShort(); - } - - /** - * Gets the value as a Java long from four bytes starting at the current position of the - * archive. - */ - private int getInt() { - intBuffer.rewind(); - archive.read(intBuffer); - intBuffer.flip(); - return intBuffer.getInt(); - } - - /** - * Gets the value as a Java long from four bytes starting at the current position of the - * archive. - */ - private long getLong() { - longBuffer.rewind(); - archive.read(longBuffer); - longBuffer.flip(); - return longBuffer.getLong(); - } - - /** - * Positions the archive at the start of the central directory. - * - *

First, it searches for the signature of the "end of central directory record", {@link - * ZipConstants#ENDSIG}, and positions the stream at the start of the "end of central directory - * record". The zip files are created without archive comments, thus {@link ZipConstants#ENDSIG} - * should appear exactly {@link ZipConstants#ENDHDR} bytes from the end of the zip file. - * - *
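As a rough companion to the lookup just described, here is a minimal Python sketch (not part of the Java implementation in this patch) that locates the central directory of a comment-free zip archive. The numeric constants mirror the ENDHDR, ENDSIG and ENDOFF values defined in ZipConstants further below; the function name is illustrative.

```python
import struct

END_HDR = 22          # fixed size of the "end of central directory" record (no comment)
END_SIG = 0x06054b50  # EOCD signature, "PK\005\006"
END_OFF = 16          # offset of the "start of central directory" field in the record

def central_directory_offset(path):
    """Returns the byte offset of the central directory of a comment-free zip."""
    with open(path, "rb") as f:
        f.seek(0, 2)                 # seek to the end to learn the archive size
        size = f.tell()
        if size < END_HDR:
            raise ValueError("The archive is not a ZIP archive.")
        f.seek(size - END_HDR)       # EOCD starts exactly END_HDR bytes from the end
        record = f.read(END_HDR)
    (signature,) = struct.unpack_from("<I", record, 0)
    if signature != END_SIG:
        raise ValueError("The archive is not a ZIP archive.")
    (offset,) = struct.unpack_from("<I", record, END_OFF)
    return offset
```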

Then, parse the "end of central dir record" and position the archive at the start of the - * central directory. - */ - private void locateCentralDirectory() throws IOException { - if (archive.size() < ZipConstants.ENDHDR) { - throw new ZipException("The archive is not a ZIP archive."); - } - - // Positions the archive at the start of the "end of central directory record". - long offsetRecord = archive.size() - ZipConstants.ENDHDR; - archive.position(offsetRecord); - - // Checks for the signature, {@link ZipConstants#ENDSIG}. - long endSig = getLong(); - if (endSig != ZipConstants.ENDSIG) { - throw new ZipException("The archive is not a ZIP archive."); - } - - // Positions the archive at the “offset of central directory”. - skipBytes(ZipConstants.ENDOFF - ZipConstants.ENDSUB); - // Gets the offset to central directory - long offsetDirectory = getInt(); - // Goes to the central directory. - archive.position(offsetDirectory); - } - - /** - * Reads the central directory of the given archive and populates the internal tables with - * {@link ZipEntry} instances. - */ - private List parseCentralDirectory() throws IOException { - /** List of entries in the order they appear inside the central directory. */ - List entries = new ArrayList<>(); - locateCentralDirectory(); - - while (foundCentralFileheaderSignature()) { - ZipEntry entry = parseCentralDirectoryEntry(); - entries.add(entry); - } - - return entries; - } - - /** - * Reads an individual entry of the central directory, creats an ZipEntry from it and adds it to - * the global maps. - */ - private ZipEntry parseCentralDirectoryEntry() throws IOException { - // Positions the archive at the "compressed size" and read the value. - skipBytes(ZipConstants.CENSIZ - ZipConstants.CENVEM); - long compressSize = getInt(); - - // Positions the archive at the "filename length" and read the value. - skipBytes(ZipConstants.CENNAM - ZipConstants.CENLEN); - int fileNameLen = getShort(); - - // Reads the extra field length and the comment length. - int extraLen = getShort(); - int commentLen = getShort(); - - // Positions the archive at the "local file header offset" and read the value. - skipBytes(ZipConstants.CENOFF - ZipConstants.CENDSK); - long localHeaderOffset = getInt(); - - // Reads the file name. - byte[] fileNameBuf = new byte[fileNameLen]; - archive.read(ByteBuffer.wrap(fileNameBuf)); - String fileName = new String(fileNameBuf, Charset.forName("UTF-8")); - - // Skips the extra field and the comment. - skipBytes(extraLen + commentLen); - - ZipEntry entry = new ZipEntry(); - entry.setSize(compressSize); - entry.setLocalHeaderOffset(localHeaderOffset); - entry.setName(fileName); - - return entry; - } - - /** Walks through all recorded entries and records the offsets for the entry data. */ - private Map> parseLocalFileHeaderData(List entries) { - /** Maps String to list of ZipEntrys, name -> actual entries. */ - Map> nameMap = new LinkedHashMap<>(); - - for (ZipEntry entry : entries) { - long offset = entry.getLocalHeaderOffset(); - archive.position(offset + ZipConstants.LOCNAM); - - // Gets the data offset of this entry. - int fileNameLen = getShort(); - int extraFieldLen = getShort(); - long dataOffset = - offset - + ZipConstants.LOCEXT - + ZipConstants.SHORT_BYTE_SIZE - + fileNameLen - + extraFieldLen; - entry.setDataOffset(dataOffset); - - // Puts the entry into the nameMap. 
- String name = entry.getName(); - List entriesWithTheSameName; - if (nameMap.containsKey(name)) { - entriesWithTheSameName = nameMap.get(name); - } else { - entriesWithTheSameName = new ArrayList<>(); - nameMap.put(name, entriesWithTheSameName); - } - entriesWithTheSameName.add(entry); - } - - return nameMap; - } - - /** Skips the given number of bytes or throws an EOFException if skipping failed. */ - private void skipBytes(int count) throws IOException { - long currentPosition = archive.position(); - long newPosition = currentPosition + count; - if (newPosition > archive.size()) { - throw new EOFException(); - } - archive.position(newPosition); - } - } - - /** Stores the data offset and the size of an entry in the archive. */ - private static class ZipEntry { - - private String name; - private long dataOffset = -1; - private long size = -1; - private long localHeaderOffset = -1; - - public long getSize() { - return size; - } - - public long getDataOffset() { - return dataOffset; - } - - public String getName() { - return name; - } - - public long getLocalHeaderOffset() { - return localHeaderOffset; - } - - public void setSize(long size) { - this.size = size; - } - - public void setDataOffset(long dataOffset) { - this.dataOffset = dataOffset; - } - - public void setName(String name) { - this.name = name; - } - - public void setLocalHeaderOffset(long localHeaderOffset) { - this.localHeaderOffset = localHeaderOffset; - } - } - - /** - * Various constants for this {@link ZipFile}. - * - *

Referenced from {@link java.util.zip.ZipConstants}. - */ - private static class ZipConstants { - /** length of Java short in bytes. */ - static final int SHORT_BYTE_SIZE = Short.SIZE / 8; - - /** length of Java int in bytes. */ - static final int INT_BYTE_SIZE = Integer.SIZE / 8; - - /** length of Java long in bytes. */ - static final int LONG_BYTE_SIZE = Long.SIZE / 8; - - /* - * Header signatures - */ - static final long LOCSIG = 0x04034b50L; // "PK\003\004" - static final long EXTSIG = 0x08074b50L; // "PK\007\008" - static final long CENSIG = 0x02014b50L; // "PK\001\002" - static final long ENDSIG = 0x06054b50L; // "PK\005\006" - - /* - * Header sizes in bytes (including signatures) - */ - static final int LOCHDR = 30; // LOC header size - static final int EXTHDR = 16; // EXT header size - static final int CENHDR = 46; // CEN header size - static final int ENDHDR = 22; // END header size - - /* - * Local file (LOC) header field offsets - */ - static final int LOCVER = 4; // version needed to extract - static final int LOCFLG = 6; // general purpose bit flag - static final int LOCHOW = 8; // compression method - static final int LOCTIM = 10; // modification time - static final int LOCCRC = 14; // uncompressed file crc-32 value - static final int LOCSIZ = 18; // compressed size - static final int LOCLEN = 22; // uncompressed size - static final int LOCNAM = 26; // filename length - static final int LOCEXT = 28; // extra field length - - /* - * Extra local (EXT) header field offsets - */ - static final int EXTCRC = 4; // uncompressed file crc-32 value - static final int EXTSIZ = 8; // compressed size - static final int EXTLEN = 12; // uncompressed size - - /* - * Central directory (CEN) header field offsets - */ - static final int CENVEM = 4; // version made by - static final int CENVER = 6; // version needed to extract - static final int CENFLG = 8; // encrypt, decrypt flags - static final int CENHOW = 10; // compression method - static final int CENTIM = 12; // modification time - static final int CENCRC = 16; // uncompressed file crc-32 value - static final int CENSIZ = 20; // compressed size - static final int CENLEN = 24; // uncompressed size - static final int CENNAM = 28; // filename length - static final int CENEXT = 30; // extra field length - static final int CENCOM = 32; // comment length - static final int CENDSK = 34; // disk number start - static final int CENATT = 36; // internal file attributes - static final int CENATX = 38; // external file attributes - static final int CENOFF = 42; // LOC header offset - - /* - * End of central directory (END) header field offsets - */ - static final int ENDSUB = 8; // number of entries on this disk - static final int ENDTOT = 10; // total number of entries - static final int ENDSIZ = 12; // central directory size in bytes - static final int ENDOFF = 16; // offset of first CEN header - static final int ENDCOM = 20; // zip file comment length - - private ZipConstants() {} - } -} diff --git a/tensorflow/lite/experimental/support/metadata/metadata.py b/tensorflow/lite/experimental/support/metadata/metadata.py deleted file mode 100644 index b3d8d28806b..00000000000 --- a/tensorflow/lite/experimental/support/metadata/metadata.py +++ /dev/null @@ -1,615 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TensorFlow Lite metadata tools.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import os -import shutil -import tempfile -import warnings -import zipfile - -from flatbuffers.python import flatbuffers -from tensorflow.lite.experimental.support.metadata import metadata_schema_py_generated as _metadata_fb -from tensorflow.lite.experimental.support.metadata import schema_py_generated as _schema_fb -from tensorflow.lite.experimental.support.metadata.cc.python import _pywrap_metadata_version -from tensorflow.lite.experimental.support.metadata.flatbuffers_lib import _pywrap_flatbuffers -from tensorflow.python.platform import resource_loader - -_FLATC_TFLITE_METADATA_SCHEMA_FILE = resource_loader.get_path_to_datafile( - "metadata_schema.fbs") - - -# TODO(b/141467403): add delete method for associated files. -class MetadataPopulator(object): - """Packs metadata and associated files into TensorFlow Lite model file. - - MetadataPopulator can be used to populate metadata and model associated files - into a model file or a model buffer (in bytearray). It can also help to - inspect list of files that have been packed into the model or are supposed to - be packed into the model. - - The metadata file (or buffer) should be generated based on the metadata - schema: - third_party/tensorflow/lite/schema/metadata_schema.fbs - - Example usage: - Populate matadata and label file into an image classifier model. - - First, based on metadata_schema.fbs, generate the metadata for this image - classifer model using Flatbuffers API. Attach the label file onto the ouput - tensor (the tensor of probabilities) in the metadata. - - Then, pack the metadata and label file into the model as follows. - - ```python - # Populating a metadata file (or a metadta buffer) and associated files to - a model file: - populator = MetadataPopulator.with_model_file(model_file) - # For metadata buffer (bytearray read from the metadata file), use: - # populator.load_metadata_buffer(metadata_buf) - populator.load_metadata_file(metadata_file) - populator.load_associated_files([label.txt]) - populator.populate() - - # Populating a metadata file (or a metadta buffer) and associated files to - a model buffer: - populator = MetadataPopulator.with_model_buffer(model_buf) - populator.load_metadata_file(metadata_file) - populator.load_associated_files([label.txt]) - populator.populate() - # Writing the updated model buffer into a file. - updated_model_buf = populator.get_model_buffer() - with open("updated_model.tflite", "wb") as f: - f.write(updated_model_buf) - ``` - - Note that existing metadata buffer (if applied) will be overridden by the new - metadata buffer. - """ - # As Zip API is used to concatenate associated files after tflite model file, - # the populating operation is developed based on a model file. For in-memory - # model buffer, we create a tempfile to serve the populating operation. 
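Because the associated files are appended to the model as a plain (uncompressed) zip archive, they can also be listed with Python's standard zipfile module, which is essentially what get_packed_associated_file_list() below does. A small sketch, using a placeholder model path:

```python
import zipfile

# "updated_model.tflite" is a placeholder for a model that already has
# associated files packed into it by MetadataPopulator.
model_path = "updated_model.tflite"

if zipfile.is_zipfile(model_path):
    with zipfile.ZipFile(model_path, "r") as zf:
        print(zf.namelist())   # e.g. ['label.txt']
else:
    print("No associated files have been packed into this model.")
```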
- # Creating the deleting such a tempfile is handled by the class, - # _MetadataPopulatorWithBuffer. - - METADATA_FIELD_NAME = "TFLITE_METADATA" - TFLITE_FILE_IDENTIFIER = b"TFL3" - METADATA_FILE_IDENTIFIER = b"M001" - - def __init__(self, model_file): - """Constructor for MetadataPopulator. - - Args: - model_file: valid path to a TensorFlow Lite model file. - - Raises: - IOError: File not found. - ValueError: the model does not have the expected flatbuffer identifer. - """ - _assert_model_file_identifier(model_file) - self._model_file = model_file - self._metadata_buf = None - self._associated_files = set() - - @classmethod - def with_model_file(cls, model_file): - """Creates a MetadataPopulator object that populates data to a model file. - - Args: - model_file: valid path to a TensorFlow Lite model file. - - Returns: - MetadataPopulator object. - - Raises: - IOError: File not found. - ValueError: the model does not have the expected flatbuffer identifer. - """ - return cls(model_file) - - # TODO(b/141468993): investigate if type check can be applied to model_buf for - # FB. - @classmethod - def with_model_buffer(cls, model_buf): - """Creates a MetadataPopulator object that populates data to a model buffer. - - Args: - model_buf: TensorFlow Lite model buffer in bytearray. - - Returns: - A MetadataPopulator(_MetadataPopulatorWithBuffer) object. - - Raises: - ValueError: the model does not have the expected flatbuffer identifer. - """ - return _MetadataPopulatorWithBuffer(model_buf) - - def get_model_buffer(self): - """Gets the buffer of the model with packed metadata and associated files. - - Returns: - Model buffer (in bytearray). - """ - with open(self._model_file, "rb") as f: - return f.read() - - def get_packed_associated_file_list(self): - """Gets a list of associated files packed to the model file. - - Returns: - List of packed associated files. - """ - if not zipfile.is_zipfile(self._model_file): - return [] - - with zipfile.ZipFile(self._model_file, "r") as zf: - return zf.namelist() - - def get_recorded_associated_file_list(self): - """Gets a list of associated files recorded in metadata of the model file. - - Associated files may be attached to a model, a subgraph, or an input/output - tensor. - - Returns: - List of recorded associated files. - """ - recorded_files = [] - - if not self._metadata_buf: - return recorded_files - - metadata = _metadata_fb.ModelMetadata.GetRootAsModelMetadata( - self._metadata_buf, 0) - - # Add associated files attached to ModelMetadata - self._get_associated_files_from_metadata_struct(metadata, recorded_files) - - # Add associated files attached to each SubgraphMetadata - for j in range(metadata.SubgraphMetadataLength()): - subgraph = metadata.SubgraphMetadata(j) - self._get_associated_files_from_metadata_struct(subgraph, recorded_files) - - # Add associated files attached to each input tensor - for k in range(subgraph.InputTensorMetadataLength()): - tensor = subgraph.InputTensorMetadata(k) - self._get_associated_files_from_metadata_struct(tensor, recorded_files) - - # Add associated files attached to each output tensor - for k in range(subgraph.OutputTensorMetadataLength()): - tensor = subgraph.OutputTensorMetadata(k) - self._get_associated_files_from_metadata_struct(tensor, recorded_files) - - return recorded_files - - def load_associated_files(self, associated_files): - """Loads associated files that to be concatenated after the model file. - - Args: - associated_files: list of file paths. - - Raises: - IOError: - File not found. 
- """ - for af in associated_files: - _assert_exist(af) - self._associated_files.add(af) - - def load_metadata_buffer(self, metadata_buf): - """Loads the metadata buffer (in bytearray) to be populated. - - Args: - metadata_buf: metadata buffer (in bytearray) to be populated. - - Raises: - ValueError: The metadata to be populated is empty. - ValueError: The metadata does not have the expected flatbuffer identifer. - ValueError: Error occurs when getting the minimum metadata parser version. - """ - if not metadata_buf: - raise ValueError("The metadata to be populated is empty.") - - _assert_metadata_buffer_identifier(metadata_buf) - - # Gets the minimum metadata parser version of the metadata_buf. - min_version = _pywrap_metadata_version.GetMinimumMetadataParserVersion( - bytes(metadata_buf)) - - # Inserts in the minimum metadata parser version into the metadata_buf. - metadata = _metadata_fb.ModelMetadataT.InitFromObj( - _metadata_fb.ModelMetadata.GetRootAsModelMetadata(metadata_buf, 0)) - metadata.minParserVersion = min_version - - b = flatbuffers.Builder(0) - b.Finish(metadata.Pack(b), self.METADATA_FILE_IDENTIFIER) - metadata_buf_with_version = b.Output() - - self._metadata_buf = metadata_buf_with_version - - def load_metadata_file(self, metadata_file): - """Loads the metadata file to be populated. - - Args: - metadata_file: path to the metadata file to be populated. - - Raises: - IOError: File not found. - ValueError: The metadata does not have the expected flatbuffer identifer. - """ - _assert_exist(metadata_file) - with open(metadata_file, "rb") as f: - metadata_buf = f.read() - self.load_metadata_buffer(bytearray(metadata_buf)) - - def populate(self): - """Populates loaded metadata and associated files into the model file.""" - self._assert_validate() - self._populate_metadata_buffer() - self._populate_associated_files() - - def _assert_validate(self): - """Validates the metadata and associated files to be populated. - - Raises: - ValueError: - File is recorded in the metadata, but is not going to be populated. - File has already been packed. - """ - # Gets files that are recorded in metadata. - recorded_files = self.get_recorded_associated_file_list() - - # Gets files that have been packed to self._model_file. - packed_files = self.get_packed_associated_file_list() - - # Gets the file name of those associated files to be populated. - to_be_populated_files = [] - for af in self._associated_files: - to_be_populated_files.append(os.path.basename(af)) - - # Checks all files recorded in the metadata will be populated. - for rf in recorded_files: - if rf not in to_be_populated_files and rf not in packed_files: - raise ValueError("File, '{0}', is recorded in the metadata, but has " - "not been loaded into the populator.".format(rf)) - - for f in to_be_populated_files: - if f in packed_files: - raise ValueError("File, '{0}', has already been packed.".format(f)) - - if f not in recorded_files: - warnings.warn( - "File, '{0}', does not exsit in the metadata. 
But packing it to " - "tflite model is still allowed.".format(f)) - - def _copy_archived_files(self, src_zip, dst_zip, file_list): - """Copy archieved files in file_list from src_zip ro dst_zip.""" - - if not zipfile.is_zipfile(src_zip): - raise ValueError("File, '{0}', is not a zipfile.".format(src_zip)) - - with zipfile.ZipFile(src_zip, - "r") as src_zf, zipfile.ZipFile(dst_zip, - "a") as dst_zf: - src_list = src_zf.namelist() - for f in file_list: - if f not in src_list: - raise ValueError( - "File, '{0}', does not exist in the zipfile, {1}.".format( - f, src_zip)) - file_buffer = src_zf.read(f) - dst_zf.writestr(f, file_buffer) - - def _get_associated_files_from_metadata_struct(self, file_holder, file_list): - for j in range(file_holder.AssociatedFilesLength()): - file_list.append(file_holder.AssociatedFiles(j).Name().decode("utf-8")) - - def _populate_associated_files(self): - """Concatenates associated files after TensorFlow Lite model file. - - If the MetadataPopulator object is created using the method, - with_model_file(model_file), the model file will be updated. - """ - # Opens up the model file in "appending" mode. - # If self._model_file already has pack files, zipfile will concatenate - # addition files after self._model_file. For example, suppose we have - # self._model_file = old_tflite_file | label1.txt | label2.txt - # Then after trigger populate() to add label3.txt, self._model_file becomes - # self._model_file = old_tflite_file | label1.txt | label2.txt | label3.txt - with zipfile.ZipFile(self._model_file, "a") as zf: - for af in self._associated_files: - filename = os.path.basename(af) - zf.write(af, filename) - - def _populate_metadata_buffer(self): - """Populates the metadata buffer (in bytearray) into the model file. - - Inserts metadata_buf into the metadata field of schema.Model. If the - MetadataPopulator object is created using the method, - with_model_file(model_file), the model file will be updated. - - Existing metadata buffer (if applied) will be overridden by the new metadata - buffer. - """ - - with open(self._model_file, "rb") as f: - model_buf = f.read() - - model = _schema_fb.ModelT.InitFromObj( - _schema_fb.Model.GetRootAsModel(model_buf, 0)) - buffer_field = _schema_fb.BufferT() - buffer_field.data = self._metadata_buf - - is_populated = False - if not model.metadata: - model.metadata = [] - else: - # Check if metadata has already been populated. - for meta in model.metadata: - if meta.name.decode("utf-8") == self.METADATA_FIELD_NAME: - is_populated = True - model.buffers[meta.buffer] = buffer_field - - if not is_populated: - if not model.buffers: - model.buffers = [] - model.buffers.append(buffer_field) - # Creates a new metadata field. - metadata_field = _schema_fb.MetadataT() - metadata_field.name = self.METADATA_FIELD_NAME - metadata_field.buffer = len(model.buffers) - 1 - model.metadata.append(metadata_field) - - # Packs model back to a flatbuffer binaray file. - b = flatbuffers.Builder(0) - b.Finish(model.Pack(b), self.TFLITE_FILE_IDENTIFIER) - model_buf = b.Output() - - # Saves the updated model buffer to model file. - # Gets files that have been packed to self._model_file. - packed_files = self.get_packed_associated_file_list() - if packed_files: - # Writes the updated model buffer and associated files into a new model - # file. Then overwrites the original model file. 
- with tempfile.NamedTemporaryFile() as temp: - new_file = temp.name - with open(new_file, "wb") as f: - f.write(model_buf) - self._copy_archived_files(self._model_file, new_file, packed_files) - shutil.copy(new_file, self._model_file) - os.remove(new_file) - else: - with open(self._model_file, "wb") as f: - f.write(model_buf) - - -class _MetadataPopulatorWithBuffer(MetadataPopulator): - """Subclass of MetadtaPopulator that populates metadata to a model buffer. - - This class is used to populate metadata into a in-memory model buffer. As we - use Zip API to concatenate associated files after tflite model file, the - populating operation is developed based on a model file. For in-memory model - buffer, we create a tempfile to serve the populating operation. This class is - then used to generate this tempfile, and delete the file when the - MetadataPopulator object is deleted. - """ - - def __init__(self, model_buf): - """Constructor for _MetadataPopulatorWithBuffer. - - Args: - model_buf: TensorFlow Lite model buffer in bytearray. - - Raises: - ValueError: model_buf is empty. - ValueError: model_buf does not have the expected flatbuffer identifer. - """ - if not model_buf: - raise ValueError("model_buf cannot be empty.") - - with tempfile.NamedTemporaryFile() as temp: - model_file = temp.name - - with open(model_file, "wb") as f: - f.write(model_buf) - - MetadataPopulator.__init__(self, model_file) - - def __del__(self): - """Destructor of _MetadataPopulatorWithBuffer. - - Deletes the tempfile. - """ - if os.path.exists(self._model_file): - os.remove(self._model_file) - - -class MetadataDisplayer(object): - """Displays metadata and associated file info in human-readable format.""" - - def __init__(self, model_file, metadata_file, associated_file_list): - """Constructor for MetadataDisplayer. - - Args: - model_file: valid path to the model file. - metadata_file: valid path to the metadata file. - associated_file_list: list of associate files in the model file. - """ - _assert_model_file_identifier(model_file) - _assert_metadata_file_identifier(metadata_file) - self._model_file = model_file - self._metadata_file = metadata_file - self._associated_file_list = associated_file_list - - @classmethod - def with_model_file(cls, model_file): - """Creates a MetadataDisplayer object for the model file. - - Args: - model_file: valid path to a TensorFlow Lite model file. - - Returns: - MetadataDisplayer object. - - Raises: - IOError: File not found. - ValueError: The model does not have metadata. - """ - _assert_exist(model_file) - metadata_file = cls._save_temporary_metadata_file(model_file) - associated_file_list = cls._parse_packed_associted_file_list(model_file) - return cls(model_file, metadata_file, associated_file_list) - - @classmethod - def with_model_buffer(cls, model_buffer): - """Creates a MetadataDisplayer object for a file buffer. - - Args: - model_buffer: TensorFlow Lite model buffer in bytearray. - - Returns: - MetadataDisplayer object. 
- """ - if not model_buffer: - raise ValueError("model_buffer cannot be empty.") - - with tempfile.NamedTemporaryFile() as temp: - model_file = temp.name - - with open(model_file, "wb") as f: - f.write(model_buffer) - return cls.with_model_file(model_file) - - def get_metadata_json(self): - """Converts the metadata into a json string.""" - opt = _pywrap_flatbuffers.IDLOptions() - opt.strict_json = True - parser = _pywrap_flatbuffers.Parser(opt) - with open(_FLATC_TFLITE_METADATA_SCHEMA_FILE) as f: - metadata_schema_content = f.read() - with open(self._metadata_file, "rb") as f: - metadata_file_content = f.read() - if not parser.parse(metadata_schema_content): - raise ValueError("Cannot parse metadata schema. Reason: " + parser.error) - with open(self._metadata_file, "rb") as f: - metadata_file_content = f.read() - return _pywrap_flatbuffers.generate_text(parser, metadata_file_content) - - def get_packed_associated_file_list(self): - """Returns a list of associated files that are packed in the model. - - Returns: - A name list of associated files. - """ - return copy.deepcopy(self._associated_file_list) - - @staticmethod - def _save_temporary_metadata_file(model_file): - """Saves the metadata in the model file to a temporary file. - - Args: - model_file: valid path to the model file. - - Returns: - Path to the metadata temporary file. - - Raises: - ValueError: The model does not have metadata. - """ - with open(model_file, "rb") as f: - model_buf = f.read() - - tflite_model = _schema_fb.Model.GetRootAsModel(model_buf, 0) - - # Gets metadata from the model file. - for i in range(tflite_model.MetadataLength()): - meta = tflite_model.Metadata(i) - if meta.Name().decode("utf-8") == MetadataPopulator.METADATA_FIELD_NAME: - buffer_index = meta.Buffer() - metadata = tflite_model.Buffers(buffer_index) - metadata_buf = metadata.DataAsNumpy().tobytes() - # Creates a temporary file to store the metadata. - with tempfile.NamedTemporaryFile() as temp: - metadata_file = temp.name - # Saves the metadata into the temporary file. - with open(metadata_file, "wb") as f: - f.write(metadata_buf) - return metadata_file - - raise ValueError("The model does not have metadata.") - - @staticmethod - def _parse_packed_associted_file_list(model_file): - """Gets a list of associated files packed to the model file. - - Args: - model_file: valid path to the model file. - - Returns: - List of packed associated files. - """ - if not zipfile.is_zipfile(model_file): - return [] - - with zipfile.ZipFile(model_file, "r") as zf: - return zf.namelist() - - def __del__(self): - """Destructor of MetadataDisplayer. - - Deletes the tempfile. 
- """ - if os.path.exists(self._metadata_file): - os.remove(self._metadata_file) - - -def _assert_exist(filename): - """Checks if a file exists.""" - if not os.path.exists(filename): - raise IOError("File, '{0}', does not exist.".format(filename)) - - -def _assert_model_file_identifier(model_file): - """Checks if a model file has the expected TFLite schema identifier.""" - _assert_exist(model_file) - with open(model_file, "rb") as f: - model_buf = f.read() - - if not _schema_fb.Model.ModelBufferHasIdentifier(model_buf, 0): - raise ValueError( - "The model provided does not have the expected identifier, and " - "may not be a valid TFLite model.") - - -def _assert_metadata_file_identifier(metadata_file): - """Checks if a metadata file has the expected Metadata schema identifier.""" - _assert_exist(metadata_file) - with open(metadata_file, "rb") as f: - metadata_buf = f.read() - _assert_metadata_buffer_identifier(metadata_buf) - - -def _assert_metadata_buffer_identifier(metadata_buf): - """Checks if a metadata buffer has the expected Metadata schema identifier.""" - if not _metadata_fb.ModelMetadata.ModelMetadataBufferHasIdentifier( - metadata_buf, 0): - raise ValueError( - "The metadata buffer does not have the expected identifier, and may not" - " be a valid TFLite Metadata.") diff --git a/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template b/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template deleted file mode 100644 index a41ac06969c..00000000000 --- a/tensorflow/lite/experimental/support/metadata/metadata_parser.py.template +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Information about the metadata parser that this python library depends on.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -class MetadataParser(object): - """Information about the metadata parser.""" - - # The version of the metadata parser. - VERSION = "{LATEST_METADATA_PARSER_VERSION}" diff --git a/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py b/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py deleted file mode 100644 index 3b1d19278cd..00000000000 --- a/tensorflow/lite/experimental/support/metadata/metadata_parser_test.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for tensorflow.lite.experimental.support.metadata.metadata_parser.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re - -from tensorflow.lite.experimental.support.metadata import metadata_parser -from tensorflow.python.framework import test_util -from tensorflow.python.platform import test - - -class MetadataParserTest(test_util.TensorFlowTestCase): - - def test_version_wellFormedSemanticVersion(self): - # Validates that the version is well-formed (x.y.z). - self.assertTrue( - re.match('[0-9]+\\.[0-9]+\\.[0-9]+', - metadata_parser.MetadataParser.VERSION)) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs deleted file mode 100644 index a88225f1960..00000000000 --- a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs +++ /dev/null @@ -1,570 +0,0 @@ -// Copyright 2020 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -namespace tflite; - -// TFLite metadata contains both human readable and machine readable information -// about what the model does and how to use the model. It can be used as a -// README file, which elaborates the details of the model, each input/ouput -// tensor, and each associated file. -// -// An important use case of TFLite metadata is the TFLite codegen tool, which -// automatically generates the model interface based on the properties of the -// model and the tensors. The model interface provides high-level APIs to -// interact with the model, such as preprocessing the input data and running -// inferences. -// -// Entries marked with "" are used in TFLite codegen tool to -// generate the model interface. It is recommended to fill in at least those -// enties to boost the codegen performance. - -// The Metadata schema is versioned by the Semantic versioning number, such as -// MAJOR.MINOR.PATCH. It tracks the schema changes according to the rules below: -// * Bump up the MAJOR number when making potentially backwards incompatible -// changes. It must be incremented if the new changes break the backwards -// compatibility. It may also include minor and patch level changes as -// needed. The true backwards compatibility is indicated by the file -// identifier. -// * Bump up the MINOR number when making backwards compatible updates for -// major features, such as supporting new content types or adding new -// processing units. -// * Bump up the PATCH number when making small backwards compatible changes, -// such as adding a new fields or deprecating certain fields (not deleting -// them). 
-// -// ModelMetadata.min_parser_version indicates the minimum necessary metadata -// parser version to fully understand all fields in a given metadata flatbuffer. -// -// New fields and types will have associated comments with the schema version -// for which they were added. -// -// LINT.IfChange -// Schema Semantic version: 1.0.1 -// LINT.ThenChange(//tensorflow/lite/experimental/\ -//. support/metadata/java/src/java/org/tensorflow/lite/support/metadata/\ -//. MetadataParser.java) - -// This indicates the flatbuffer compatibility. The number will bump up when a -// break change is applied to the schema, such as removing fields or adding new -// fields to the middle of a table. -file_identifier "M001"; - -// History: -// 1.0.1 - Added VOCABULARY type to AssociatedFileType. - -// File extension of any written files. -file_extension "tflitemeta"; - -// LINT.IfChange -enum AssociatedFileType : byte { - UNKNOWN = 0, - - // Files such as readme.txt. - DESCRIPTIONS = 1, - - // Contains labels that annotate certain axis of the tensor. For example, - // the label file in image classification. Those labels annotate the - // the output tensor, such that each value in the output tensor is the - // probability of that corresponding category specified by the label. - // - // : - // If an output tensor has an associated file as TENSOR_AXIS_LABELS, return - // the output as a mapping between the labels and probability in the model - // interface. - // If multiple files of the same type are present, the first one is used by - // default; additional ones are to be distinguished from one another by their - // specified locale. - TENSOR_AXIS_LABELS = 2, - - // Contains labels that tensor values correspond to. For example, in - // the object detection model, one of the output tensors is the detected - // classes. And each value in the tensor refers to the index of label in the - // category label file. - // - // : - // If an output tensor has an associated file as TENSOR_VALUE_LABELS, convert - // the tensor values into labels, and return a list of string as the output. - // If multiple files of the same type are present, the first one is used by - // default; additional ones are to be distinguished from one another by their - // specified locale. - TENSOR_VALUE_LABELS = 3, - - // Contains sigmoid-based score calibration parameters, formatted as CSV. - // Lines contain for each index of an output tensor the scale, slope, offset - // and (optional) min_score parameters to be used for sigmoid fitting (in this - // order and in `strtof`-compatible [1] format). - // A line may be left empty to default calibrated scores for this index to - // default_score. - // In summary, each line should thus contain 0, 3 or 4 comma-separated values. - // - // See documentation for ScoreCalibrationOptions for details. - // - // [1]: https://en.cppreference.com/w/c/string/byte/strtof - TENSOR_AXIS_SCORE_CALIBRATION = 4, - - // Contains a list of unique words (characters separated by "\n" or in lines) - // that help to convert natural language words to embedding vectors. - // Added in: 1.0.1 - VOCABULARY = 5, -} - -table AssociatedFile { - // Name of this file. Need to be exact the same as the name of the actual file - // packed into the TFLite model as a zip file. - // - // : - // Locates to the actual file in the TFLite model. - name:string; - - // A description of what the file is. - description:string; - - // Type of the associated file. There may be special pre/post processing for - // some types. 
For example in image classification, a label file of the output - // will be used to convert object index into string. - // - // : - // Determines how to process the corresponding tensor. - type:AssociatedFileType; - - // An optional locale for this associated file (if applicable). It is - // recommended to use an ISO 639-1 letter code (e.g. "en" for English), - // optionally completed by a two letter region code (e.g. "en-US" for US - // English and "en-CA" for Canadian English). - // Leverage this in order to specify e.g multiple label files translated in - // different languages. - locale:string; -} - -// The basic content type for all tensors. -// -// : -// Input feature tensors: -// 1. Generates the method to load data from a TensorBuffer. -// 2. Creates the preprocessing logic. The default processing pipeline is: -// [NormalizeOp, QuantizeOp]. -// Output feature tensors: -// 1. Generates the method to return the output data to a TensorBuffer. -// 2. Creates the post-processing logic. The default processing pipeline is: -// [DeQuantizeOp]. -table FeatureProperties { -} - -// The type of color space of an image. -enum ColorSpaceType : byte { - UNKNOWN = 0, - RGB = 1, - GRAYSCALE = 2, -} - -table ImageSize { - width:uint; - height:uint; -} - -// The properties for image tensors. -// -// : -// Input image tensors: -// 1. Generates the method to load an image from a TensorImage. -// 2. Creates the preprocessing logic. The default processing pipeline is: -// [ResizeOp, NormalizeOp, QuantizeOp]. -// Output image tensors: -// 1. Generates the method to return the output data to a TensorImage. -// 2. Creates the post-processing logic. The default processing pipeline is: -// [DeQuantizeOp]. -table ImageProperties { - // The color space of the image. - // - // : - // Determines how to convert the color space of a given image from users. - color_space:ColorSpaceType; - - // Indicates the default value of image width and height if the tensor shape - // is dynamic. For fixed-size tensor, this size will be consistent with the - // expected size. - default_size:ImageSize; -} - -// The properties for tensors representing bounding boxes. -// -// : -// Input image tensors: NA. -// Output image tensors: parses the values into a data stucture that represents -// bounding boxes. For example, in the generated wrapper for Android, it returns -// the output as android.graphics.Rect objects. -enum BoundingBoxType : byte { - UNKNOWN = 0, - // Represents the bounding box by using the combination of boundaries, - // {left, top, right, bottom}. - // The default order is {left, top, right, bottom}. Other orders can be - // indicated by BoundingBoxProperties.index. - BOUNDARIES = 1, - - // Represents the bounding box by using the upper_left corner, width and - // height. - // The default order is {upper_left_x, upper_left_y, width, height}. Other - // orders can be indicated by BoundingBoxProperties.index. - UPPER_LEFT = 2, - - // Represents the bounding box by using the center of the box, width and - // height. The default order is {center_x, center_y, width, height}. Other - // orders can be indicated by BoundingBoxProperties.index. - CENTER = 3, - -} - -enum CoordinateType : byte { - // The coordinates are float values from 0 to 1. - RATIO = 0, - // The coordinates are integers. - PIXEL = 1, -} - -table BoundingBoxProperties { - // Denotes the order of the elements defined in each bounding box type. An - // empty index array represent the default order of each bounding box type. 
- // For example, to denote the default order of BOUNDARIES, {left, top, right, - // bottom}, the index should be {0, 1, 2, 3}. To denote the order {left, - // right, top, bottom}, the order should be {0, 2, 1, 3}. - // - // The index array can be applied to all bounding box types to adjust the - // order of their corresponding underlying elements. - // - // : - // Indicates how to parse the bounding box values. - index:[uint]; - - // : - // Indicates how to parse the bounding box values. - type:BoundingBoxType; - - // : - // Indicates how to convert the bounding box back to the original image in - // pixels. - coordinate_type:CoordinateType; -} - -union ContentProperties { - FeatureProperties, - ImageProperties, - BoundingBoxProperties, -} - -table ValueRange { - min:int; - max:int; -} - -table Content { - // The properties that the content may have, indicating the type of the - // Content. - // - // : - // Indicates how to process the tensor. - content_properties:ContentProperties; - - // The range of dimensions that the content corresponds to. A NULL - // "range" indicates that the content uses up all dimensions, - // except the batch axis if applied. - // - // Here are all the possible situations of how a tensor is composed. - // Case 1: The tensor is a single object, such as an image. - // For example, the input of an image classifier - // (https://www.tensorflow.org/lite/models/image_classification/overview), - // a tensor of shape [1, 224, 224, 3]. Dimensions 1 to 3 correspond to the - // image. Since dimension 0 is a batch axis, which can be ignored, - // "range" can be left as NULL. - // - // Case 2: The tensor contains multiple instances of the same object. - // For example, the output tensor of detected bounding boxes of an object - // detection model - // (https://www.tensorflow.org/lite/models/object_detection/overview). - // The tensor shape is [1, 10, 4]. Here is the what the three dimensions - // represent for: - // dimension 0: the batch axis. - // dimension 1: the 10 objects detected with the highest confidence. - // dimension 2: the bounding boxes of the 10 detected objects. - // The tensor is essentially 10 bounding boxes. In this case, - // "range" should be {min=2; max=2;}. - // Another example is the pose estimation model - // (https://www.tensorflow.org/lite/models/pose_estimation/overview). - // The output tensor of heatmaps is in the shape of [1, 9, 9, 17]. - // Here is the what the four dimensions represent for: - // dimension 0: the batch axis. - // dimension 1/2: the heatmap image. - // dimension 3: 17 body parts of a person. - // Even though the last axis is body part, the real content of this tensor is - // the heatmap. "range" should be [min=1; max=2]. - // - // Case 3: The tensor contains multiple different objects. (Not supported by - // Content at this point). - // Sometimes a tensor may contain multiple different objects, thus different - // contents. It is very common for regression models. For example, a model - // to predict the fuel efficiency - // (https://www.tensorflow.org/tutorials/keras/regression). - // The input tensor has shape [1, 9], consisting of 9 features, such as - // "Cylinders", "Displacement", "Weight", etc. In this case, dimension 1 - // contains 9 different contents. However, since these sub-dimension objects - // barely need to be specifically processed, their contents are not recorded - // in the metadata. Through, the name of each dimension can be set through - // TensorMetadata.dimension_names. 
- // - // Note that if it is not case 3, a tensor can only have one content type. - // - // : - // Case 1: return a processed single object of certain content type. - // Case 2: return a list of processed objects of certain content type. The - // generated model interface have API to random access those objects from - // the output. - range:ValueRange; -} - -// Parameters that are used when normalizing the tensor. -table NormalizationOptions{ - // mean and std are normalization parameters. Tensor values are normalized - // on a per-channel basis, by the formula - // (x - mean) / std. - // If there is only one value in mean or std, we'll propogate the value to - // all channels. - // - // Quantized models share the same normalization parameters as their - // corresponding float models. For example, an image input tensor may have - // the normalization parameter of - // mean = 127.5f and std = 127.5f. - // The image value will be normalized from [0, 255] to [-1, 1]. - // Then, for quantized models, the image data should be further quantized - // according to the quantization parameters. In the case of uint8, the image - // data will be scaled back to [0, 255], while for int8, the image data will - // be scaled to [-128, 127]. - // - // Both the normalization parameters and quantization parameters can be - // retrieved through the metadata extractor library. - // TODO(b/156644598): add link for the metadata extractor library. - - // Per-channel mean of the possible values used in normalization. - // - // : - // Apply normalization to input tensors accordingly. - mean:[float]; - - // Per-channel standard dev. of the possible values used in normalization. - // - // : - // Apply normalization to input tensors accordingly. - std:[float]; -} - -// The different possible score transforms to apply to uncalibrated scores -// before applying score calibration. -enum ScoreTransformationType : byte { - // Identity function: g(x) = x. - IDENTITY = 0, - // Log function: g(x) = log(x). - LOG = 1, - // Inverse logistic function: g(x) = log(x) - log(1-x). - INVERSE_LOGISTIC = 2, -} - -// Options to perform score calibration on an output tensor through sigmoid -// functions. One of the main purposes of score calibration is to make scores -// across classes comparable, so that a common threshold can be used for all -// output classes. This is meant for models producing class predictions as -// output, e.g. image classification or detection models. -// -// For each index in the output tensor, this applies: -// * `f(x) = scale / (1 + e^-(slope*g(x)+offset))` if `x > min_score` or if no -// `min_score` has been specified, -// * `f(x) = default_score` otherwise or if no scale, slope and offset have been -// specified. -// Where: -// * scale, slope, offset and (optional) min_score are index-specific parameters -// * g(x) is an index-independent transform among those defined in -// ScoreTransformationType -// * default_score is an index-independent parameter. -// An AssociatedFile with type TANSOR_AXIS_SCORE_CALIBRATION specifying the -// index-specific parameters must be associated with the corresponding -// TensorMetadata for score calibration be applied. -table ScoreCalibrationOptions { - // The function to use for transforming the uncalibrated score before - // applying score calibration. - score_transformation:ScoreTransformationType; - - // The default calibrated score to apply if the uncalibrated score is - // below min_score or if no parameters were specified for a given index. 
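For readers skimming the schema, a minimal Python sketch of the calibration formula described above, assuming the identity transform by default; the function name and argument handling are illustrative rather than part of the schema.

```python
import math

def calibrated_score(x, scale, slope, offset, min_score=None,
                     default_score=0.0, transform=lambda v: v):
    # f(x) = scale / (1 + e^-(slope * g(x) + offset)) when x > min_score or when
    # no min_score is given; otherwise the calibrated score falls back to
    # default_score (as it also does when no scale/slope/offset are specified).
    if min_score is not None and x <= min_score:
        return default_score
    return scale / (1.0 + math.exp(-(slope * transform(x) + offset)))

# Identity transform, g(x) = x: roughly 0.69 for an uncalibrated score of 0.8.
print(calibrated_score(0.8, scale=1.0, slope=1.0, offset=0.0))
```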
- default_score:float; -} - -// Performs thresholding on output tensor values, in order to filter out -// low-confidence results. -table ScoreThresholdingOptions { - // The recommended global threshold below which results are considered - // low-confidence and should be filtered out. - global_score_threshold:float; -} - -// Options that are used when processing the tensor. -union ProcessUnitOptions { - NormalizationOptions, - ScoreCalibrationOptions, - ScoreThresholdingOptions, -} - -// A process unit that is used to process the tensor out-of-graph. -table ProcessUnit { - options:ProcessUnitOptions; -} - - -// Statistics to describe a tensor. -table Stats { - // Max and min are not currently used in tflite.support codegen. They mainly - // serve as references for users to better understand the model. They can also - // be used to validate model pre/post processing results. - // If there is only one value in max or min, we'll propogate the value to - // all channels. - - // Per-channel maximum value of the tensor. - max:[float]; - - // Per-channel minimum value of the tensor. - min:[float]; -} - -// Detailed information of an input or output tensor. -table TensorMetadata { - // Name of the tensor. - // - // : - // The name of this tensor in the generated model interface. - name:string; - - // A description of the tensor. - description:string; - - // A list of names of the dimensions in this tensor. The length of - // dimension_names need to match the number of dimensions in this tensor. - // - // : - // The name of each dimension in the generated model interface. See "Case 2" - // in the comments of Content.range. - dimension_names:[string]; - - // The content that represents this tensor. - // - // : - // Determines how to process this tensor. See each item in ContentProperties - // for the default process units that will be applied to the tensor. - content:Content; - - // The process units that are used to process the tensor out-of-graph. - // - // : - // Contains the parameters of the default processing pipeline for each content - // type, such as the normalization parameters in all content types. See the - // items under ContentProperties for the details of the default processing - // pipeline. - process_units:[ProcessUnit]; - - // The statistics of the tensor values. - stats:Stats; - - // A list of associated files of this tensor. - // - // : - // Contains processing parameters of this tensor, such as normalization. - associated_files:[AssociatedFile]; -} - -table SubGraphMetadata { - // Name of the subgraph. - // - // Note that, since TFLite only support one subgraph at this moment, the - // Codegen tool will use the name in ModelMetadata in the generated model - // interface. - name:string; - - // A description explains details about what the subgraph does. - description:string; - - // Metadata of all input tensors used in this subgraph. It matches extactly - // the input tensors specified by `SubGraph.inputs` in the TFLite - // schema.fbs file[2]. The number of `TensorMetadata` in the array should - // equal to the number of indices in `SubGraph.inputs`. - // - // [2]: tensorflow/lite/schema/schema.fbs - // : - // Determines how to process the inputs. - input_tensor_metadata:[TensorMetadata]; - - // Metadata of all output tensors used in this subgraph. It matches extactly - // the output tensors specified by `SubGraph.outputs` in the TFLite - // schema.fbs file[2]. The number of `TensorMetadata` in the array should - // equal to the number of indices in `SubGraph.outputs`. 
- // - // : - // Determines how to process the outputs. - output_tensor_metadata:[TensorMetadata]; - - // A list of associated files of this subgraph. - associated_files:[AssociatedFile]; -} - -table ModelMetadata { - // Name of the model. - // - // : - // The name of the model in the generated model interface. - name:string; - - // Model description in schema. - description:string; - - // Version of the model that specified by model creators. - version:string; - - // Noted that, the minimum required TFLite runtime version that the model is - // compatible with, has already been added as a metadata entry in tflite - // schema. We'll decide later if we want to move it here, and keep it with - // other metadata entries. - - // Metadata of all the subgraphs of the model. The 0th is assumed to be the - // main subgraph. - // - // : - // Determines how to process the inputs and outputs. - subgraph_metadata:[SubGraphMetadata]; - - // The person who creates this model. - author:string; - - // Licenses that may apply to this model. - license:string; - - // A list of associated files of this model. - associated_files:[AssociatedFile]; - - // The minimum metadata parser version that can fully understand the fields in - // the metadata flatbuffer. The version is effectively the largest version - // number among the versions of all the fields populated and the smallest - // compatible version indicated by the file identifier. - // - // This field is automaticaly populated by the MetadataPopulator when - // the metadata is populated into a TFLite model. - min_parser_version:string; -} -// LINT.ThenChange(//tensorflow/lite/experimental/\ -// support/metadata/cc/metadata_version.cc) - -root_type ModelMetadata; diff --git a/tensorflow/lite/experimental/support/metadata/metadata_test.py b/tensorflow/lite/experimental/support/metadata/metadata_test.py deleted file mode 100644 index 28395041746..00000000000 --- a/tensorflow/lite/experimental/support/metadata/metadata_test.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for tensorflow.lite.experimental.support.metadata.metadata.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import six - -from flatbuffers.python import flatbuffers -from tensorflow.lite.experimental.support.metadata import metadata as _metadata -from tensorflow.lite.experimental.support.metadata import metadata_schema_py_generated as _metadata_fb -from tensorflow.lite.experimental.support.metadata import schema_py_generated as _schema_fb -from tensorflow.python.framework import test_util -from tensorflow.python.platform import resource_loader -from tensorflow.python.platform import test - - -class MetadataTest(test_util.TensorFlowTestCase): - - def setUp(self): - super(MetadataTest, self).setUp() - self._invalid_model_buf = None - self._invalid_file = "not_existed_file" - self._empty_model_buf = self._create_empty_model_buf() - self._empty_model_file = self.create_tempfile().full_path - with open(self._empty_model_file, "wb") as f: - f.write(self._empty_model_buf) - self._model_file = self._create_model_file_with_metadata_and_buf_fields() - self._metadata_file = self._create_metadata_file() - self._metadata_file_with_version = self._create_metadata_file_with_version( - self._metadata_file, "1.0.0") - self._file1 = self.create_tempfile("file1").full_path - self._file2 = self.create_tempfile("file2").full_path - self._file3 = self.create_tempfile("file3").full_path - - def _create_empty_model_buf(self): - model = _schema_fb.ModelT() - model_builder = flatbuffers.Builder(0) - model_builder.Finish( - model.Pack(model_builder), - _metadata.MetadataPopulator.TFLITE_FILE_IDENTIFIER) - return model_builder.Output() - - def _create_model_file_with_metadata_and_buf_fields(self): - metadata_field = _schema_fb.MetadataT() - metadata_field.name = "meta" - buffer_field = _schema_fb.BufferT() - model = _schema_fb.ModelT() - model.metadata = [metadata_field, metadata_field] - model.buffers = [buffer_field, buffer_field, buffer_field] - model_builder = flatbuffers.Builder(0) - model_builder.Finish( - model.Pack(model_builder), - _metadata.MetadataPopulator.TFLITE_FILE_IDENTIFIER) - - mnodel_file = self.create_tempfile().full_path - with open(mnodel_file, "wb") as f: - f.write(model_builder.Output()) - - return mnodel_file - - def _create_metadata_file(self): - associated_file1 = _metadata_fb.AssociatedFileT() - associated_file1.name = b"file1" - associated_file2 = _metadata_fb.AssociatedFileT() - associated_file2.name = b"file2" - self.expected_recorded_files = [ - six.ensure_str(associated_file1.name), - six.ensure_str(associated_file2.name) - ] - - output_meta = _metadata_fb.TensorMetadataT() - output_meta.associatedFiles = [associated_file2] - subgraph = _metadata_fb.SubGraphMetadataT() - subgraph.outputTensorMetadata = [output_meta] - - model_meta = _metadata_fb.ModelMetadataT() - model_meta.name = "Mobilenet_quantized" - model_meta.associatedFiles = [associated_file1] - model_meta.subgraphMetadata = [subgraph] - b = flatbuffers.Builder(0) - b.Finish( - model_meta.Pack(b), - _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER) - - metadata_file = self.create_tempfile().full_path - with open(metadata_file, "wb") as f: - f.write(b.Output()) - return metadata_file - - def _create_model_buffer_with_wrong_identifier(self): - wrong_identifier = b"widn" - model = _schema_fb.ModelT() - model_builder = flatbuffers.Builder(0) - 
model_builder.Finish(model.Pack(model_builder), wrong_identifier) - return model_builder.Output() - - def _create_metadata_buffer_with_wrong_identifier(self): - # Creates a metadata with wrong identifier - wrong_identifier = b"widn" - metadata = _metadata_fb.ModelMetadataT() - metadata_builder = flatbuffers.Builder(0) - metadata_builder.Finish(metadata.Pack(metadata_builder), wrong_identifier) - return metadata_builder.Output() - - def _populate_metadata_with_identifier(self, model_buf, metadata_buf, - identifier): - # For testing purposes only. MetadataPopulator cannot populate metadata with - # wrong identifiers. - model = _schema_fb.ModelT.InitFromObj( - _schema_fb.Model.GetRootAsModel(model_buf, 0)) - buffer_field = _schema_fb.BufferT() - buffer_field.data = metadata_buf - model.buffers = [buffer_field] - # Creates a new metadata field. - metadata_field = _schema_fb.MetadataT() - metadata_field.name = _metadata.MetadataPopulator.METADATA_FIELD_NAME - metadata_field.buffer = len(model.buffers) - 1 - model.metadata = [metadata_field] - b = flatbuffers.Builder(0) - b.Finish(model.Pack(b), identifier) - return b.Output() - - def _create_metadata_file_with_version(self, metadata_file, min_version): - # Creates a new metadata file with the specified min_version for testing - # purposes. - with open(metadata_file, "rb") as f: - metadata_buf = bytearray(f.read()) - - metadata = _metadata_fb.ModelMetadataT.InitFromObj( - _metadata_fb.ModelMetadata.GetRootAsModelMetadata(metadata_buf, 0)) - metadata.minParserVersion = min_version - - b = flatbuffers.Builder(0) - b.Finish( - metadata.Pack(b), _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER) - - metadata_file_with_version = self.create_tempfile().full_path - with open(metadata_file_with_version, "wb") as f: - f.write(b.Output()) - return metadata_file_with_version - - -class MetadataPopulatorTest(MetadataTest): - - def testToValidModelFile(self): - populator = _metadata.MetadataPopulator.with_model_file( - self._empty_model_file) - self.assertIsInstance(populator, _metadata.MetadataPopulator) - - def testToInvalidModelFile(self): - with self.assertRaises(IOError) as error: - _metadata.MetadataPopulator.with_model_file(self._invalid_file) - self.assertEqual("File, '{0}', does not exist.".format(self._invalid_file), - str(error.exception)) - - def testToValidModelBuffer(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - self.assertIsInstance(populator, _metadata.MetadataPopulator) - - def testToInvalidModelBuffer(self): - with self.assertRaises(ValueError) as error: - _metadata.MetadataPopulator.with_model_buffer(self._invalid_model_buf) - self.assertEqual("model_buf cannot be empty.", str(error.exception)) - - def testToModelBufferWithWrongIdentifier(self): - model_buf = self._create_model_buffer_with_wrong_identifier() - with self.assertRaises(ValueError) as error: - _metadata.MetadataPopulator.with_model_buffer(model_buf) - self.assertEqual( - "The model provided does not have the expected identifier, and " - "may not be a valid TFLite model.", str(error.exception)) - - def testSinglePopulateAssociatedFile(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - populator.load_associated_files([self._file1]) - populator.populate() - - packed_files = populator.get_packed_associated_file_list() - expected_packed_files = [os.path.basename(self._file1)] - self.assertEqual(set(packed_files), set(expected_packed_files)) - - def 
testRepeatedPopulateAssociatedFile(self): - populator = _metadata.MetadataPopulator.with_model_file( - self._empty_model_file) - populator.load_associated_files([self._file1, self._file2]) - # Loads file2 multiple times. - populator.load_associated_files([self._file2]) - populator.populate() - - packed_files = populator.get_packed_associated_file_list() - expected_packed_files = [ - os.path.basename(self._file1), - os.path.basename(self._file2) - ] - self.assertEqual(len(packed_files), 2) - self.assertEqual(set(packed_files), set(expected_packed_files)) - - # Check if the model buffer read from file is the same as that read from - # get_model_buffer(). - with open(self._empty_model_file, "rb") as f: - model_buf_from_file = f.read() - model_buf_from_getter = populator.get_model_buffer() - self.assertEqual(model_buf_from_file, model_buf_from_getter) - - def testPopulateInvalidAssociatedFile(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - with self.assertRaises(IOError) as error: - populator.load_associated_files([self._invalid_file]) - self.assertEqual("File, '{0}', does not exist.".format(self._invalid_file), - str(error.exception)) - - def testPopulatePackedAssociatedFile(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - populator.load_associated_files([self._file1]) - populator.populate() - with self.assertRaises(ValueError) as error: - populator.load_associated_files([self._file1]) - populator.populate() - self.assertEqual( - "File, '{0}', has already been packed.".format( - os.path.basename(self._file1)), str(error.exception)) - - def testGetPackedAssociatedFileList(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - packed_files = populator.get_packed_associated_file_list() - self.assertEqual(packed_files, []) - - def testPopulateMetadataFileToEmptyModelFile(self): - populator = _metadata.MetadataPopulator.with_model_file( - self._empty_model_file) - populator.load_metadata_file(self._metadata_file) - populator.load_associated_files([self._file1, self._file2]) - populator.populate() - - with open(self._empty_model_file, "rb") as f: - model_buf_from_file = f.read() - model = _schema_fb.Model.GetRootAsModel(model_buf_from_file, 0) - metadata_field = model.Metadata(0) - self.assertEqual( - six.ensure_str(metadata_field.Name()), - six.ensure_str(_metadata.MetadataPopulator.METADATA_FIELD_NAME)) - - buffer_index = metadata_field.Buffer() - buffer_data = model.Buffers(buffer_index) - metadata_buf_np = buffer_data.DataAsNumpy() - metadata_buf = metadata_buf_np.tobytes() - with open(self._metadata_file_with_version, "rb") as f: - expected_metadata_buf = bytearray(f.read()) - self.assertEqual(metadata_buf, expected_metadata_buf) - - recorded_files = populator.get_recorded_associated_file_list() - self.assertEqual(set(recorded_files), set(self.expected_recorded_files)) - - # Up to now, we've proved the correctness of the model buffer that read from - # file. Then we'll test if get_model_buffer() gives the same model buffer. - model_buf_from_getter = populator.get_model_buffer() - self.assertEqual(model_buf_from_file, model_buf_from_getter) - - def testPopulateMetadataFileWithoutAssociatedFiles(self): - populator = _metadata.MetadataPopulator.with_model_file( - self._empty_model_file) - populator.load_metadata_file(self._metadata_file) - populator.load_associated_files([self._file1]) - # Suppose to populate self._file2, because it is recorded in the metadta. 
- with self.assertRaises(ValueError) as error: - populator.populate() - self.assertEqual(("File, '{0}', is recorded in the metadata, but has " - "not been loaded into the populator.").format( - os.path.basename(self._file2)), str(error.exception)) - - def testPopulateMetadataBufferWithWrongIdentifier(self): - metadata_buf = self._create_metadata_buffer_with_wrong_identifier() - populator = _metadata.MetadataPopulator.with_model_file(self._model_file) - with self.assertRaises(ValueError) as error: - populator.load_metadata_buffer(metadata_buf) - self.assertEqual( - "The metadata buffer does not have the expected identifier, and may not" - " be a valid TFLite Metadata.", str(error.exception)) - - def _assert_golden_metadata(self, model_file): - with open(model_file, "rb") as f: - model_buf_from_file = f.read() - model = _schema_fb.Model.GetRootAsModel(model_buf_from_file, 0) - # There are two elements in model.Metadata array before the population. - # Metadata should be packed to the third element in the array. - metadata_field = model.Metadata(2) - self.assertEqual( - six.ensure_str(metadata_field.Name()), - six.ensure_str(_metadata.MetadataPopulator.METADATA_FIELD_NAME)) - - buffer_index = metadata_field.Buffer() - buffer_data = model.Buffers(buffer_index) - metadata_buf_np = buffer_data.DataAsNumpy() - metadata_buf = metadata_buf_np.tobytes() - with open(self._metadata_file_with_version, "rb") as f: - expected_metadata_buf = bytearray(f.read()) - self.assertEqual(metadata_buf, expected_metadata_buf) - - def testPopulateMetadataFileToModelWithMetadataAndAssociatedFiles(self): - # First, creates a dummy metadata. Populates it and the associated files - # into the model. - model_meta = _metadata_fb.ModelMetadataT() - model_meta.name = "Mobilenet_quantized" - b = flatbuffers.Builder(0) - b.Finish( - model_meta.Pack(b), - _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER) - metadata_buf = b.Output() - - populator1 = _metadata.MetadataPopulator.with_model_file(self._model_file) - populator1.load_metadata_buffer(metadata_buf) - populator1.load_associated_files([self._file1, self._file2]) - populator1.populate() - - # Then, populates the metadata again. - populator2 = _metadata.MetadataPopulator.with_model_file(self._model_file) - populator2.load_metadata_file(self._metadata_file) - populator2.populate() - - # Tests if the metadata is populated correctly. - self._assert_golden_metadata(self._model_file) - - def testPopulateMetadataFileToModelFileWithMetadataAndBufFields(self): - populator = _metadata.MetadataPopulator.with_model_file(self._model_file) - populator.load_metadata_file(self._metadata_file) - populator.load_associated_files([self._file1, self._file2]) - populator.populate() - - # Tests if the metadata is populated correctly. - self._assert_golden_metadata(self._model_file) - - recorded_files = populator.get_recorded_associated_file_list() - self.assertEqual(set(recorded_files), set(self.expected_recorded_files)) - - # Up to now, we've proved the correctness of the model buffer that read from - # file. Then we'll test if get_model_buffer() gives the same model buffer. 
- with open(self._model_file, "rb") as f: - model_buf_from_file = f.read() - model_buf_from_getter = populator.get_model_buffer() - self.assertEqual(model_buf_from_file, model_buf_from_getter) - - def testPopulateInvalidMetadataFile(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - with self.assertRaises(IOError) as error: - populator.load_metadata_file(self._invalid_file) - self.assertEqual("File, '{0}', does not exist.".format(self._invalid_file), - str(error.exception)) - - def testPopulateInvalidMetadataBuffer(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - with self.assertRaises(ValueError) as error: - populator.load_metadata_buffer([]) - self.assertEqual("The metadata to be populated is empty.", - str(error.exception)) - - def testGetModelBufferBeforePopulatingData(self): - populator = _metadata.MetadataPopulator.with_model_buffer( - self._empty_model_buf) - model_buf = populator.get_model_buffer() - expected_model_buf = self._empty_model_buf - self.assertEqual(model_buf, expected_model_buf) - - -class MetadataDisplayerTest(MetadataTest): - - def setUp(self): - super(MetadataDisplayerTest, self).setUp() - self._model_file = self._create_model_with_metadata_and_associated_files() - - def _create_model_with_metadata_and_associated_files(self): - model_buf = self._create_empty_model_buf() - model_file = self.create_tempfile().full_path - with open(model_file, "wb") as f: - f.write(model_buf) - - populator = _metadata.MetadataPopulator.with_model_file(model_file) - populator.load_metadata_file(self._metadata_file) - populator.load_associated_files([self._file1, self._file2]) - populator.populate() - return model_file - - def test_load_model_buffer_metadataBufferWithWrongIdentifier_throwsException( - self): - model_buf = self._create_model_buffer_with_wrong_identifier() - metadata_buf = self._create_metadata_buffer_with_wrong_identifier() - model_buf = self._populate_metadata_with_identifier( - model_buf, metadata_buf, - _metadata.MetadataPopulator.TFLITE_FILE_IDENTIFIER) - with self.assertRaises(ValueError) as error: - _metadata.MetadataDisplayer.with_model_buffer(model_buf) - self.assertEqual( - "The metadata buffer does not have the expected identifier, and may not" - " be a valid TFLite Metadata.", str(error.exception)) - - def test_load_model_buffer_modelBufferWithWrongIdentifier_throwsException( - self): - model_buf = self._create_model_buffer_with_wrong_identifier() - metadata_file = self._create_metadata_file() - wrong_identifier = b"widn" - with open(metadata_file, "rb") as f: - metadata_buf = bytearray(f.read()) - model_buf = self._populate_metadata_with_identifier(model_buf, metadata_buf, - wrong_identifier) - with self.assertRaises(ValueError) as error: - _metadata.MetadataDisplayer.with_model_buffer(model_buf) - self.assertEqual( - "The model provided does not have the expected identifier, and " - "may not be a valid TFLite model.", str(error.exception)) - - def test_load_model_file_invalidModelFile_throwsException(self): - with self.assertRaises(IOError) as error: - _metadata.MetadataDisplayer.with_model_file(self._invalid_file) - self.assertEqual("File, '{0}', does not exist.".format(self._invalid_file), - str(error.exception)) - - def test_load_model_file_modelWithoutMetadata_throwsException(self): - with self.assertRaises(ValueError) as error: - _metadata.MetadataDisplayer.with_model_file(self._empty_model_file) - self.assertEqual("The model does not have metadata.", 
str(error.exception)) - - def test_load_model_file_modelWithMetadata(self): - displayer = _metadata.MetadataDisplayer.with_model_file(self._model_file) - self.assertIsInstance(displayer, _metadata.MetadataDisplayer) - - def test_load_model_buffer_modelWithOutMetadata_throwsException(self): - with self.assertRaises(ValueError) as error: - _metadata.MetadataDisplayer.with_model_buffer( - self._create_empty_model_buf()) - self.assertEqual("The model does not have metadata.", str(error.exception)) - - def test_load_model_buffer_modelWithMetadata(self): - displayer = _metadata.MetadataDisplayer.with_model_buffer( - open(self._model_file, "rb").read()) - self.assertIsInstance(displayer, _metadata.MetadataDisplayer) - - def test_get_metadata_json_modelWithMetadata(self): - displayer = _metadata.MetadataDisplayer.with_model_file(self._model_file) - actual = displayer.get_metadata_json() - - # Verifies the generated json file. - golden_json_file_path = resource_loader.get_path_to_datafile( - "testdata/golden_json.json") - with open(golden_json_file_path, "r") as f: - expected = f.read() - self.assertEqual(actual, expected) - - def test_get_packed_associated_file_list_modelWithMetadata(self): - displayer = _metadata.MetadataDisplayer.with_model_file(self._model_file) - packed_files = displayer.get_packed_associated_file_list() - - expected_packed_files = [ - os.path.basename(self._file1), - os.path.basename(self._file2) - ] - self.assertEqual(len(packed_files), 2) - self.assertEqual(set(packed_files), set(expected_packed_files)) - - -if __name__ == "__main__": - test.main() diff --git a/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json b/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json deleted file mode 100644 index 9ff5581fbff..00000000000 --- a/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "Mobilenet_quantized", - "subgraph_metadata": [ - { - "output_tensor_metadata": [ - { - "associated_files": [ - { - "name": "file2" - } - ] - } - ] - } - ], - "associated_files": [ - { - "name": "file1" - } - ], - "min_parser_version": "1.0.0" -} From 50eb68909561ac674ab905f52f02828fd71d05a3 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Tue, 30 Jun 2020 19:58:57 -0700 Subject: [PATCH 1370/1390] More closely mimic the tf.Tensor api from KerasTensor, and mark tf op layers as being loaded from the configs in savedmodels. This fixes issues with more subtle usages of Keras functional models when we enable KerasTensors. (E.g. `is_tensor(keras_tensor)` will now return True, like how it does for functional models in head. This is important in code that relies heavily on op layers). 
PiperOrigin-RevId: 319148736 Change-Id: I209583f049dfe08b12ee51fd396e6fe21075aee0 --- .../python/keras/engine/keras_tensor.py | 35 +++++++++++++++++++ tensorflow/python/keras/layers/core.py | 3 ++ tensorflow/python/keras/losses.py | 5 +-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/engine/keras_tensor.py b/tensorflow/python/keras/engine/keras_tensor.py index 50d377c5292..98560aa8e46 100644 --- a/tensorflow/python/keras/engine/keras_tensor.py +++ b/tensorflow/python/keras/engine/keras_tensor.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec as type_spec_module from tensorflow.python.ops import array_ops @@ -122,6 +123,35 @@ class KerasTensor(object): def get_shape(self): return self.shape + def __len__(self): + raise TypeError('Symbolic Functional model inputs/outputs in Keras do not ' + 'implement `__len__`. You may be ' + 'seeing this error if you are passing it ' + 'to a TF API that does not support dispatching to Keras ' + 'lambda layers.') + + @property + def is_tensor_like(self): + return True + + def set_shape(self, shape): + """Updates the shape of this KerasTensor. Mimics `tf.Tensor.set_shape()`.""" + if not isinstance(shape, tensor_shape.TensorShape): + shape = tensor_shape.TensorShape(shape) + if shape.dims is not None: + dim_list = [dim.value for dim in shape.dims] + for dim in range(len(dim_list)): + if dim_list[dim] is None and self.shape.dims is not None: + dim_list[dim] = self.shape.dims[dim] + shape = tensor_shape.TensorShape(dim_list) + if not self.shape.is_compatible_with(shape): + raise ValueError( + "Keras Intermediate Value's shape %s is not" + "compatible with supplied shape %s" % + (self.shape, shape)) + else: + self._type_spec._shape = shape # pylint: disable=protected-access + @property def dtype(self): """Returns the `dtype` of elements in the tensor.""" @@ -168,6 +198,11 @@ class KerasTensor(object): for operator in ops.Tensor.OVERLOADABLE_OPERATORS: cls._overload_operator(operator) + # We include `experimental_ref` for versions of TensorFlow that + # still include the deprecated method in Tensors. + if hasattr(ops.Tensor, 'experimental_ref'): + cls._overload_operator('experimental_ref') + @classmethod def _overload_operator(cls, operator): # pylint: disable=invalid-name """Overload an operator with the same overloading as `ops.Tensor`. diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index e64a1c27bcf..292c85560b4 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -1298,6 +1298,9 @@ class TFOpLambda(Layer): return self._call_wrapper(*args, **kwargs) self.call = tf_decorator.make_decorator(function, _call_wrapper) + # Do not individually trace op layers in the SavedModel. + self._must_restore_from_config = True + super(TFOpLambda, self).__init__(**kwargs) # Warning on every invocation will be quite irksome in Eager mode. diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index d1f406fc40f..9b162560120 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -247,8 +247,9 @@ class LossFunctionWrapper(Loss): Loss values per sample. 
""" if tensor_util.is_tensor(y_pred) and tensor_util.is_tensor(y_true): - y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions( - y_pred, y_true) + if not K.is_keras_tensor(y_pred) and not K.is_keras_tensor(y_true): + y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions( + y_pred, y_true) ag_fn = autograph.tf_convert(self.fn, ag_ctx.control_status_ctx()) return ag_fn(y_true, y_pred, **self._fn_kwargs) From 3a65a5e05b91f52327f2eff2d74d058f0f351197 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 30 Jun 2020 21:00:14 -0700 Subject: [PATCH 1371/1390] More closely mimic the tf.Tensor api from KerasTensor, and mark tf op layers as being loaded from the configs in savedmodels. This fixes issues with more subtle usages of Keras functional models when we enable KerasTensors. (E.g. `is_tensor(keras_tensor)` will now return True, like how it does for functional models in head. This is important in code that relies heavily on op layers). PiperOrigin-RevId: 319155010 Change-Id: Iabb06e84b5b2a0b0385a76b6982cd649e1d16816 --- .../python/keras/engine/keras_tensor.py | 35 ------------------- tensorflow/python/keras/layers/core.py | 3 -- tensorflow/python/keras/losses.py | 5 ++- 3 files changed, 2 insertions(+), 41 deletions(-) diff --git a/tensorflow/python/keras/engine/keras_tensor.py b/tensorflow/python/keras/engine/keras_tensor.py index 98560aa8e46..50d377c5292 100644 --- a/tensorflow/python/keras/engine/keras_tensor.py +++ b/tensorflow/python/keras/engine/keras_tensor.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec as type_spec_module from tensorflow.python.ops import array_ops @@ -123,35 +122,6 @@ class KerasTensor(object): def get_shape(self): return self.shape - def __len__(self): - raise TypeError('Symbolic Functional model inputs/outputs in Keras do not ' - 'implement `__len__`. You may be ' - 'seeing this error if you are passing it ' - 'to a TF API that does not support dispatching to Keras ' - 'lambda layers.') - - @property - def is_tensor_like(self): - return True - - def set_shape(self, shape): - """Updates the shape of this KerasTensor. Mimics `tf.Tensor.set_shape()`.""" - if not isinstance(shape, tensor_shape.TensorShape): - shape = tensor_shape.TensorShape(shape) - if shape.dims is not None: - dim_list = [dim.value for dim in shape.dims] - for dim in range(len(dim_list)): - if dim_list[dim] is None and self.shape.dims is not None: - dim_list[dim] = self.shape.dims[dim] - shape = tensor_shape.TensorShape(dim_list) - if not self.shape.is_compatible_with(shape): - raise ValueError( - "Keras Intermediate Value's shape %s is not" - "compatible with supplied shape %s" % - (self.shape, shape)) - else: - self._type_spec._shape = shape # pylint: disable=protected-access - @property def dtype(self): """Returns the `dtype` of elements in the tensor.""" @@ -198,11 +168,6 @@ class KerasTensor(object): for operator in ops.Tensor.OVERLOADABLE_OPERATORS: cls._overload_operator(operator) - # We include `experimental_ref` for versions of TensorFlow that - # still include the deprecated method in Tensors. 
- if hasattr(ops.Tensor, 'experimental_ref'): - cls._overload_operator('experimental_ref') - @classmethod def _overload_operator(cls, operator): # pylint: disable=invalid-name """Overload an operator with the same overloading as `ops.Tensor`. diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 292c85560b4..e64a1c27bcf 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -1298,9 +1298,6 @@ class TFOpLambda(Layer): return self._call_wrapper(*args, **kwargs) self.call = tf_decorator.make_decorator(function, _call_wrapper) - # Do not individually trace op layers in the SavedModel. - self._must_restore_from_config = True - super(TFOpLambda, self).__init__(**kwargs) # Warning on every invocation will be quite irksome in Eager mode. diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 9b162560120..d1f406fc40f 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -247,9 +247,8 @@ class LossFunctionWrapper(Loss): Loss values per sample. """ if tensor_util.is_tensor(y_pred) and tensor_util.is_tensor(y_true): - if not K.is_keras_tensor(y_pred) and not K.is_keras_tensor(y_true): - y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions( - y_pred, y_true) + y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions( + y_pred, y_true) ag_fn = autograph.tf_convert(self.fn, ag_ctx.control_status_ctx()) return ag_fn(y_true, y_pred, **self._fn_kwargs) From 116092e99a146375bb9edd7462272bf6981ed2b9 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Tue, 30 Jun 2020 21:09:57 -0700 Subject: [PATCH 1372/1390] Add element tracing for map_and_batch. PiperOrigin-RevId: 319156094 Change-Id: I68b8289071f41fbf4f039000df13f70003ec5b56 --- .../core/kernels/data/experimental/BUILD | 2 ++ .../experimental/map_and_batch_dataset_op.cc | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index dc188071814..8457cfa6145 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -319,6 +319,8 @@ tf_kernel_library( "//tensorflow/core/kernels/data:dataset_utils", "//tensorflow/core/kernels/data:name_utils", "//tensorflow/core/kernels/data:stats_utils", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", ], ) diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc index 0cf85a58985..fdc63bdb913 100644 --- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc @@ -36,6 +36,8 @@ limitations under the License. 
#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/stringprintf.h" #include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" namespace tensorflow { namespace data { @@ -242,6 +244,10 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { batch_results_.pop_front(); cond_var_->notify_all(); } + profiler::TraceMe traceme([&] { + return profiler::TraceMeEncode("MapAndBatchConsume", + {{"element_id", result->id}}); + }); return ProcessResult(ctx, result, out_tensors, end_of_sequence); } @@ -314,7 +320,7 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { // BatchResult encapsulates the output batch, as well as ancillary // metadata required to execute the fused map-and-batch operation. struct BatchResult { - explicit BatchResult(int64 batch_size) { + explicit BatchResult(int64 batch_size, int64 id = -1) : id(id) { end_of_input = false; num_calls = batch_size; num_elements = 0; @@ -348,6 +354,7 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { int64 status_offset TF_GUARDED_BY(mu); // Counts the number of outstanding calls for this batch. int64 num_calls; // access guarded by owner's mutex + int64 id = -1; }; void CallCompleted(const std::shared_ptr& ctx, @@ -370,6 +377,10 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { void CallFunction(std::shared_ptr ctx, const std::shared_ptr& result, int64 offset) TF_LOCKS_EXCLUDED(*mu_) { + profiler::TraceMe traceme([&] { + return profiler::TraceMeEncode("MapAndBatchProduce", + {{"element_id", result->id}}); + }); // Get the next input element. std::vector input_element; bool end_of_input = false; @@ -583,6 +594,8 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { (batch_results_.size() == max_batch_results_ && call_counter_ % dataset()->batch_size_ == 0)); }; + // Counts the total number of batches to use as an id of BatchResult. + int64 num_total_batches = 1; while (true) { { mutex_lock l(*mu_); @@ -607,8 +620,8 @@ class MapAndBatchDatasetOp::Dataset : public DatasetBase { while (!busy()) { if (call_counter_ % dataset()->batch_size_ == 0) { - batch_results_.push_back( - std::make_shared(dataset()->batch_size_)); + batch_results_.push_back(std::make_shared( + dataset()->batch_size_, num_total_batches++)); } int64 offset = call_counter_++ % dataset()->batch_size_; new_calls.emplace_back(batch_results_.back(), offset); From 848de750514ac608d7c16cbf52fc4cdb5daa5b5e Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Tue, 30 Jun 2020 21:24:20 -0700 Subject: [PATCH 1373/1390] Add LIBTFTPU macro qualifier to the definition of the TpuCompilationCacheMetrics. PiperOrigin-RevId: 319157570 Change-Id: I0ee8c7036ec37eaba0cf661e5448c5cbc599d6df --- tensorflow/core/tpu/kernels/tpu_compilation_cache_metrics.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_metrics.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_metrics.cc index ba4e2ccff93..7c8ac937fe2 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_metrics.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_metrics.cc @@ -17,6 +17,9 @@ limitations under the License. namespace tensorflow { namespace tpu { +// TODO(henrytan): remove this once `TpuCompilationCache` migration to OSS is +// completed. 
+#if defined(LIBTFTPU) /* static */ void TpuCompilationCacheMetrics::IncrementCacheLookupCount( bool is_cache_hit, absl::string_view session_name) { @@ -27,6 +30,7 @@ void TpuCompilationCacheMetrics::IncrementCacheLookupCount( void TpuCompilationCacheMetrics::SetCacheEntryCount(int64 count) { // A placeholder for tracking metrics. } +#endif // LIBTFTPU } // namespace tpu } // namespace tensorflow From 2db7754d4c5bcf878dc7f1d4ce85cfc0ec41486b Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Tue, 30 Jun 2020 21:59:43 -0700 Subject: [PATCH 1374/1390] Add microbenchmarks to check the overhead of a small model made up entirely out of 20 small op layers. PiperOrigin-RevId: 319161658 Change-Id: Id715b7b883c23fe5b54d2cafcfe7641724be09d3 --- .../benchmarks/eager_microbenchmarks_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/python/keras/benchmarks/eager_microbenchmarks_test.py b/tensorflow/python/keras/benchmarks/eager_microbenchmarks_test.py index aa33618fbe7..2d58e984d67 100644 --- a/tensorflow/python/keras/benchmarks/eager_microbenchmarks_test.py +++ b/tensorflow/python/keras/benchmarks/eager_microbenchmarks_test.py @@ -99,6 +99,21 @@ class MicroBenchmarksBase(test.Benchmark): self._run(fn, 10000) + def benchmark_op_layer_call_overhead(self): + model_input = tf.keras.Input(shape=(1,)) + model_output = model_input + x = tf.convert_to_tensor([[1.1]]) + + for _ in range(20): + model_output = tf.multiply(model_output, x) + model = tf.keras.Model(inputs=model_input, outputs=model_output) + + def fn(): + model(x) # pylint: disable=not-callable + + fn() + self._run(fn, 100) + def benchmark_model_predict_tensorlike_overhead(self): class OnlyOverheadLayer(tf.keras.layers.Layer): From ac9969a2762d4068edef7e451b53256d2adb2692 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 30 Jun 2020 22:30:04 -0700 Subject: [PATCH 1375/1390] Add a trait verifier to the same operands and results scales trait PiperOrigin-RevId: 319164800 Change-Id: I48ffe33160d5dbbdc549a4d71294979a13841df3 --- .../compiler/mlir/lite/flatbuffer_import.cc | 56 ++++++++++++++++++- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 9 +++ .../mlir/lite/quantization/quantization.td | 23 ++++++-- .../lite/quantization/quantization_driver.cc | 4 +- .../lite/quantization/quantization_traits.h | 18 +++--- .../lite/quantization/quantization_utils.cc | 56 ++++++++++++++++++- .../lite/quantization/quantization_utils.h | 2 +- .../tests/flatbuffer2mlir/quantization.mlir | 16 ++++++ tensorflow/compiler/mlir/lite/tests/ops.mlir | 8 +++ 9 files changed, 169 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 4b3658adbc2..867c7560344 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -724,6 +724,59 @@ StatusOr> PruneSubgraph( return visited; } +// We want to adjust the func op according to some cross ops information. +static StatusOr PostProcessFuncOp(FuncOp func) { + OpBuilder builder(func); + // When a quantized constant is imported, its quantization parameter is set + // to be narrow range. Here revert to be the fully range if the user doesn't + // require narrow range. + func.walk([&](tfl::QConstOp cst) { + Value value = cst.getResult(); + // This is a quantized constant, so it only has one use. 
+ assert(value.hasOneUse() && "QConst has only one use."); + auto& use = *value.getUses().begin(); + Operation* user = use.getOwner(); + if (user->isKnownTerminator()) return; + auto affine_user = llvm::dyn_cast(user); + if (affine_user && + affine_user.GetAffineOperandIndex() == use.getOperandNumber() && + affine_user.RequiredNarrowRangeAffineOperand()) + return; + auto qtype = mlir::quant::UniformQuantizedType::getQuantizedElementType( + value.getType()); + // Only the 8-bit constants are imported with narrow range. + if (!qtype || qtype.getStorageTypeIntegralWidth() != 8) return; + mlir::quant::QuantizedType new_qtype; + if (auto per_axis = + qtype.dyn_cast()) { + new_qtype = mlir::quant::UniformQuantizedPerAxisType::get( + per_axis.getFlags(), per_axis.getStorageType(), + per_axis.getExpressedType(), per_axis.getScales(), + per_axis.getZeroPoints(), per_axis.getQuantizedDimension(), + per_axis.getStorageTypeMin() - 1, per_axis.getStorageTypeMax()); + } else if (auto per_tensor = + qtype.dyn_cast()) { + new_qtype = mlir::quant::UniformQuantizedType::get( + per_tensor.getFlags(), per_tensor.getStorageType(), + per_tensor.getExpressedType(), per_tensor.getScale(), + per_tensor.getZeroPoint(), per_tensor.getStorageTypeMin() - 1, + per_tensor.getStorageTypeMax()); + } else { + return; + } + auto new_output_type = new_qtype.castFromExpressedType( + mlir::quant::UniformQuantizedType::castToExpressedType( + value.getType())); + builder.setInsertionPointAfter(cst); + auto new_op = builder.create( + cst.getLoc(), new_output_type, mlir::TypeAttr::get(new_output_type), + cst.valueAttr()); + cst.replaceAllUsesWith(new_op.getResult()); + cst.erase(); + }); + return func; +} + // Build a FuncOp from a tflite SubGraph // The op_names are a mapping from indexes into the TFLite operators array to // the operator name MLIR expects (tfl.foo_op). The buffers are directly taken @@ -978,7 +1031,7 @@ StatusOr ConvertSubgraph( op_builder.create(base_loc, return_operands); - return func; + return PostProcessFuncOp(func); } // TFLite subgraphs do not necessarily have names, though MLIR functions must @@ -1059,6 +1112,5 @@ OwningModuleRef tflite::FlatBufferToMlir( } module.push_back(func_or_error.ConsumeValueOrDie()); } - return OwningModuleRef(module); } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index d109e425cae..f02e761cab5 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -739,6 +739,15 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let hasFolder = 1; let verifier = [{ return Verify(*this); }]; + + let extraClassDeclaration = [{ + // SameScalesOpInterface: + bool RequiredSameOperandsAndResultsScale(bool sign, int bit_width) { + // uint8 doesn't require same operands and results scales. + bool is_uint8 = !sign && (bit_width == 8); + return !is_uint8; + } + }]; } def TFL_ConstOp : Op { + let description = [{ + Interface for ops potentially have same operands and results scales. + }]; + + let methods = [ + InterfaceMethod< + [{Returns whether same operands and results scales are required.}], + "bool", "RequiredSameOperandsAndResultsScale", + (ins "bool":$sign, "int":$bit_width), [{}], [{return true;}] + >, + ]; + + let verify = [{ + return quant::VerifySameScales($_op); + }]; +} + // Specify this trait if the op has a fixed output value range. class FixedResultScale : NativeOpTrait::Impl")>; -// Specify this trait if the op requires same inputs and outputs quantization -// scales. 
-def SameOperandsAndResultsScale : NativeOpTrait< - "quant::SameOperandsAndResultsScale">; - // Specify this trait if the bias-th input of the op is a bias input, which // needs a scale based on the scales of op1 and op2. class AccumulatorUniformScale : NativeOpTrait< diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 9b63290a10b..c1cf4354c04 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -652,7 +652,7 @@ void QuantizationDriver::PreprocessConstantOps() { } if (biases.find(operand_num) == biases.end() && - !user->hasTrait()) { + !llvm::dyn_cast(user)) { // Needs to scan the content to get the quantiztion parameters if there // are no quantization parameters (FakeQuant ops). weights_.insert(cst); @@ -764,7 +764,7 @@ bool QuantizationDriver::PropagateParams() { continue; } - if (op->hasTrait()) { + if (llvm::isa(op)) { auto params = GetQuantParamsForSameScaleConstraint(op); // The quantization parameters haven't been propagated to any operands // or results. Skip this node for now. diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h index 693f692c61a..d5eea94e848 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h @@ -20,11 +20,18 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project using QuantizedType = mlir::quant::QuantizedType; using UniformQuantizedType = mlir::quant::UniformQuantizedType; namespace mlir { +namespace quant { +// Verify that the op satisfies the same operands and results scales +// constraints. Note that this constraint can only be applied on some +// storage types of the op. +LogicalResult VerifySameScales(Operation* op); +} // namespace quant // This includes the interface class definition. It couldn't be in a namespace // because the table gen doesn't emit the namespace when it is used. @@ -40,17 +47,6 @@ struct QuantizationSpecTraitBase : public TraitBase { static bool IsQuantizable() { return true; } }; -// This class provides the API for TFL ops that requires same input and output -// scale as the quantization results. This is used as a trait like this: -// -// class TransposeOp -// : public Op { -// -template -class SameOperandsAndResultsScale - : public QuantizationSpecTraitBase {}; - // This class provides the API for TFL ops that has a fixed output value range. // This is used as a trait like this: // diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index a0392583f36..dbfbe451d37 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project namespace mlir { @@ -475,7 +476,7 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, // We don't propagate this parameter down if it has multiple operands. // We want to use the result parameter scales instead. - if (user->hasTrait() && + if (llvm::dyn_cast(user) && !PreferResultScale(user)) { for (Value res : user->getResults()) { if (res.hasOneUse()) { @@ -506,7 +507,7 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, all_stats_ops.pop_back(); if (auto def = stats_op.arg().getDefiningOp()) { - if (def->hasTrait()) { + if (llvm::dyn_cast(def)) { for (auto input : def->getOperands()) { if (auto next_stats = llvm::dyn_cast_or_null( input.getDefiningOp())) { @@ -529,5 +530,56 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, // Returns false if the steps finish without errors. return false; } + +LogicalResult VerifySameScales(Operation* op) { + auto same_scale_op = llvm::cast(op); + + llvm::SmallVector collected_quant_params; + for (auto input : op->getOperands()) { + auto quant_params = + UniformQuantizedType::getQuantizedElementType(input.getType()); + // Skip non-quantizable operands. + if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + for (auto output : op->getResults()) { + auto quant_params = + UniformQuantizedType::getQuantizedElementType(output.getType()); + // Skip non-quantizable results. + if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + if (collected_quant_params.size() <= 1) return success(); + for (int i = 1; i < collected_quant_params.size(); i++) { + auto expected_params = collected_quant_params[0]; + auto compared_paras = collected_quant_params[i]; + // Same quantization parameters are always ok. + if (expected_params == compared_paras) continue; + // If the quantization parameters are not the same, as long as it has the + // same storage type and the op interface doesn't require same scale + // constraint for this storage type, it is still ok. + if ((expected_params.isSigned() == compared_paras.isSigned() && + expected_params.getStorageTypeIntegralWidth() == + compared_paras.getStorageTypeIntegralWidth()) && + !same_scale_op.RequiredSameOperandsAndResultsScale( + expected_params.isSigned(), + expected_params.getStorageTypeIntegralWidth())) + continue; + + std::string err_msg = + "quantization parameters violate the same scale constraint: "; + llvm::raw_string_ostream os(err_msg); + collected_quant_params[0].print(os); + os << " vs. 
"; + collected_quant_params[i].print(os); + os.flush(); + return op->emitOpError(err_msg); + } + return success(); +} } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index ad99b1c58d2..35c930281d0 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -387,7 +387,7 @@ struct FoldTrivalRequantizeOp : public OpRewritePattern { Operation* def = pre_quantized.getDefiningOp(); if (!def) return failure(); if (llvm::isa(def) || - def->hasTrait() || + llvm::isa(def) || def->hasTrait()) { return failure(); } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir index 22943b55f66..f5de214a692 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir @@ -1,5 +1,6 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s +// CHECK-LABEL: main func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK: %{{.*}} = "tfl.quantize"(%{{.*}}) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> // The float values here doesn't match exactly because double -> float -> double is lossy @@ -17,3 +18,18 @@ func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { %6 = "tfl.dequantize"(%5) : (tensor<1x1001x!quant.uniform>) -> tensor<1x1001xf32> return %6 : tensor<1x1001xf32> } + +// CHECK-LABEL: quantized_constant +func @quantized_constant(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { + %1 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %cst = "tfl.pseudo_qconst"() {qtype = tensor<1x2x!quant.uniform>, value = dense<-76> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform> + %2 = "tfl.concatenation"(%1, %cst) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> + %3 = "tfl.dequantize"(%2) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + return %3 : tensor<2x2xf32> + +// CHECK-NEXT: %[[Q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-NEXT: %[[CST:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x!quant.uniform>, value = dense<-76> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform> +// CHECK-NEXT: %[[CONCAT:.*]] = "tfl.concatenation"(%[[Q]], %[[CST]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> +// CHECK-NEXT: %[[DQ:.*]] = "tfl.dequantize"(%[[CONCAT]]) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> +// CHECK-NEXT: return %[[DQ]] : tensor<2x2xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index f1742538935..5f434e954c8 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1310,6 +1310,14 @@ func @testConcatInvalidOperandDimSizeComparedToPrevInput(%arg0: tensor<1x2xi32>, // ----- +func @testConcatInvalidScales(%arg0: tensor<*x!quant.uniform>, %arg1: 
tensor<*x!quant.uniform>) -> tensor<*x!quant.uniform> { + // expected-error @+1 {{'tfl.concatenation' op quantization parameters violate the same scale constraint: !quant.uniform vs. !quant.uniform}} + %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 3 : i32, fused_activation_function = "NONE"} : (tensor<*x!quant.uniform>, tensor<*x!quant.uniform>) -> tensor<*x!quant.uniform> + return %0 : tensor<*x!quant.uniform> +} + +// ----- + func @testConcatBenignUnrankedOperand(%arg0: tensor<*xi32>, %arg1: tensor<1x2xi32>) -> tensor<2x2xi32> { %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<*xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> From 3252cc128c6bc1d4855d657d3eb4481b3a0e7879 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 30 Jun 2020 22:49:18 -0700 Subject: [PATCH 1376/1390] Internal change PiperOrigin-RevId: 319166670 Change-Id: Ie098bec86a9eb6da20f51dd1eef9569cf3ca11f1 --- tensorflow/python/keras/saving/hdf5_format.py | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index 6c93e518741..3aa4fe1245a 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -33,7 +33,6 @@ from tensorflow.python.keras.saving.saved_model import json_utils from tensorflow.python.keras.utils import conv_utils from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite from tensorflow.python.ops import variables as variables_module -from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import serialization from tensorflow.python.util.lazy_loader import LazyLoader @@ -55,24 +54,7 @@ sequential_lib = LazyLoader( # pylint:enable=g-inconsistent-quotes -# create lock file -def create_lockfile(filepath): - lockfile_path = filepath + '.lock' - - f = gfile.GFile(lockfile_path, 'w') - f.write(str(os.getpid())) - f.close() - - return lockfile_path - - -def check_lockfile(filepath): - lockfile_path = filepath + '.lock' - return gfile.Exists(lockfile_path) - - -def save_model_to_hdf5(model, filepath, overwrite=True, \ - lockfile=True, include_optimizer=True): +def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): """Saves a model to a HDF5 file. The saved model contains: @@ -92,9 +74,6 @@ def save_model_to_hdf5(model, filepath, overwrite=True, \ overwrite: Whether we should overwrite any existing model at the target location, or instead ask the user with a manual prompt. - lockfile: Create a lockfile before saving the model - file to prevent from reading, while saving - is not done. include_optimizer: If True, save optimizer's state together. Raises: @@ -121,10 +100,6 @@ def save_model_to_hdf5(model, filepath, overwrite=True, \ if not proceed: return - # create lock file - if lockfile: - lockfile_path = create_lockfile(filepath) - f = h5py.File(filepath, mode='w') opened_new_file = True else: @@ -155,10 +130,6 @@ def save_model_to_hdf5(model, filepath, overwrite=True, \ if opened_new_file: f.close() - # remove lock file - if lockfile: - gfile.Remove(lockfile_path) - def load_model_from_hdf5(filepath, custom_objects=None, compile=True): # pylint: disable=redefined-builtin """Loads a model saved via `save_model_to_hdf5`. 
@@ -193,10 +164,6 @@ def load_model_from_hdf5(filepath, custom_objects=None, compile=True): # pylint opened_new_file = not isinstance(filepath, h5py.File) if opened_new_file: - # check if lock file exist - if check_lockfile(filepath): - raise ValueError('Cannot read from file at this time.') - f = h5py.File(filepath, mode='r') else: f = filepath From 6d0cf63bb2cce6986ea6940d732f2942b53dfdeb Mon Sep 17 00:00:00 2001 From: Jay Shi Date: Tue, 30 Jun 2020 23:06:13 -0700 Subject: [PATCH 1377/1390] [tf.data] Avoid calling `CollectTunableParameters` function repeatedly. PiperOrigin-RevId: 319168243 Change-Id: I8f3f0389820250bd0f6f9727f4ad10b6cf5c8ea9 --- tensorflow/core/framework/model.cc | 8 ++++---- tensorflow/core/framework/model.h | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 198d2f6574c..6dcaf8ecac2 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -1320,14 +1320,14 @@ Model::CollectTunableParameters(std::shared_ptr node) { } absl::flat_hash_map> -Model::CollectEssentialParallelism(std::shared_ptr node) { +Model::CollectEssentialParallelism( + std::shared_ptr node, + const absl::flat_hash_map>& parameters) { // Parallelism parameter is considered to be essential if the corresponding // transformations's processing time is greater than essential rate times the // average transformation self processing time. constexpr double kEssentialRate = 0.3L; - absl::flat_hash_map> parameters; - node->CollectTunableParameters(¶meters); absl::flat_hash_map processing_times; double processing_time = node->TotalProcessingTime(&processing_times); double uniform_share = @@ -1350,7 +1350,7 @@ void Model::OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget) { } VLOG(2) << "Starting optimization of tunable parameters with GradientDescent"; auto parameters = CollectTunableParameters(snapshot); - auto essential_parameters = CollectEssentialParallelism(snapshot); + auto essential_parameters = CollectEssentialParallelism(snapshot, parameters); // We add the number of model's buffered bytes because it is excluded from the // memory budget, but it is included in the maximum number of buffered bytes. ram_budget += TotalBufferedBytes(snapshot); diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 82814ed1353..e8d78756192 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -628,7 +628,10 @@ class Model { // relative to other transformations. The collected parameters are returned // as a mapping from a (unique) node name to a parallelism parameter. absl::flat_hash_map> - CollectEssentialParallelism(std::shared_ptr node); + CollectEssentialParallelism( + std::shared_ptr node, + const absl::flat_hash_map>& + parameters); // This optimization algorithm starts by setting all tunable parallelism // parameters to the minimum value. It then repeatedly identifies the From c4ae3c7a0430adb6970fb440ecd9fbe5da9c0461 Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Tue, 30 Jun 2020 23:52:28 -0700 Subject: [PATCH 1378/1390] Add an external delegate which is initialized from a shared library This CL introduces the following API to manage an external delegate. - TfLiteExternalDelegateCreate() - TfLiteExternalDelegateOptionsDefault() - TfLiteExternalDelegateDelete() Also refactored the ExternalDelegateProvider to adopt this change. 
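A minimal usage sketch of the new API, for reference (the wrapper function name, the library path, and the "device"/"gpu" option pair below are placeholders; real keys and values are whatever the loaded plugin understands, and the interpreter wiring is elided):

    #include "tensorflow/lite/delegates/external/external_delegate.h"

    // Sketch: create a delegate from a vendor shared library, configure it
    // with opaque key/value options, and release it when done. The library
    // name and the option values are hypothetical.
    void RunWithExternalDelegate(void) {
      TfLiteExternalDelegateOptions options =
          TfLiteExternalDelegateOptionsDefault("libvendor_delegate.so");
      options.insert(&options, "device", "gpu");

      TfLiteDelegate* delegate = TfLiteExternalDelegateCreate(&options);
      // ... hand `delegate` to the interpreter (e.g. via
      // ModifyGraphWithDelegate) and run inference ...
      TfLiteExternalDelegateDelete(delegate);
    }

Options are passed through as opaque key/value strings, so a vendor delegate can be configured without TFLite needing to know its settings ahead of time.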
PiperOrigin-RevId: 319171849 Change-Id: Ibccbb3a70aa91dfcf96745e13342cdbb2e77d832 --- tensorflow/lite/delegates/external/BUILD | 35 +++ .../delegates/external/external_delegate.cc | 243 ++++++++++++++++++ .../delegates/external/external_delegate.h | 53 ++++ tensorflow/lite/tools/delegates/BUILD | 1 + .../delegates/external_delegate_provider.cc | 103 ++------ 5 files changed, 352 insertions(+), 83 deletions(-) create mode 100644 tensorflow/lite/delegates/external/BUILD create mode 100644 tensorflow/lite/delegates/external/external_delegate.cc create mode 100644 tensorflow/lite/delegates/external/external_delegate.h diff --git a/tensorflow/lite/delegates/external/BUILD b/tensorflow/lite/delegates/external/BUILD new file mode 100644 index 00000000000..ca23f95122f --- /dev/null +++ b/tensorflow/lite/delegates/external/BUILD @@ -0,0 +1,35 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "external_delegate", + srcs = ["external_delegate.cc"], + hdrs = ["external_delegate.h"], + deps = [ + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite/c:common", + ], +) + +exports_files([ + "external_delegate.h", +]) diff --git a/tensorflow/lite/delegates/external/external_delegate.cc b/tensorflow/lite/delegates/external/external_delegate.cc new file mode 100644 index 00000000000..5df158942f2 --- /dev/null +++ b/tensorflow/lite/delegates/external/external_delegate.cc @@ -0,0 +1,243 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/delegates/external/external_delegate.h" + +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +#include "tensorflow/lite/minimal_logging.h" + +namespace tflite { +namespace { + +// Library Support construct to handle dynamic library operations +#if defined(_WIN32) +struct LibSupport { + static void* Load(const char* lib) { return LoadLibrary(lib); } + + static void* GetSymbol(void* handle, const char* symbol) { + return (void*)GetProcAddress((HMODULE)handle, symbol); + } + + static int UnLoad(void* handle) { return FreeLibrary((HMODULE)handle); } +}; +#else +struct LibSupport { + static void* Load(const char* lib) { + return dlopen(lib, RTLD_LAZY | RTLD_LOCAL); + } + + static void* GetSymbol(void* handle, const char* symbol) { + return dlsym(handle, symbol); + } + + static int UnLoad(void* handle) { return dlclose(handle); } +}; +#endif + +// External delegate library construct +struct ExternalLib { + using CreateDelegatePtr = std::add_pointer::type; + using DestroyDelegatePtr = std::add_pointer::type; + + // Open a given delegate library and load the create/destroy symbols + bool load(const std::string library) { + void* handle = LibSupport::Load(library.c_str()); + if (handle == nullptr) { + TFLITE_LOG(TFLITE_LOG_INFO, "Unable to load external delegate from : %s", + library.c_str()); + } else { + create = reinterpret_cast( + LibSupport::GetSymbol(handle, "tflite_plugin_create_delegate")); + destroy = reinterpret_cast( + LibSupport::GetSymbol(handle, "tflite_plugin_destroy_delegate")); + return create && destroy; + } + return false; + } + + CreateDelegatePtr create{nullptr}; + DestroyDelegatePtr destroy{nullptr}; +}; + +// An ExternalDelegateWrapper is responsibile to manage a TFLite delegate +// initialized from a shared library. It creates a delegate from the given +// option and storages it to external_delegate_ member variable. On the +// destruction, it conducts necessary clean up process. +class ExternalDelegateWrapper { + public: + explicit ExternalDelegateWrapper( + const TfLiteExternalDelegateOptions* options); + ~ExternalDelegateWrapper(); + + // Return a TfLiteDelegate which is created from + // tflite_plugin_create_delegate() of an external delegate logic. + TfLiteDelegate* tflite_external_delegate() { return external_delegate_; } + + // Return a TfLiteDelegate which is convertibile to this class. + TfLiteDelegate* tflite_wrapper_delegate() { return &wrapper_delegate_; } + + private: + ExternalLib external_lib_; + + // external delegate instance owned by external delegate logic. + // It's created by "tflite_plugin_destroy_delegate()" function in the external + // delegate logic And it should be released by + // "tflite_plugin_destroy_delegate()" function. + TfLiteDelegate* external_delegate_; + + // TfLiteDelegate representation of this ExternalDelegateWrapper object. + TfLiteDelegate wrapper_delegate_; +}; + +// Converts the given TfLiteDelegate to an ExternalDelegateWrapper instance. +inline ExternalDelegateWrapper* GetExternalDelegateWrapper( + TfLiteDelegate* delegate) { + return reinterpret_cast(delegate->data_); +} + +// Relay Prepare() call to the associated external TfLiteDelegate object. 
+TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { + auto external_delegate_wrapper = GetExternalDelegateWrapper(delegate); + TfLiteDelegate* external_delegate = + external_delegate_wrapper->tflite_external_delegate(); + return external_delegate->Prepare(context, external_delegate); +} + +// Relay CopyFromBufferHandle() call to the associated external TfLiteDelegate +// object. +TfLiteStatus DelegateCopyFromBufferHandle(TfLiteContext* context, + struct TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) { + auto external_delegate_wrapper = GetExternalDelegateWrapper(delegate); + TfLiteDelegate* external_delegate = + external_delegate_wrapper->tflite_external_delegate(); + return external_delegate->CopyFromBufferHandle(context, delegate, + buffer_handle, tensor); +} + +// Relay CopyToBufferHandle() call to the associated external TfLiteDelegate +// object. +TfLiteStatus DelegateCopyToBufferHandle(TfLiteContext* context, + struct TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) { + auto external_delegate_wrapper = GetExternalDelegateWrapper(delegate); + TfLiteDelegate* external_delegate = + external_delegate_wrapper->tflite_external_delegate(); + return external_delegate->CopyToBufferHandle(context, delegate, buffer_handle, + tensor); +} + +// Relay FreeBufferHandle() call to the associated external TfLiteDelegate +// object. +void DelegateFreeBufferHandle(TfLiteContext* context, + struct TfLiteDelegate* delegate, + TfLiteBufferHandle* handle) { + auto external_delegate_wrapper = GetExternalDelegateWrapper(delegate); + TfLiteDelegate* external_delegate = + external_delegate_wrapper->tflite_external_delegate(); + return external_delegate->FreeBufferHandle(context, delegate, handle); +} + +ExternalDelegateWrapper::ExternalDelegateWrapper( + const TfLiteExternalDelegateOptions* options) { + external_delegate_ = nullptr; + if (external_lib_.load(options->lib_path)) { + std::vector ckeys, cvalues; + for (int i = 0; i < options->count; i++) { + ckeys.push_back(options->keys[i]); + cvalues.push_back(options->values[i]); + } + + external_delegate_ = external_lib_.create(ckeys.data(), cvalues.data(), + ckeys.size(), nullptr); + if (external_delegate_) { + wrapper_delegate_ = { + .data_ = reinterpret_cast(this), + .Prepare = DelegatePrepare, + .CopyFromBufferHandle = nullptr, + .CopyToBufferHandle = nullptr, + .FreeBufferHandle = nullptr, + .flags = external_delegate_->flags, + }; + if (external_delegate_->CopyFromBufferHandle) { + wrapper_delegate_.CopyFromBufferHandle = DelegateCopyFromBufferHandle; + } + if (external_delegate_->CopyToBufferHandle) { + wrapper_delegate_.CopyToBufferHandle = DelegateCopyToBufferHandle; + } + if (external_delegate_->FreeBufferHandle) { + wrapper_delegate_.FreeBufferHandle = DelegateFreeBufferHandle; + } + } + } +} + +ExternalDelegateWrapper::~ExternalDelegateWrapper() { + if (external_delegate_ != nullptr) { + external_lib_.destroy(external_delegate_); + } +} + +} // namespace +} // namespace tflite + +// TfLiteExternalDelegateOptionsInsert adds key/value to the given +// TfLiteExternalDelegateOptions instance. 
+TfLiteStatus TfLiteExternalDelegateOptionsInsert( + TfLiteExternalDelegateOptions* options, const char* key, + const char* value) { + if (options->count >= kMaxOptions) { + return kTfLiteError; + } + options->keys[options->count] = key; + options->values[options->count] = value; + options->count++; + return kTfLiteOk; +} + +TfLiteExternalDelegateOptions TfLiteExternalDelegateOptionsDefault( + const char* lib_path) { + TfLiteExternalDelegateOptions options = { + .lib_path = lib_path, + .count = 0, + .insert = TfLiteExternalDelegateOptionsInsert, + }; + return options; +} + +TfLiteDelegate* TfLiteExternalDelegateCreate( + const TfLiteExternalDelegateOptions* options) { + auto* external_delegate_wrapper = + new tflite::ExternalDelegateWrapper(options); + if (external_delegate_wrapper) { + return external_delegate_wrapper->tflite_wrapper_delegate(); + } + return nullptr; +} + +void TfLiteExternalDelegateDelete(TfLiteDelegate* delegate) { + delete tflite::GetExternalDelegateWrapper(delegate); +} diff --git a/tensorflow/lite/delegates/external/external_delegate.h b/tensorflow/lite/delegates/external/external_delegate.h new file mode 100644 index 00000000000..774c0f07db3 --- /dev/null +++ b/tensorflow/lite/delegates/external/external_delegate.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_EXTERNAL_EXTERNAL_DELEGATE_H_ +#define TENSORFLOW_LITE_DELEGATES_EXTERNAL_EXTERNAL_DELEGATE_H_ + +#include "tensorflow/lite/c/common.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// TfLiteExternalDelegateOptions is a structure of key/value options to create +// an external delegate. +const int kMaxOptions = 256; +typedef struct TfLiteExternalDelegateOptions { + const char* lib_path; + int count; + const char* keys[kMaxOptions]; + const char* values[kMaxOptions]; + TfLiteStatus (*insert)(TfLiteExternalDelegateOptions* options, + const char* key, const char* value); +} TfLiteExternalDelegateOptions; + +// Populates TfLiteExternalDelegateOptions with the given shared library path. +TfLiteExternalDelegateOptions TfLiteExternalDelegateOptionsDefault( + const char* lib_path); + +// Creates a new delegate instance that need to be destroyed with +// `TfLiteExternalDelegateDelete` when delegate is no longer used by TFLite. +TfLiteDelegate* TfLiteExternalDelegateCreate( + const TfLiteExternalDelegateOptions* options); + +// Destroys a delegate created with `TfLiteExternalDelegateCreate` call. 
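// [Editorial aside - not part of this patch] A rough end-to-end sketch of how
// a client drives this C API (TfLiteExternalDelegateDelete is declared just
// below). The library path and the option key/value are placeholders; how the
// delegate is attached to an interpreter is out of scope here.

void RunWithExternalDelegateSketch() {
  TfLiteExternalDelegateOptions options =
      TfLiteExternalDelegateOptionsDefault("/path/to/libsample_delegate.so");
  // Key/value pairs are forwarded to the plugin's
  // tflite_plugin_create_delegate() entry point.
  options.insert(&options, "some_key", "some_value");

  TfLiteDelegate* delegate = TfLiteExternalDelegateCreate(&options);
  if (delegate != nullptr) {
    // ... hand `delegate` to the interpreter (e.g. ModifyGraphWithDelegate),
    // run inference, and only then release it:
    TfLiteExternalDelegateDelete(delegate);
  }
}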
+void TfLiteExternalDelegateDelete(TfLiteDelegate* delegate); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // TENSORFLOW_LITE_DELEGATES_EXTERNAL_EXTERNAL_DELEGATE_H_ diff --git a/tensorflow/lite/tools/delegates/BUILD b/tensorflow/lite/tools/delegates/BUILD index 93b918d37b1..def41cc6c69 100644 --- a/tensorflow/lite/tools/delegates/BUILD +++ b/tensorflow/lite/tools/delegates/BUILD @@ -151,6 +151,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":delegate_provider_hdr", + "//tensorflow/lite/delegates/external:external_delegate", ], alwayslink = 1, ) diff --git a/tensorflow/lite/tools/delegates/external_delegate_provider.cc b/tensorflow/lite/tools/delegates/external_delegate_provider.cc index 193860820b1..4f7dfeb8646 100644 --- a/tensorflow/lite/tools/delegates/external_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/external_delegate_provider.cc @@ -12,45 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/tools/delegates/delegate_provider.h" - -#if defined(_WIN32) -#include -#else -#include -#endif #include -#include #include +#include "tensorflow/lite/delegates/external/external_delegate.h" +#include "tensorflow/lite/tools/delegates/delegate_provider.h" + namespace tflite { namespace tools { -namespace { -// Library Support construct to handle dynamic library operations -#if defined(_WIN32) -struct LibSupport { - static void* Load(const char* lib) { return LoadLibrary(lib); } - - static void* GetSymbol(void* handle, const char* symbol) { - return (void*)GetProcAddress((HMODULE)handle, symbol); - } - - static int UnLoad(void* handle) { return FreeLibrary((HMODULE)handle); } -}; -#else -struct LibSupport { - static void* Load(const char* lib) { - return dlopen(lib, RTLD_LAZY | RTLD_LOCAL); - } - - static void* GetSymbol(void* handle, const char* symbol) { - return dlsym(handle, symbol); - } - - static int UnLoad(void* handle) { return dlclose(handle); } -}; -#endif // Split a given string to a vector of string using a delimiter character std::vector SplitString(const std::string& str, char delimiter) { @@ -63,32 +33,6 @@ std::vector SplitString(const std::string& str, char delimiter) { return tokens; } -// External delegate library construct -struct ExternalLib { - using CreateDelegatePtr = std::add_pointer::type; - using DestroyDelegatePtr = std::add_pointer::type; - - // Open a given delegate library and load the create/destroy symbols - bool load(const std::string library) { - void* handle = LibSupport::Load(library.c_str()); - if (handle == nullptr) { - TFLITE_LOG(INFO) << "Unable to load external delegate from : " << library; - } else { - create = reinterpret_cast( - LibSupport::GetSymbol(handle, "tflite_plugin_create_delegate")); - destroy = reinterpret_cast( - LibSupport::GetSymbol(handle, "tflite_plugin_destroy_delegate")); - return create && destroy; - } - return false; - } - - CreateDelegatePtr create{nullptr}; - DestroyDelegatePtr destroy{nullptr}; -}; -} // namespace // External delegate provider used to dynamically load delegate libraries // Note: Assumes the lifetime of the provider exceeds the usage scope of @@ -136,35 +80,28 @@ TfLiteDelegatePtr ExternalDelegateProvider::CreateTfLiteDelegate( TfLiteDelegatePtr delegate(nullptr, [](TfLiteDelegate*) {}); std::string lib_path = 
params.Get("external_delegate_path"); if (!lib_path.empty()) { - ExternalLib delegate_lib; - if (delegate_lib.load(lib_path)) { - // Parse delegate options - const std::vector options = SplitString( - params.Get("external_delegate_options"), ';'); - std::vector keys, values; - for (const auto& option : options) { - auto key_value = SplitString(option, ':'); - if (key_value.size() == 2) { - values.push_back(std::move(key_value[1])); - keys.push_back(std::move(key_value[0])); - } - } + auto delegate_options = + TfLiteExternalDelegateOptionsDefault(lib_path.c_str()); - const size_t num_options = keys.size(); - std::vector ckeys, cvalues; - for (int i = 0; i < num_options; ++i) { - ckeys.push_back(keys[i].c_str()); - cvalues.push_back(values[i].c_str()); + // Parse delegate options + const std::vector options = + SplitString(params.Get("external_delegate_options"), ';'); + std::vector keys, values; + for (const auto& option : options) { + auto key_value = SplitString(option, ':'); + if (key_value.size() == 2) { + delegate_options.insert(&delegate_options, key_value[0].c_str(), + key_value[1].c_str()); } - - // Create delegate - delegate = - TfLiteDelegatePtr(delegate_lib.create(ckeys.data(), cvalues.data(), - num_options, nullptr), - delegate_lib.destroy); } + + auto external_delegate = TfLiteExternalDelegateCreate(&delegate_options); + return TfLiteDelegatePtr(external_delegate, [](TfLiteDelegate* delegate) { + TfLiteExternalDelegateDelete(delegate); + }); } return delegate; } + } // namespace tools } // namespace tflite From d8c49c2fdeed114470e2773967dddb932e9a9fb3 Mon Sep 17 00:00:00 2001 From: Xinyi Wang Date: Wed, 1 Jul 2020 00:10:24 -0700 Subject: [PATCH 1379/1390] Add multi_worker_mirrored_strategy to moving_averages_test PiperOrigin-RevId: 319173462 Change-Id: I940527e1c205599324c2bfb2fcdef9c24076d1d9 --- .../python/distribute/moving_averages_test.py | 110 +++++++++--------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/tensorflow/python/distribute/moving_averages_test.py b/tensorflow/python/distribute/moving_averages_test.py index 2d0ab80f1f2..577a6c1168f 100644 --- a/tensorflow/python/distribute/moving_averages_test.py +++ b/tensorflow/python/distribute/moving_averages_test.py @@ -20,14 +20,13 @@ from __future__ import print_function from absl.testing import parameterized +from tensorflow.python.distribute import collective_all_reduce_strategy from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import tpu_strategy from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from tensorflow.python.training import moving_averages @@ -38,6 +37,10 @@ all_distributions = [ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.central_storage_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.multi_worker_mirrored_2x1_cpu, + strategy_combinations.multi_worker_mirrored_2x1_gpu, + strategy_combinations.multi_worker_mirrored_2x2_gpu, + strategy_combinations.multi_worker_mirrored_4x1_cpu, ] all_combinations = combinations.combine( @@ -62,11 +65,11 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): var, val, decay, zero_debias=False) return var, assign - with distribution.scope(), 
self.cached_session() as sess: + with distribution.scope(): var, assign = distribution.extended.call_for_each_replica(replica_fn) - variables.global_variables_initializer().run() - self.assertAllClose([10.0, 11.0], var) - sess.run(distribution.experimental_local_results(assign)) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([10.0, 11.0], self.evaluate(var)) + self.evaluate(distribution.experimental_local_results(assign)) # Mean of val across calls to replica_fn(). average_val = [1.0 + 0.5 * (replica_id[0] - 1), 2.0 - 0.5 * (replica_id[0] - 1)] @@ -74,7 +77,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): self.assertAllClose( [10.0 * 0.25 + average_val[0] * val_weight, 11.0 * 0.25 + average_val[1] * val_weight], - var.eval()) + self.evaluate(var)) @combinations.generate(all_combinations) def testReplicaMode(self, distribution): @@ -88,19 +91,19 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): assign = moving_averages.assign_moving_average(var, val, decay) return var, assign.op - with distribution.scope(), self.cached_session() as sess: + with distribution.scope(): var, assign_op = distribution.extended.call_for_each_replica(replica_fn) - variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var) - sess.run(distribution.experimental_local_results(assign_op)) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([0.0, 0.0], self.evaluate(var)) + self.evaluate(distribution.experimental_local_results(assign_op)) # Mean of val across calls to replica_fn(). average_val = [1.0 + 0.5 * (replica_id[0] - 1), 2.0 - 0.5 * (replica_id[0] - 1)] - self.assertAllClose(average_val, var) + self.assertAllClose(average_val, self.evaluate(var)) @combinations.generate(all_combinations) def testCrossDeviceWithoutZeroDebias(self, distribution): - with distribution.scope(), self.cached_session() as sess: + with distribution.scope(): var = variables.Variable([10.0, 11.0]) val = constant_op.constant([1.0, 2.0]) decay = 0.25 @@ -109,45 +112,38 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): assign = moving_averages.assign_moving_average( var, val, decay, zero_debias=False) - variables.global_variables_initializer().run() - self.assertAllClose([10.0, 11.0], var) - sess.run(assign) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([10.0, 11.0], self.evaluate(var)) + self.evaluate(assign) average_val = [1.0, 2.0] val_weight = 1.0 - 0.25 self.assertAllClose( [10.0 * 0.25 + average_val[0] * val_weight, 11.0 * 0.25 + average_val[1] * val_weight], - var.eval()) + self.evaluate(var)) # Also try assign.op. - sess.run(assign.op) + self.evaluate(assign.op) orig_weight = 0.25 * 0.25 val_weight = 1.0 - orig_weight self.assertAllClose( [10.0 * orig_weight + average_val[0] * val_weight, 11.0 * orig_weight + average_val[1] * val_weight], - var.eval()) + self.evaluate(var)) @combinations.generate(all_combinations) def testCrossDevice(self, distribution): - with distribution.scope(), self.cached_session() as sess: + with distribution.scope(): var = variables.Variable([0.0, 0.0]) - val = array_ops.placeholder(dtypes.float32) + val = variables.Variable([1.0, 2.0]) decay = 0.25 # NOTE(josh11b): We currently generate an error if val is a PerReplica # value. 
assign = moving_averages.assign_moving_average(var, val, decay) - variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var) - sess.run(assign, feed_dict={val: [1.0, 2.0]}) - self.assertAllClose([1.0, 2.0], var) - - # Also try assign.op. - sess.run(assign.op, feed_dict={val: [10.0, 0.0]}) - self.assertAllClose( - [(1.0 * 0.25 + 10.0) / (1.0 * 0.25 + 1.0), - (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)], - var.eval()) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([0.0, 0.0], self.evaluate(var)) + self.evaluate(assign) + self.assertAllClose([1.0, 2.0], self.evaluate(var)) @combinations.generate(all_combinations_eager) def testUpdateContext(self, distribution, use_function): @@ -179,14 +175,14 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): var, val, decay, zero_debias=False) return var, assign - with distribution.scope(), self.cached_session() as sess: + with distribution.scope(): var, assign = distribution.extended.call_for_each_replica(replica_fn) - variables.global_variables_initializer().run() - self.assertAllClose([10.0, 11.0], var) - sess.run(distribution.experimental_local_results(assign)) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([10.0, 11.0], self.evaluate(var)) + self.evaluate(distribution.experimental_local_results(assign)) self.assertAllClose( [10 * 0.25 + 1. * (1 - 0.25), 11 * 0.25 + 2. * (1 - 0.25)], - var.eval()) + self.evaluate(var)) class ExponentialMovingAverageTest(test.TestCase, parameterized.TestCase): @@ -196,6 +192,10 @@ class ExponentialMovingAverageTest(test.TestCase, parameterized.TestCase): if not use_function and isinstance( distribution, (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): self.skipTest("TPUStrategy doesn't support pure eager execution.") + if isinstance(distribution, + collective_all_reduce_strategy.CollectiveAllReduceStrategy): + self.skipTest("b/160194267: Cannot do variable.assign([0.5]) in replica " + "context with MultiWorkerMirroredStrategy.") with distribution.scope(): w = variables.Variable([1.0], name="w", @@ -255,33 +255,35 @@ class ExponentialMovingAverageTest(test.TestCase, parameterized.TestCase): (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): self.skipTest("b/139550827: Cannot do variable.assign in replica context " "of TPUStrategy") + if isinstance(distribution, + collective_all_reduce_strategy.CollectiveAllReduceStrategy): + self.skipTest("b/160194267: Cannot do variable.assign([0.5]) in replica " + "context with MultiWorkerMirroredStrategy.") with distribution.scope(): w_assign, w_apply, ema_w = distribution.run( self._ema_replica_fn_graph) self.assertEqual(ema_w.name, "w/ExponentialMovingAverage:0") - with self.cached_session(): - variables.global_variables_initializer().run() - self.evaluate(distribution.experimental_local_results(w_apply)) - self.evaluate(distribution.experimental_local_results(w_assign)) - self.evaluate(distribution.experimental_local_results(w_apply)) - self.assertAllClose( - self.evaluate(distribution.experimental_local_results(ema_w))[0], - [0.89999998]) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(distribution.experimental_local_results(w_apply)) + self.evaluate(distribution.experimental_local_results(w_assign)) + self.evaluate(distribution.experimental_local_results(w_apply)) + self.assertAllClose( + self.evaluate(distribution.experimental_local_results(ema_w))[0], + [0.89999998]) @combinations.generate(all_combinations) def 
testCrossReplicaContextGraph(self, distribution): with distribution.scope(): w_assign, w_apply, ema_w = self._ema_replica_fn_graph() self.assertEqual(ema_w.name, "w/ExponentialMovingAverage:0") - with self.cached_session(): - variables.global_variables_initializer().run() - self.evaluate(distribution.experimental_local_results(w_apply)) - self.evaluate(distribution.experimental_local_results(w_assign)) - self.evaluate(distribution.experimental_local_results(w_apply)) - self.assertAllClose( - self.evaluate(distribution.experimental_local_results(ema_w))[0], - [0.89999998]) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(distribution.experimental_local_results(w_apply)) + self.evaluate(distribution.experimental_local_results(w_assign)) + self.evaluate(distribution.experimental_local_results(w_apply)) + self.assertAllClose( + self.evaluate(distribution.experimental_local_results(ema_w))[0], + [0.89999998]) if __name__ == "__main__": - test.main() + combinations.main() From cd0a2e6c1fbbff32f9b56e28634d831ccfbcb11d Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 1 Jul 2020 01:20:38 -0700 Subject: [PATCH 1380/1390] Enable dispatching for the publicly exposed `tf.convert_to_tensor` api. This CL avoids enabling dispatch for the internal `convert_to_tensor` that api symbols use, so that the current version of dispatching continues to work. This change is useful because user code that expects tensor-like objects often starts of with `tf.convert_to_tensor`, but those very same objects may be relying on dispatching to function like tensors (e.g. KerasTensors). Actually registering a tensor conversion would break the dispatching logic they utilize. This will probably not be needed in future designs for dispatching PiperOrigin-RevId: 319181371 Change-Id: I7788ef9c63bcacafd17b353808f58df921d1ecc8 --- tensorflow/python/framework/ops.py | 34 ++++++++++++++++++++----- tensorflow/python/util/dispatch_test.py | 23 +++++++++++++++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 000e3bb87a0..8633665fe29 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -67,6 +67,7 @@ from tensorflow.python.types import internal from tensorflow.python.util import compat from tensorflow.python.util import decorator_utils from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import function_utils from tensorflow.python.util import lock_util from tensorflow.python.util import memory @@ -1257,11 +1258,13 @@ EagerTensor = pywrap_tfe.TFE_Py_InitEagerTensor(_EagerTensorBase) @tf_export(v1=["convert_to_tensor"]) -def convert_to_tensor_v1(value, - dtype=None, - name=None, - preferred_dtype=None, - dtype_hint=None): +@dispatch.add_dispatch_support +def convert_to_tensor_v1_with_dispatch( + value, + dtype=None, + name=None, + preferred_dtype=None, + dtype_hint=None): """Converts the given `value` to a `Tensor`. This function converts Python objects of various types to `Tensor` @@ -1311,13 +1314,26 @@ def convert_to_tensor_v1(value, RuntimeError: If a registered conversion function returns an invalid value. ValueError: If the `value` is a tensor not of given `dtype` in graph mode. 
""" + return convert_to_tensor_v1(value, dtype=dtype, name=name, + preferred_dtype=preferred_dtype, + dtype_hint=dtype_hint) + + +def convert_to_tensor_v1(value, + dtype=None, + name=None, + preferred_dtype=None, + dtype_hint=None): + """Converts the given `value` to a `Tensor` (with the TF1 API).""" preferred_dtype = deprecation.deprecated_argument_lookup( "dtype_hint", dtype_hint, "preferred_dtype", preferred_dtype) return convert_to_tensor_v2(value, dtype, preferred_dtype, name) @tf_export("convert_to_tensor", v1=[]) -def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): +@dispatch.add_dispatch_support +def convert_to_tensor_v2_with_dispatch( + value, dtype=None, dtype_hint=None, name=None): """Converts the given `value` to a `Tensor`. This function converts Python objects of various types to `Tensor` @@ -1378,6 +1394,12 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): RuntimeError: If a registered conversion function returns an invalid value. ValueError: If the `value` is a tensor not of given `dtype` in graph mode. """ + return convert_to_tensor_v2( + value, dtype=dtype, dtype_hint=dtype_hint, name=name) + + +def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): + """Converts the given `value` to a `Tensor`.""" return convert_to_tensor( value=value, dtype=dtype, diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index 49026a754e4..f6074ac415d 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -27,6 +27,7 @@ from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging from tensorflow.python.util import deprecation from tensorflow.python.util import dispatch +from tensorflow.python.util.tf_export import get_canonical_name_for_symbol from tensorflow.python.util.tf_export import tf_export @@ -76,7 +77,8 @@ class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): return self.NOT_SUPPORTED - return TensorTracer(op.__name__, args, kwargs) + symbol_name = get_canonical_name_for_symbol(op) + return TensorTracer(symbol_name, args, kwargs) def is_tensor_tracer_arg(self, value): if isinstance(value, TensorTracer): @@ -183,12 +185,29 @@ class DispatchTest(test_util.TensorFlowTestCase): y = TensorTracer("y") trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) self.assertEqual( - str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") + str(trace), + "math.reduce_sum(math.add(name=None, x=math.abs(x), y=y), axis=3)") finally: # Clean up. dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers + def testGlobalDispatcherConvertToTensor(self): + original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS + try: + TensorTracerOpDispatcher().register() + + x = TensorTracer("x") + y = TensorTracer("y") + trace = math_ops.add(math_ops.abs( + ops.convert_to_tensor_v2_with_dispatch(x)), y) + self.assertEqual( + str(trace), + "math.add(name=None, x=math.abs(convert_to_tensor(x)), y=y)") + + finally: + # Clean up. + dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers if __name__ == "__main__": googletest.main() From a5d8f188bcc4d5ffe9d5bfcc0fe1de411fb214d9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 1 Jul 2020 02:01:38 -0700 Subject: [PATCH 1381/1390] Update GraphDef version to 449. 
PiperOrigin-RevId: 319185220 Change-Id: I76ff23cadc4e0d9435c63a69929a1d8d993146a6 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 9d183076374..308b29f9ed5 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 448 // Updated: 2020/6/30 +#define TF_GRAPH_DEF_VERSION 449 // Updated: 2020/7/1 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 6fc54250a5f3142cb1108dd53d3b7f291ba2e4f0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 1 Jul 2020 02:01:39 -0700 Subject: [PATCH 1382/1390] compat: Update forward compatibility horizon to 2020-07-01 PiperOrigin-RevId: 319185221 Change-Id: I3a9b75bf05016c3abcead58de570d0600ba6a4cf --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index f1935b80ed5..80a087b0cfe 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 30) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 7, 1) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From fc49cbb2adf0b69f843f7bf904978648bffbe268 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Wed, 1 Jul 2020 02:26:51 -0700 Subject: [PATCH 1383/1390] Fix bug on referencing invalid reference to a tensor. Adding tensors to the TfLiteContext will give TfLiteContext.tensors a new address, so the old references should be updated with new one. PiperOrigin-RevId: 319188033 Change-Id: I1538d6260236f7cf5a710621d6af330a8639f443 --- tensorflow/lite/delegates/gpu/common/object_reader.cc | 6 +++++- tensorflow/lite/delegates/utils.cc | 2 +- tensorflow/lite/delegates/utils.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/object_reader.cc b/tensorflow/lite/delegates/gpu/common/object_reader.cc index 55a0aea01a1..f299232a8a7 100644 --- a/tensorflow/lite/delegates/gpu/common/object_reader.cc +++ b/tensorflow/lite/delegates/gpu/common/object_reader.cc @@ -37,7 +37,7 @@ absl::Status ObjectReader::ReadNonConstantTensor( } if (tensor_to_value->find(tensor_idx) == tensor_to_value->end()) { - const TfLiteTensor& tflite_tensor = context->tensors[tensor_idx]; + TfLiteTensor& tflite_tensor = context->tensors[tensor_idx]; if (tflite::IsConstantTensor(&tflite_tensor)) { return absl::InvalidArgumentError(absl::StrCat( "ReadNonConstantTensor: value is a constant tensor: ", tensor_idx)); @@ -58,6 +58,7 @@ absl::Status ObjectReader::ReadNonConstantTensor( &fp_tensor_index) != kTfLiteOk) { return absl::InternalError("Could not add new tensor to graph"); } + // Remember this tensor for later. 
(*quant_conversion_map)[fp_tensor_index] = tensor_idx; (*quant_conversion_map)[tensor_idx] = fp_tensor_index; @@ -67,6 +68,9 @@ absl::Status ObjectReader::ReadNonConstantTensor( ConvertTfLiteTensorToTensorRef(*fp_tflite_tensor, &value->tensor)); value->tensor.ref = fp_tensor_index; value->quant_params.emplace(); + // tflite_tensor from the outer scope is invalidated due to calling + // CreateNewTensorWithDifferentType + tflite_tensor = context->tensors[tensor_idx]; RETURN_IF_ERROR( PopulateQuantParams(tflite_tensor, &value->quant_params.value())); (*tensor_to_value)[fp_tensor_index] = value; diff --git a/tensorflow/lite/delegates/utils.cc b/tensorflow/lite/delegates/utils.cc index 873cadc180f..289586c5346 100644 --- a/tensorflow/lite/delegates/utils.cc +++ b/tensorflow/lite/delegates/utils.cc @@ -29,8 +29,8 @@ TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, TfLiteType new_type, TfLiteTensor** new_tensor, int* new_tensor_index) { - const TfLiteTensor& original_tensor = context->tensors[original_tensor_index]; TF_LITE_ENSURE_STATUS(context->AddTensors(context, 1, new_tensor_index)); + const TfLiteTensor& original_tensor = context->tensors[original_tensor_index]; *new_tensor = &context->tensors[*new_tensor_index]; (*new_tensor)->type = new_type; (*new_tensor)->allocation_type = kTfLiteArenaRw; diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index 12684fcb84a..a9fb67316fc 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -33,7 +33,8 @@ namespace tflite { namespace delegates { // Creates a new Read/Write tensor having the same shape as the original, but -// with a different type. +// with a different type. Note that this might void existing references to +// tensors. TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, const int original_tensor_index, TfLiteType new_type, From 4fba4cbfdc098b4192622654fa74848c45755278 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 1 Jul 2020 02:49:04 -0700 Subject: [PATCH 1384/1390] Modify InitializeTableFromDatasetOp to be async and add the table init op to the table initializers collection PiperOrigin-RevId: 319190503 Change-Id: Ibd94b6f194e839fc11e37f1153c43533462bc265 --- tensorflow/core/kernels/BUILD | 1 + .../core/kernels/lookup_table_init_op.cc | 21 +++--- tensorflow/core/kernels/lookup_util.cc | 71 +++++++++++-------- tensorflow/core/kernels/lookup_util.h | 7 +- .../python/kernel_tests/lookup_ops_test.py | 12 ++++ tensorflow/python/ops/lookup_ops.py | 1 + 6 files changed, 71 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index bd540baa65a..908deb06d0e 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -417,6 +417,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/framework:op_requires", ], ) diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc index 49744cea59e..7bffb5ac547 100644 --- a/tensorflow/core/kernels/lookup_table_init_op.cc +++ b/tensorflow/core/kernels/lookup_table_init_op.cc @@ -164,24 +164,29 @@ REGISTER_KERNEL_BUILDER( Name("InitializeTableFromTextFileV2").Device(DEVICE_CPU), InitializeTableFromTextFileOp); -class InitializeTableFromDatasetOp : public OpKernel { +class InitializeTableFromDatasetOp : public AsyncOpKernel { public: explicit InitializeTableFromDatasetOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} + : AsyncOpKernel(ctx), + background_worker_(ctx->env(), "initialize_table_from_dataset") {} - void Compute(OpKernelContext* ctx) override { + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { lookup::InitializableLookupTable* table; - OP_REQUIRES_OK(ctx, - GetInitializableLookupTable("table_handle", ctx, &table)); + OP_REQUIRES_OK_ASYNC( + ctx, GetInitializableLookupTable("table_handle", ctx, &table), done); core::ScopedUnref unref_me(table); DatasetBase* dataset; - OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(1), &dataset)); - OP_REQUIRES_OK(ctx, - lookup::InitializeTableFromDataset(ctx, dataset, table)); + OP_REQUIRES_OK_ASYNC( + ctx, GetDatasetFromVariantTensor(ctx->input(1), &dataset), done); + background_worker_.Schedule([ctx, dataset, table, done]() { + lookup::InitializeTableFromDataset(ctx, dataset, table, done); + }); } private: TF_DISALLOW_COPY_AND_ASSIGN(InitializeTableFromDatasetOp); + + data::BackgroundWorker background_worker_; }; REGISTER_KERNEL_BUILDER(Name("InitializeTableFromDataset").Device(DEVICE_CPU), diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc index ef9bca4475c..802e1ff5c35 100644 --- a/tensorflow/core/kernels/lookup_util.cc +++ b/tensorflow/core/kernels/lookup_util.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/function_handle_cache.h" +#include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/errors.h" @@ -451,46 +452,54 @@ class DatasetIterator : public InitializableLookupTable::InitTableIterator { Status status_; }; -Status InitializeTableFromDataset(OpKernelContext* ctx, - data::DatasetBase* dataset, - InitializableLookupTable* table) { +void InitializeTableFromDataset(OpKernelContext* ctx, + data::DatasetBase* dataset, + InitializableLookupTable* table, + AsyncOpKernel::DoneCallback done) { // Assert that the dataset types match up to that expected in the table. const auto& dataset_types = dataset->output_dtypes(); - if (dataset_types.size() != 2) { - return errors::InvalidArgument("Dataset should have two output types only"); - } - if (dataset_types[0] != table->key_dtype()) { - return errors::InvalidArgument("Key dtype expected: ", table->key_dtype(), - " but obtained: ", dataset_types[0], - " from the dataset"); - } - if (dataset_types[1] != table->value_dtype()) { - return errors::InvalidArgument( - "Value dtype expected: ", table->value_dtype(), - " but obtained: ", dataset_types[1], " from the dataset"); - } + OP_REQUIRES_ASYNC( + ctx, dataset_types.size() == 2, + errors::InvalidArgument("Dataset should have two output types only"), + done); + OP_REQUIRES_ASYNC( + ctx, dataset_types[0] == table->key_dtype(), + errors::InvalidArgument("Key dtype expected: ", table->key_dtype(), + " but obtained: ", dataset_types[0], + " from the dataset"), + done); + OP_REQUIRES_ASYNC( + ctx, dataset_types[1] == table->value_dtype(), + errors::InvalidArgument("Value dtype expected: ", table->value_dtype(), + " but obtained: ", dataset_types[1], + " from the dataset"), + done); // Assert that the dataset output shapes are scalars. const auto& dataset_shapes = dataset->output_shapes(); - if (dataset_shapes.size() != 2) { - return errors::InvalidArgument( - "Dataset should have two output shapes only"); - } - if (!dataset_shapes[0].IsCompatibleWith(PartialTensorShape({}))) { - return errors::InvalidArgument("Expected scalar for key. Obtained: ", - dataset_shapes[0].DebugString()); - } - if (!dataset_shapes[1].IsCompatibleWith(PartialTensorShape({}))) { - return errors::InvalidArgument("Expected scalar for key. Obtained: ", - dataset_shapes[1].DebugString()); - } + OP_REQUIRES_ASYNC( + ctx, dataset_shapes.size() == 2, + errors::InvalidArgument("Dataset should have two output shapes only"), + done); + OP_REQUIRES_ASYNC( + ctx, dataset_shapes[0].IsCompatibleWith(PartialTensorShape({})), + errors::InvalidArgument("Expected scalar for key. Obtained: ", + dataset_shapes[0].DebugString()), + done); + OP_REQUIRES_ASYNC( + ctx, dataset_shapes[1].IsCompatibleWith(PartialTensorShape({})), + errors::InvalidArgument("Expected scalar for key. 
Obtained: ", + dataset_shapes[1].DebugString()), + done); DatasetIterator iter(dataset); - TF_RETURN_IF_ERROR(iter.Init(ctx)); + OP_REQUIRES_OK_ASYNC(ctx, iter.Init(ctx), done); Status s = table->Initialize(iter); if (errors::IsFailedPrecondition(s) && table->is_initialized()) { LOG(INFO) << "Table already initialized from dataset."; - return Status::OK(); + done(); + return; } - return s; + ctx->SetStatus(s); + done(); } } // namespace lookup diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h index 97893d0c17a..7e53ed5db51 100644 --- a/tensorflow/core/kernels/lookup_util.h +++ b/tensorflow/core/kernels/lookup_util.h @@ -58,9 +58,10 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size, // Initializes `table` from `dataset` by iterating over it. Caller retains // ownership of `dataset`. -Status InitializeTableFromDataset(OpKernelContext* ctx, - data::DatasetBase* dataset, - InitializableLookupTable* table); +void InitializeTableFromDataset(OpKernelContext* ctx, + data::DatasetBase* dataset, + InitializableLookupTable* table, + AsyncOpKernel::DoneCallback done); } // namespace lookup } // namespace tensorflow diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py index 9b237b258a9..59afb2c27ab 100644 --- a/tensorflow/python/kernel_tests/lookup_ops_test.py +++ b/tensorflow/python/kernel_tests/lookup_ops_test.py @@ -565,6 +565,18 @@ class DatasetInitializerTest(BaseLookupTableTest): result = self.evaluate(output) self.assertAllEqual([1, 2, -1], result) + def test_compatibility(self): + with ops.Graph().as_default(): + keys = dataset_ops.Dataset.range(100) + values = dataset_ops.Dataset.range(100).map(string_ops.as_string) + ds = dataset_ops.Dataset.zip((keys, values)) + init = lookup_ops.DatasetInitializer(ds) + table = self.getHashTable()(init, default_value="") + output = table.lookup(constant_op.constant([0, 2, 5], dtypes.int64)) + self.evaluate(lookup_ops.tables_initializer()) + result = self.evaluate(output) + self.assertAllEqual(["0", "2", "5"], result) + class InitializeTableFromFileOpTest(BaseLookupTableTest): diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py index 96f3cf91499..87b8aaa30bd 100644 --- a/tensorflow/python/ops/lookup_ops.py +++ b/tensorflow/python/ops/lookup_ops.py @@ -468,6 +468,7 @@ class DatasetInitializer(TableInitializerBase): _check_table_dtypes(table, self._key_dtype, self._value_dtype) init_op = gen_lookup_ops.initialize_table_from_dataset( table.resource_handle, self.dataset._variant_tensor) # pylint: disable=protected-access + ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op) return init_op From 3bdb3a75bd17f0f7f15d1b492f38464a70190974 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 1 Jul 2020 03:26:11 -0700 Subject: [PATCH 1385/1390] Reword comment in eager runtime. PiperOrigin-RevId: 319194534 Change-Id: I9cc602c9eefa9e647ffa74db01109db2e90a9509 --- tensorflow/core/common_runtime/eager/context.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 68f618adbec..5a14ebdfda7 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -377,8 +377,8 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // class/struct. 
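[Editorial aside - not part of these patches] Stepping back to PATCH 1384
(table initialization from a dataset): the new test exercises it through
lookup_ops.DatasetInitializer. A rough public-API sketch of the same idea,
assuming the exported name tf.lookup.experimental.DatasetInitializer matches
the internal class used in that test:

import tensorflow as tf

keys = tf.data.Dataset.range(100)
values = tf.data.Dataset.range(100).map(tf.strings.as_string)
ds = tf.data.Dataset.zip((keys, values))

init = tf.lookup.experimental.DatasetInitializer(ds)
table = tf.lookup.StaticHashTable(init, default_value="")
print(table.lookup(tf.constant([0, 2, 5], dtype=tf.int64)))  # [b'0' b'2' b'5']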
// // Enables the eager context to communicate with remote devices. When - // initializing with this method, this context will be the master context, - // which will kill all its slaves in shutdown. + // initializing with this method, this context will be the primary context, + // which will kill all its remote contexts in shutdown. // // - server: A ServerInterface that exports the tensorflow.WorkerService. // Note that this class expects the server to already have been started. From 8802516b56e190cba5846f7b7dfca7a0902bcf03 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Wed, 1 Jul 2020 03:52:55 -0700 Subject: [PATCH 1386/1390] Integrate LLVM at https://github.com/llvm/llvm-project/commit/2501e86acda2 PiperOrigin-RevId: 319196952 Change-Id: I078a64a0b84eb9cd8f3c5d277ad30c943b58fd1c --- tensorflow/compiler/mlir/xla/tests/BUILD | 4 ++++ .../mlir/xla/transforms/lhlo_legalize_to_llvm.cc | 6 ++++-- tensorflow/workspace.bzl | 4 ++-- third_party/llvm/llvm.autogenerated.BUILD | 14 +++++++++++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index 87d26728f3d..1ad83b5ea4a 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -6,6 +6,10 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", + exclude = [ + # TODO(b/160227541): Re-enable LHLO->LLVM lowering. + "lhlo-legalize-to-llvm.mlir", + ], test_file_exts = ["mlir"], ) diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 99d2c08aa98..4be175b8afa 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -128,8 +128,10 @@ struct DynamicMemRefCastOpConverter void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { - patterns->insert( - *converter); + // TODO(b/160227541): Re-enable LHLO->LLVM lowering. + // patterns->insert( + // *converter); } } // namespace xla_lhlo diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9435ef96bd1..4f1ff3e7bc6 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "e34523c87c3f1cfabcf741568dede026bbb12d3a" - LLVM_SHA256 = "04e82e8fa5d492dc4c298d538e48e461c711bed3d58ab6f4dbc8aa735765ac4b" + LLVM_COMMIT = "2501e86acda2905e50012f7e9fc1942517c1237d" + LLVM_SHA256 = "3428d4f4806c80745be4a035167ee5b32a67533a85db61d937c56b28a26bfbf2" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 85efc0db65e..6e53745166d 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -687,7 +687,18 @@ cc_library( gentbl( name = "omp_gen", - tbl_outs = [("--gen-directive-decls", "include/llvm/Frontend/OpenMP/OMP.h.inc")], + tbl_outs = [("--gen-directive-decl", "include/llvm/Frontend/OpenMP/OMP.h.inc")], + tblgen = ":llvm-tblgen", + td_file = "include/llvm/Frontend/OpenMP/OMP.td", + td_srcs = glob([ + "include/llvm/Frontend/OpenMP/*.td", + "include/llvm/Frontend/Directive/*.td", + ]), +) + +gentbl( + name = "omp_gen_impl", + tbl_outs = [("--gen-directive-impl", "include/llvm/Frontend/OpenMP/OMP.cpp.inc")], tblgen = ":llvm-tblgen", td_file = "include/llvm/Frontend/OpenMP/OMP.td", td_srcs = glob([ @@ -2092,6 +2103,7 @@ cc_library( ":TransformUtils", ":config", ":omp_gen", + ":omp_gen_impl", ], ) From b09a0f8b1417b1f1522e777feb6f602e847ab189 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Wed, 1 Jul 2020 03:54:17 -0700 Subject: [PATCH 1387/1390] Fix the compilation error by not using designated initializers. PiperOrigin-RevId: 319197094 Change-Id: Ibe90c15087a341668383b2862578e8d50608508d --- .../lite/delegates/external/external_delegate.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/delegates/external/external_delegate.cc b/tensorflow/lite/delegates/external/external_delegate.cc index 5df158942f2..0ebfb62421c 100644 --- a/tensorflow/lite/delegates/external/external_delegate.cc +++ b/tensorflow/lite/delegates/external/external_delegate.cc @@ -220,11 +220,13 @@ TfLiteStatus TfLiteExternalDelegateOptionsInsert( TfLiteExternalDelegateOptions TfLiteExternalDelegateOptionsDefault( const char* lib_path) { - TfLiteExternalDelegateOptions options = { - .lib_path = lib_path, - .count = 0, - .insert = TfLiteExternalDelegateOptionsInsert, - }; + // As 'keys' and 'values' don't need to be set here, using designated + // initializers may cause a compiling error as "non-trivial designated + // initializers not supported" by some compiler. + TfLiteExternalDelegateOptions options; + options.lib_path = lib_path; + options.count = 0; + options.insert = TfLiteExternalDelegateOptionsInsert; return options; } From 0e1ba4145096be86dfa1f32ebd6e0c11156491b1 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 1 Jul 2020 04:44:37 -0700 Subject: [PATCH 1388/1390] Bump minimum supported cuDNN version to 7.3. Remove code to support older versions. 
PiperOrigin-RevId: 319202156 Change-Id: I297d7950a93b6b802d07ae0f89232f26a5f2592e --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 246 +------------------- 1 file changed, 9 insertions(+), 237 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index a97850bd8d5..28902b65722 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -61,7 +61,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin); namespace { -static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher"); +static_assert(CUDNN_VERSION >= 7300, "cuDNN needs to be version 7.3 or higher"); // Exits the program if 'expr' doesn't return CUDNN_STATUS_SUCCESS. #define CHECK_CUDNN_OK(expr) CHECK_EQ(expr, CUDNN_STATUS_SUCCESS) @@ -115,12 +115,10 @@ std::string ToString(cudnnStatus_t status) { return "CUDNN_STATUS_LICENSE_ERROR"; case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; -#if CUDNN_VERSION >= 7000 case CUDNN_STATUS_RUNTIME_IN_PROGRESS: return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; -#endif default: return absl::StrCat("(status), ">"); @@ -309,12 +307,11 @@ port::Status CudnnSupport::Init() { const std::string error = absl::StrCat( "Loaded runtime CuDNN library: ", loaded_version.ToString(), " but source was compiled with: ", source_version.ToString(), - ". CuDNN library major and minor version needs to match or have " - "higher minor version in case of CuDNN 7.0 or later version. If " - "using a binary install, upgrade your CuDNN library. If building " - "from sources, make sure the library loaded at runtime is " - "compatible " - "with the version specified during compile configuration."); + ". CuDNN library needs to have matching major version and equal or " + "higher minor version. If using a binary install, upgrade your CuDNN " + "library. If building from sources, make sure the library loaded at " + "runtime is compatible with the version specified during compile " + "configuration."); LOG(ERROR) << error; cudnnDestroy(cudnn_handle); return port::Status(port::error::INTERNAL, error); @@ -359,13 +356,11 @@ struct TensorDescriptorDeleter { CHECK_CUDNN_OK(cudnnDestroyTensorDescriptor(descriptor)); } }; -#if CUDNN_VERSION >= 7201 struct RNNDataDescriptorDeleter { void operator()(cudnnRNNDataDescriptor_t descriptor) const { CHECK_CUDNN_OK(cudnnDestroyRNNDataDescriptor(descriptor)); } }; -#endif struct FilterDescriptorDeleter { void operator()(cudnnFilterDescriptor_t descriptor) const { CHECK_CUDNN_OK(cudnnDestroyFilterDescriptor(descriptor)); @@ -418,10 +413,8 @@ struct CtcLossDescriptorDeleter { // RAII wrappers for cuDNN types. 
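// [Editorial aside - not part of this patch] The wrappers referred to here all
// use the same idiom: std::unique_ptr with a stateless custom deleter, so the
// matching cudnnDestroy* call runs automatically when the handle goes out of
// scope. A generic, library-agnostic sketch (Handle, CreateHandle and
// DestroyHandle are hypothetical stand-ins):
//
//   struct HandleDeleter {
//     void operator()(Handle* h) const { DestroyHandle(h); }
//   };
//   using OwnedHandle = std::unique_ptr<Handle, HandleDeleter>;
//
//   OwnedHandle MakeHandle() {
//     Handle* h = nullptr;
//     CreateHandle(&h);       // analogous to cudnnCreateTensorDescriptor()
//     return OwnedHandle(h);  // DestroyHandle() runs at scope exit
//   }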
using TensorDescriptor = std::unique_ptr; -#if CUDNN_VERSION >= 7201 using RNNDataDescriptor = std::unique_ptr; -#endif using FilterDescriptor = std::unique_ptr; using ConvolutionDescriptor = @@ -447,13 +440,11 @@ TensorDescriptor CreateTensorDescriptor() { CHECK_CUDNN_OK(cudnnCreateTensorDescriptor(&result)); return TensorDescriptor(result); } -#if CUDNN_VERSION >= 7201 RNNDataDescriptor CreateRNNDataDescriptor() { cudnnRNNDataDescriptor_t result; CHECK_CUDNN_OK(cudnnCreateRNNDataDescriptor(&result)); return RNNDataDescriptor(result); } -#endif FilterDescriptor CreateFilterDescriptor() { cudnnFilterDescriptor_t result; CHECK_CUDNN_OK(cudnnCreateFilterDescriptor(&result)); @@ -718,7 +709,6 @@ class CudnnConvolutionDescriptor { } void set_use_tensor_op_math(bool use_tensor_op_math) { -#if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = #if CUDNN_VERSION >= 8000 (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH); @@ -726,7 +716,6 @@ class CudnnConvolutionDescriptor { (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); #endif CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); -#endif } cudnnConvolutionDescriptor_t handle() const { return handle_.get(); } @@ -749,9 +738,7 @@ static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { #endif } -static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000; -} +static bool TensorOpMathAvailable(int cc_major) { return cc_major >= 7; } static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) { int cc_major, cc_minor; @@ -862,11 +849,9 @@ class CudnnActivationDescriptor { double relu_ceiling = 0.0; cudnnActivationMode_t mode; switch (activation_mode) { -#if CUDNN_VERSION >= 7100 case dnn::ActivationMode::kNone: mode = CUDNN_ACTIVATION_IDENTITY; break; -#endif case dnn::ActivationMode::kRelu6: relu_ceiling = 6.0; mode = CUDNN_ACTIVATION_CLIPPED_RELU; @@ -1113,26 +1098,18 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { /*direction=*/direction_mode, /*mode=*/rnn_mode, /*algo=*/rnn_algo, /*dataType=*/compute_type)); if (use_projection) { -#if CUDNN_VERSION >= 7101 RETURN_IF_CUDNN_ERROR(cudnnSetRNNProjectionLayers( cudnn.handle(), /*rnnDesc=*/rnn_desc.get(), /*recProjSize=*/hidden_size, /*outProjSize=*/0)); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnSetRNNProjectionLayers when " - "CUDNN_VERSION < 7.1.1"); -#endif } // TODO: For now, we only use cudnnRNN**Ex API to process padded inputs. // But in the future if these APIs are used to process full length arrays, // we need to distinguish when to set it. -#if CUDNN_VERSION >= 7201 if (use_padded_io) { RETURN_IF_CUDNN_ERROR( cudnnSetRNNPaddingMode(rnn_desc.get(), CUDNN_RNN_PADDED_IO_ENABLED)); } -#endif port::StatusOr rnn_plan_wrapper; PersistentRnnPlan rnn_plan; @@ -1155,7 +1132,6 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { cudnn, input_size, data_type, rnn_desc.get(), rnn_mode, direction_mode, num_layers)); -#if CUDNN_VERSION >= 7000 // Require explicit algorithm config to enable tensor cores. Some configs // return CUDNN_NOT_SUPPORTED when tensor ops are enabled (which is against // the idiom that enabling tensor ops is only a hint: see nvbugs/2172799). 
@@ -1169,7 +1145,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { if (algorithm_config.algorithm().has_value()) { use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled(); } else { - use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops; + use_tensor_ops = allow_tensor_ops; } if (use_tensor_ops && !allow_tensor_ops) { @@ -1184,7 +1160,6 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { math_type = use_tensor_ops ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH; #endif CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); -#endif // CUDNN_VERSION >= 7000 return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), num_layers, hidden_size, input_size, cell_size, @@ -1281,7 +1256,6 @@ port::Status CheckAndFetchProjectionWeights( const TensorDescriptor& input_desc, const FilterDescriptor& filter_desc, const FilterDescriptor& region_desc_handle, dnn::RnnDescriptor::ParamsRegions* weights) { -#if CUDNN_VERSION >= 7101 int hidden_size_v; int num_layers_v; cudnnDropoutDescriptor_t dropout_desc; @@ -1345,7 +1319,6 @@ port::Status CheckAndFetchProjectionWeights( size}; weights->push_back(region); } -#endif // CUDNN_VERSION >= 7101 return port::Status::OK(); } @@ -1452,18 +1425,14 @@ class CudnnRnnSequenceTensorDescriptor CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length, int batch_size, int data_size, cudnnDataType_t data_type, -#if CUDNN_VERSION >= 7201 RNNDataDescriptor data_handle, -#endif TensorDescriptor handle) : max_seq_length_(max_seq_length), batch_size_(batch_size), data_size_(data_size), data_type_(data_type), handle_(std::move(handle)), -#if CUDNN_VERSION >= 7201 rnn_data_handle_(std::move(data_handle)), -#endif handles_(max_seq_length, handle_.get()) { } @@ -1484,9 +1453,7 @@ class CudnnRnnSequenceTensorDescriptor /*strideA=*/strides)); return CudnnRnnSequenceTensorDescriptor(parent, max_seq_length, batch_size, data_size, data_type, -#if CUDNN_VERSION >= 7201 nullptr, -#endif std::move(tensor_desc)); } @@ -1494,7 +1461,6 @@ class CudnnRnnSequenceTensorDescriptor GpuExecutor* parent, int max_seq_length, int batch_size, int data_size, const absl::Span& seq_lengths, bool time_major, cudnnDataType_t data_type) { -#if CUDNN_VERSION >= 7201 CHECK_GT(max_seq_length, 0); int dims[] = {batch_size, data_size, 1}; int strides[] = {dims[1] * dims[2], dims[2], 1}; @@ -1522,29 +1488,18 @@ class CudnnRnnSequenceTensorDescriptor return CudnnRnnSequenceTensorDescriptor( parent, max_seq_length, batch_size, data_size, data_type, std::move(data_desc), std::move(tensor_desc)); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnSetRNNDataDescriptor when " - "CUDNN_VERSION < 7.2.1"); -#endif } const cudnnTensorDescriptor_t* handles() const { return handles_.data(); } -#if CUDNN_VERSION >= 7201 const cudnnRNNDataDescriptor_t data_handle() const { return rnn_data_handle_.get(); } -#endif int max_seq_length() const { return max_seq_length_; } int batch_size() const { return batch_size_; } int data_size() const { return data_size_; } bool is_var_seq_lengths() const { -#if CUDNN_VERSION >= 7201 return rnn_data_handle_ != nullptr; -#else - return false; -#endif } private: @@ -1553,9 +1508,7 @@ class CudnnRnnSequenceTensorDescriptor int data_size_; cudnnDataType_t data_type_; TensorDescriptor handle_; -#if CUDNN_VERSION >= 7201 RNNDataDescriptor rnn_data_handle_; -#endif std::vector handles_; // Copies of handle_. 
SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnSequenceTensorDescriptor); }; @@ -1816,7 +1769,6 @@ port::Status CudnnSupport::DoRnnForwardImpl( if (!is_training) { if (input_desc.is_var_seq_lengths()) { -#if CUDNN_VERSION >= 7201 RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInferenceEx( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), /*xDesc=*/input_desc.data_handle(), /*x=*/input_data.opaque(), @@ -1831,11 +1783,6 @@ port::Status CudnnSupport::DoRnnForwardImpl( nullptr, /*workspace=*/workspace.opaque(), /*workSpaceSizeInBytes=*/workspace.size())); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnRNNForwardInferenceEx when " - "CUDNN_VERSION < 7.2.1"); -#endif } else { RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInference( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), @@ -1852,7 +1799,6 @@ port::Status CudnnSupport::DoRnnForwardImpl( } } else { if (input_desc.is_var_seq_lengths()) { -#if CUDNN_VERSION >= 7201 // cudnnSetRNNPaddingMode(rnn_desc.handle(), CUDNN_RNN_PADDED_IO_ENABLED); RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTrainingEx( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), @@ -1870,11 +1816,6 @@ port::Status CudnnSupport::DoRnnForwardImpl( /*workSpaceSizeInBytes=*/workspace.size(), /*reserveSpace=*/reserve_space.opaque(), /*reserveSpaceSizeInBytes=*/reserve_space.size())); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnRNNForwardTrainingEx when " - "CUDNN_VERSION < 7.2.1"); -#endif } else { RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTraining( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), @@ -1958,7 +1899,6 @@ port::Status CudnnSupport::DoRnnBackwardImpl( } if (input_desc.is_var_seq_lengths()) { -#if CUDNN_VERSION >= 7201 RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardDataEx( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), /*yDesc=*/output_desc.data_handle(), /*y=*/output_data.opaque(), @@ -1981,11 +1921,6 @@ port::Status CudnnSupport::DoRnnBackwardImpl( /*workSpaceSizeInBytes=*/workspace.size(), /*reserveSpace=*/reserve_space_data->opaque(), /*reserveSpaceSizeInBytes=*/reserve_space_data->size())); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnRNNBackwardDataEx when " - "CUDNN_VERSION < 7.2.1"); -#endif } else { RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardData( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), @@ -2015,7 +1950,6 @@ port::Status CudnnSupport::DoRnnBackwardImpl( // Clear the dw to zeros. stream->ThenMemZero(params_backprop_data, params_backprop_data->size()); if (input_desc.is_var_seq_lengths()) { -#if CUDNN_VERSION >= 7201 RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeightsEx( /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(), /*xDesc=*/input_desc.data_handle(), /*x=*/input_data.opaque(), @@ -2028,11 +1962,6 @@ port::Status CudnnSupport::DoRnnBackwardImpl( /*dw=*/params_backprop_data->opaque(), /*reserveSpace=*/reserve_space_data->opaque(), /*reserveSpaceSizeInBytes=*/reserve_space_data->size())); -#else - return port::Status(port::error::INVALID_ARGUMENT, - "No supported cudnnRNNBackwardWeightsEx when " - "CUDNN_VERSION < 7.2.1"); -#endif } else { // make the backward weight call RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeights( @@ -2935,7 +2864,7 @@ class CudnnEnvVar { // algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1". 
struct FftTilingForward { static constexpr const char* kName = "TF_ENABLE_FFT_TILING_FORWARD"; - static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7000; + static constexpr bool kDefaultFlag = true; }; // A helper struct to decide whether to enable the WINOGRAD_NONFUSED algorithms. @@ -3014,34 +2943,6 @@ dnn::DataType GetConvAccumulatorType(dnn::DataType data_type) { LOG(FATAL) << "Invalid DNN data type: " << static_cast(data_type); } } - -// Determines whether we can safely perform a winograd non-fused convolution for -// the given input and output shapes. This works around b/68264959, an integer -// overflow in cuDNNv5 and cuDNNv6. -#if CUDNN_VERSION >= 7000 -bool ShouldIncludeWinogradNonfusedAlgo(const dnn::BatchDescriptor&, - const dnn::BatchDescriptor&) { - return true; -} -#else -bool ShouldIncludeWinogradNonfusedAlgo( - const dnn::BatchDescriptor& input_desc, - const dnn::BatchDescriptor& output_desc) { - int64 batch = input_desc.count(); - int64 in_depths = input_desc.feature_map_count(); - int64 in_rows = input_desc.height(); - int64 in_cols = input_desc.ndims() == 1 ? 1 : input_desc.width(); - int64 out_depths = output_desc.feature_map_count(); - - int64 total_size = port::MathUtil::CeilOfRatio(batch, int64{16}) * - std::max(in_depths, out_depths) * in_cols * in_rows * - sizeof(float); - - const int64 threshold = 1L << 31; - return total_size < threshold; -} -#endif - } // namespace port::Status CudnnSupport::DoPrepareForConvolution( @@ -3147,41 +3048,6 @@ port::Status CudnnSupport::DoConvolve( } const auto get_fwd_bugs = [&]() -> port::Status { - // Report an error if we might be hitting a cuDNN bug that accesses illegal - // memory. See nvbugs/2138754, b/80018418. - if (CUDNN_VERSION < 7300) { - if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { - return port::Status::OK(); - } - if (input_descriptor.ndims() < 3) { - return port::Status::OK(); - } - // Checks that a*b is within the valid range (as provided by NVIDIA). - const auto check_sizes = [](size_t a, size_t b) { - if ((a * b * 4608 - 1) >> 31 == 0) { - return port::Status::OK(); - } - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration potentially accesses illegal memory."); - }; - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(), - output_descriptor.feature_map_count())); - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), - input_descriptor.feature_map_count())); - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), - output_descriptor.feature_map_count())); - return port::Status::OK(); - } - if (algorithm_desc.algo_id() == - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && - !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, - output_descriptor)) { - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration has potential integer overflow in " - "cuDNNv5 and cuDNNv6. See b/68264959."); - } if (CUDNN_VERSION < 8000) { if (algorithm_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && @@ -3196,91 +3062,10 @@ port::Status CudnnSupport::DoConvolve( }; auto get_bwd_data_bugs = [&]() -> port::Status { - if (algorithm_desc.algo_id() == - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && - !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, - output_descriptor)) { - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration has potential integer overflow in " - "cuDNNv5 and cuDNNv6. 
See b/68264959."); - } - - // Cudnn 7.1.4 has a bug if the workspace of the following convolution is - // not zero-initialized, nvbugs/2254619. - if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 && - algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && - cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() && - input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth && - filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX && - output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX && - (convolution_descriptor.vertical_filter_stride() > 1 || - convolution_descriptor.horizontal_filter_stride() > 1)) { - stream->ThenMemZero(&scratch_memory, scratch_memory.size()); - } return port::Status::OK(); }; const auto get_bwd_filter_bugs = [&]() -> port::Status { - // Report an error if we might be hitting a cuDNN bug that produces - // incorrect results. See nvbugs/2072856 - if (CUDNN_VERSION < 7300) { - SE_RETURN_IF_ERROR([&] { - if (algorithm_desc.algo_id() != - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { - return port::Status::OK(); - } - if (output_descriptor.height() > 1 && output_descriptor.width() > 1) { - return port::Status::OK(); - } - int convolution_size = output_descriptor.height() > 1 - ? filter_descriptor.input_filter_height() - : filter_descriptor.input_filter_width(); - if (convolution_size <= 32) { - return port::Status::OK(); - } - cudnnConvolutionMode_t convolution_mode; - cudnnDataType_t compute_type; - RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor( - conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, - &convolution_mode, &compute_type)); - if (convolution_mode != CUDNN_CONVOLUTION) { - return port::Status::OK(); - } - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration potentially produces incorrect results."); - }()); - } - - if (algorithm_desc.algo_id() == - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && - !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, - output_descriptor)) { - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration has potential integer overflow in " - "cuDNNv5 and cuDNNv6. See b/68264959."); - } - - // Zero out the result buffer for strided conv backward filter for NHWC - // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is - // not zeroed. - // - // This wrong result caused by the bug is very flaky. It needs to be run for - // up to 20 times to produce a mismatch. - // - // See nvbugs/2379553. 
- if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 && - algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && - cudnn_type == CUDNN_DATA_HALF && - input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth && - filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput && - output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth && - (convolution_descriptor.vertical_filter_stride() > 1 || - convolution_descriptor.horizontal_filter_stride() > 1)) { - stream->ThenMemZero(&filter_data, filter_data.size()); - } return port::Status::OK(); }; @@ -3439,13 +3224,6 @@ port::Status CudnnSupport::DoFusedConvolveImpl( << "\noutput_nd.handle() = " << output_nd.handle() << "\noutput_data->opaque() = " << output_data->opaque(); - if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && - !ShouldIncludeWinogradNonfusedAlgo(conv_input_descriptor, - output_descriptor)) { - return port::Status(port::error::FAILED_PRECONDITION, - "This configuration has potential integer overflow in " - "cuDNNv5 and cuDNNv6. See around b/68264959."); - } if (IsTensorMathOpSet(conv) != algo_desc.tensor_ops_enabled()) { return port::Status(port::error::FAILED_PRECONDITION, "Tensor op math type in dnn::AlgorithmDesc does not " @@ -3531,9 +3309,7 @@ bool CudnnSupport::GetRnnAlgorithms( out_algorithms->clear(); for (auto i : algo_types) { out_algorithms->push_back({i, /*use_tensor_ops=*/false}); -#if CUDNN_VERSION >= 7100 out_algorithms->push_back({i, /*use_tensor_ops=*/true}); -#endif } return true; } @@ -3672,11 +3448,9 @@ port::Status CudnnSupport::DoBatchNormalizationForwardImpl( CudnnTensorDescriptor scale_offset_descriptor( scale_offset_desc, ToCudnnDataType(scale_data_type)); cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; -#if CUDNN_VERSION >= 7000 if (BatchnormSpatialPersistentEnabled() && is_training) { mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; } -#endif float one = 1.0; float zero = 0.0; auto cudnn = cudnn_->GetHandle(parent_, stream); @@ -3855,11 +3629,9 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl( CudnnTensorDescriptor scale_offset_descriptor( scale_offset_desc, static_cast(cudnn_scale_type)); cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; -#if CUDNN_VERSION >= 7000 if (BatchnormSpatialPersistentEnabled()) { mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; } -#endif float one = 1.0; float zero = 0.0; From 55aaf2124d6bdfd30b3e874a399b11c1a10a0e0e Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Wed, 1 Jul 2020 05:17:47 -0700 Subject: [PATCH 1389/1390] Enable int8 input and int16 output for cmsis-nn softmax. 
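The int8 softmax kernel now dispatches on the output type: int16 output falls
back to the portable reference implementation, while int8 output keeps the
CMSIS-NN arm_softmax_s8 fast path. A minimal sketch of that dispatch follows
(it mirrors the kernel hunk below; the wrapper function and the include paths
are illustrative, not part of this change):

```cpp
// Sketch of the output-type dispatch in this patch. Mirrors the kernel hunk
// below; the free-function wrapper and include paths are illustrative.
#include "arm_nnfunctions.h"  // CMSIS-NN: arm_softmax_s8
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/reference/softmax.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

// int8 input; output may be int16 (reference kernel) or int8 (CMSIS-NN).
inline void SoftmaxInt8(const TfLiteTensor* input, TfLiteTensor* output,
                        const SoftmaxParams& op_data) {
  const RuntimeShape input_shape = GetTensorShape(input);
  const RuntimeShape output_shape = GetTensorShape(output);
  if (output->type == kTfLiteInt16) {
    // No CMSIS-NN kernel for int8 -> int16; use the portable reference op.
    reference_ops::Softmax(op_data, input_shape, GetTensorData<int8_t>(input),
                           output_shape, GetTensorData<int16_t>(output));
  } else {
    // int8 -> int8: flatten to (outer_size x depth) rows and call CMSIS-NN.
    const int trailing_dim = input_shape.DimensionsCount() - 1;
    const int outer_size =
        MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
    const int depth =
        MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
    arm_softmax_s8(GetTensorData<int8_t>(input), outer_size, depth,
                   op_data.input_multiplier, op_data.input_left_shift,
                   op_data.diff_min, GetTensorData<int8_t>(output));
  }
}

}  // namespace tflite
```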
PiperOrigin-RevId: 319205673 Change-Id: Ibdf817b78266801e8a86bd20d0215fd6edf409d3 --- .../lite/micro/kernels/cmsis-nn/softmax.cc | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/softmax.cc b/tensorflow/lite/micro/kernels/cmsis-nn/softmax.cc index 51a48ec8a93..2db8caba243 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/softmax.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/softmax.cc @@ -36,8 +36,15 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8); - TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128); + if (output->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768); + // NOTE: Current int16 softmax output does not require symmetric scaling + // - so no need to verify scale here. + } else { + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128); + TF_LITE_ENSURE(context, output->params.scale == 1.f / 256); + } } TF_LITE_ENSURE(context, (output->params.scale == 1.f / 256) || (output->params.scale == 1.f / 255)); @@ -90,17 +97,23 @@ void SoftmaxQuantized(const TfLiteTensor* input, TfLiteTensor* output, GetTensorData(input), output_shape, GetTensorData(output)); } else { - const unsigned int num_dims = NumDimensions(input); + if (output->type == kTfLiteInt16) { + tflite::reference_ops::Softmax( + op_data, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + const unsigned int num_dims = NumDimensions(input); - const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = - MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = - MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); - arm_softmax_s8(GetTensorData(input), outer_size, depth, - op_data.input_multiplier, op_data.input_left_shift, - op_data.diff_min, GetTensorData(output)); + arm_softmax_s8(GetTensorData(input), outer_size, depth, + op_data.input_multiplier, op_data.input_left_shift, + op_data.diff_min, GetTensorData(output)); + } } } From ed7033c7fc2787aa50fae345fc1be4030608b54f Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Wed, 1 Jul 2020 05:39:45 -0700 Subject: [PATCH 1390/1390] [MLIR][XLA] Re-enable LHLO->LLVM pass. 
PiperOrigin-RevId: 319207613 Change-Id: If9f5b8c8d72eee419f92952901c84b01212aa34e --- tensorflow/compiler/mlir/xla/tests/BUILD | 4 ---- .../mlir/xla/transforms/lhlo_legalize_to_llvm.cc | 9 ++++----- .../mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc | 3 ++- tensorflow/compiler/mlir/xla/transforms/rewriters.h | 4 +++- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index 1ad83b5ea4a..87d26728f3d 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -6,10 +6,6 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", - exclude = [ - # TODO(b/160227541): Re-enable LHLO->LLVM lowering. - "lhlo-legalize-to-llvm.mlir", - ], test_file_exts = ["mlir"], ) diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 4be175b8afa..9f7f0fe6108 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -126,12 +126,11 @@ struct DynamicMemRefCastOpConverter } // namespace -void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, +void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options, + LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { - // TODO(b/160227541): Re-enable LHLO->LLVM lowering. - // patterns->insert( - // *converter); + patterns->insert( + *converter, options); } } // namespace xla_lhlo diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc index 63265c4a7e7..03a4f7320fe 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc @@ -36,7 +36,8 @@ class TestLhloToLLVMPass OwningRewritePatternList patterns; LLVMTypeConverter converter(m.getContext()); populateStdToLLVMConversionPatterns(converter, patterns); - PopulateLhloToLLVMConversionPatterns(&converter, &patterns); + PopulateLhloToLLVMConversionPatterns( + LowerToLLVMOptions::getDefaultOptions(), &converter, &patterns); ConversionTarget target(getContext()); target.addLegalDialect(); diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 7303b87be75..8b25576cd14 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -24,6 +24,7 @@ limitations under the License. namespace mlir { class LLVMTypeConverter; +class LowerToLLVMOptions; class OwningRewritePatternList; class BufferAssignmentPlacer; namespace xla_hlo { @@ -77,7 +78,8 @@ void PopulateUnfuseBatchNormPatterns(MLIRContext *context, namespace xla_lhlo { /// Collect a set of patterns to convert from the LHLO dialect to LLVM. -void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, +void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options, + LLVMTypeConverter *converter, OwningRewritePatternList *patterns); } // namespace xla_lhlo
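With the options threaded through, a caller builds the LHLO-to-LLVM pattern set
as in the updated test pass above. A minimal sketch of the call sequence
(the helper name and header paths are assumptions; the two Populate* calls
mirror the lhlo_legalize_to_llvm_pass.cc hunk):

```cpp
// Minimal sketch of the updated calling convention; mirrors the
// TestLhloToLLVMPass hunk above. Helper name and include paths are assumptions.
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h"

static void BuildLhloToLlvmPatterns(mlir::LLVMTypeConverter *converter,
                                    mlir::OwningRewritePatternList *patterns) {
  // Standard-dialect lowering patterns first.
  mlir::populateStdToLLVMConversionPatterns(*converter, *patterns);
  // The LHLO entry point now takes LowerToLLVMOptions as its first argument.
  mlir::xla_lhlo::PopulateLhloToLLVMConversionPatterns(
      mlir::LowerToLLVMOptions::getDefaultOptions(), converter, patterns);
}
```

This lets a caller thread non-default lowering options (for example a custom
index bitwidth) through to the LHLO patterns instead of fixing them at pattern
construction time.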